TCudnn.h
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Joana Niermann 23/07/19
3
4/*************************************************************************
5 * Copyright (C) 2019, Joana Niermann *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////
13// Definition of the TCudnn architecture class, which provides //
14// a wrapping of the low-level functionality for neural networks //
15// in the cuDNN library. //
16///////////////////////////////////////////////////////////////////
17
18#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19#define TMVA_DNN_ARCHITECTURES_CUDNN
20
21#include "RConfigure.h" // for definition of R__HAS_CUDNN
22
23#ifndef R__HAS_CUDNN
24#error This file can be compiled only when cudnn is available in ROOT
25#else
26
27#include "TMVA/DNN/Functions.h"
29//#include "TMVA/DNN/CNN/Descriptors.h"
36
37#include "cudnn.h"
38#include "Cuda/CudaBuffers.h"
39#include "Cuda/CudaTensor.h"
41#include <utility>
42#include <vector>
43#include <string>
44
46
47class TRandom;
48
49namespace TMVA
50{
51namespace DNN
52{
53
54struct TCudnnEmptyDescriptor {};
55
56
57/** The TCudnn architecture class.
58 *
59 * Low-level interface class for CUDA computing architectures using the cuDNN
60 * library as backend. Contains as public types the declaration of the scalar,
61 * matrix and buffer types for this architecture, as well as the remaining
62 * functions in the low-level interface in the form of static members.
63 */
64template<typename AFloat = Float_t>
65class TCudnn
66{
67private:
68 static TRandom * fgRandomGen;
69public:
70
71 using Scalar_t = AFloat;
72 using Matrix_t = TCudaTensor<AFloat>;
73 using Tensor_t = TCudaTensor<AFloat>;
74 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
75 using HostBuffer_t = TCudaHostBuffer<AFloat>;
76
77 // The descriptors for the (tensor) data are held by the data classes (CudaTensor)
78 using ActivationDescriptor_t = cudnnActivationDescriptor_t;
79 using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
80 using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
81 using FilterDescriptor_t = cudnnFilterDescriptor_t;
82 //using OpTensorDescriptor_t = cudnnOpTensorDescriptor_t;
83 using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
84 //using ReductionDescriptor_t = cudnnReduceTensorDescriptor_t;
85 using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
86 using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
87 using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
88 using AlgorithmDataType_t = cudnnDataType_t;
89 using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
90 using TensorDescriptor_t = cudnnTensorDescriptor_t;
91 using RecurrentDescriptor_t = cudnnRNNDescriptor_t;
92
93 using EmptyDescriptor_t = TCudnnEmptyDescriptor; // Used if a descriptor is not needed in a class
94
95 using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
96 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
97 //using BNormWorkspace_t = CNN::TCNNWorkspace<BNormLayer_t>;
98 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
99 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
100 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
101 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
102 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
103 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
104
105 using RNNLayer_t = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
106 using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
107 using RNNWorkspace_t = RNN::TRNNWorkspace<TCudnn<AFloat>>;
108
109 using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
110 // using LSTMDescriptors_t = RNN::TRNNDescriptors<LSTMLayer_t>;
111 // using LSTMWorkspace_t = RNN::TRNNWorkspace<LSTMLayer_t>;
112
113 using GRULayer_t = RNN::TBasicGRULayer<TCudnn<AFloat>>;
114 // using GRUDescriptors_t = RNN::TRNNDescriptors<GRULayer_t>;
115 // using GRUWorkspace_t = RNN::TRNNWorkspace<GRULayer_t>;
116
117 // template <typename AFloat>
118 // using ConvDescriptors_t = CNN::TCNNDescriptors<CNN::TConvLayer<TCudnn<AFloat>>>;
119
120 // convolution options
121 // default is -1 (left to cudnn)
122 struct CNNOptions {
123
124 static int ConvFwdAlgorithm;
125 static int ConvBwdDataAlgorithm;
126 static int ConvBwdFilterAlgorithm;
127 // default is 0 (left to cudnn : a value -1 will indicate to not use any space)
128 static Long_t ConvMaxWorkspaceSize;
129 }; // struct CNNOptions
130
131 static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::RowMajor; }
132
133
134 static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
135 return Tensor_t( {n,c,h,w}, GetTensorLayout(), 0, 0);
136 }
137
138 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
139 return Tensor_t( buffer, {n,c,h,w}, GetTensorLayout(), 0, 0);
140 }
141
142 static Tensor_t CreateTensor(size_t n, size_t c, size_t w)
143 {
144 return Tensor_t({n, c, w}, GetTensorLayout(), 0, 0);
145 }
146
147 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t w)
148 {
149 return Tensor_t(buffer, {n, c, w}, GetTensorLayout(), 0, 0);
150 }
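 // Usage sketch (illustrative only): tensors are created with the row-major
 // (NCHW) memory layout of this backend, optionally as a view on an existing
 // device buffer. The sizes below are arbitrary.
 //
 //   using Arch_t = TMVA::DNN::TCudnn<float>;
 //   auto t4 = Arch_t::CreateTensor(32, 3, 28, 28);      // batch, channels, height, width
 //   Arch_t::DeviceBuffer_t buf(32 * 16 * 100);
 //   auto t3 = Arch_t::CreateTensor(buf, 32, 16, 100);   // view on an existing device buffer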
151
152 static bool IsCudnn() { return true; }
153
154 // create a weight tensor/matrix vector from another tensor/weight vector using the given tensor shapes
155 // this function is used by the optimizers to store intermediate weights representations
156 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
157 if (!newWeights.empty()) newWeights.clear();
158 size_t n = weights.size();
159 for (size_t i = 0; i < n; ++i)
160 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
161 }
162 //____________________________________________________________________________
163 //
164 // Architecture Initialization
165 //____________________________________________________________________________
166
167 static void InitializeBNormDescriptors(TDescriptors * & descriptors,
168 BNormLayer_t *L = nullptr);
169
170 static void InitializeConvDescriptors(TDescriptors * & descriptors,
171 ConvLayer_t *L = nullptr);
172
173 static void InitializePoolDescriptors(TDescriptors * & descriptors,
174 PoolingLayer_t *L = nullptr);
175
176 static void InitializeRNNDescriptors(TDescriptors *&descriptors, RNNLayer_t *layer)
177 {
178 InitializeRecurrentDescriptors<RNNLayer_t>(descriptors, layer);
179 }
180 static void InitializeLSTMDescriptors(TDescriptors *&descriptors, LSTMLayer_t *layer) {
181 InitializeRecurrentDescriptors<LSTMLayer_t>(descriptors, layer);
182 }
183 static void InitializeGRUDescriptors(TDescriptors *&descriptors, GRULayer_t *layer) {
184 InitializeRecurrentDescriptors<GRULayer_t>(descriptors, layer);
185 }
186 template<typename RNNLayer>
187 static void InitializeRecurrentDescriptors(TDescriptors *&descriptors, RNNLayer *L);
188 // static void InitializeRNNDescriptors(TDescriptors *&descriptors, LSTMLayer_t *L = nullptr);
189 // static void InitializeRNNDescriptors(TDescriptors *&descriptors, GRULayer_t *L = nullptr);
190
191 static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors, EActivationFunction activFunc, double coef = 0.0);
192
193 static void ReleaseConvDescriptors(TDescriptors * descriptors );
194 static void ReleasePoolDescriptors(TDescriptors * descriptors );
195 static void ReleaseRNNDescriptors(TDescriptors *descriptors);
196 static void ReleaseBNormDescriptors(TDescriptors * descriptors );
197 static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {} // Does nothing
198 static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
199 static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
200 static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
201 static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
202 static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
203 static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
204
205
206 static void InitializeConvWorkspace(TWorkspace * & workspace,
207 TDescriptors * & descriptors,
208 const DNN::CNN::TConvParams & params,
209 ConvLayer_t *L = nullptr);
210 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
211 TDescriptors * & descriptors,
212 const DNN::CNN::TConvParams & params,
213 PoolingLayer_t *L = nullptr);
214
215 static void InitializeRNNWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer_t *layer)
216 {
217 InitializeRecurrentWorkspace<RNNLayer_t>(workspace, descriptors, layer);
218 }
219 static void InitializeLSTMWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, LSTMLayer_t *layer)
220 {
221 InitializeRecurrentWorkspace<LSTMLayer_t>(workspace, descriptors, layer);
222 }
223 static void InitializeGRUWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, GRULayer_t *layer)
224 {
225 InitializeRecurrentWorkspace<GRULayer_t>(workspace, descriptors, layer);
226 }
227 template<typename RNNLayer>
228 static void InitializeRecurrentWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors,
229 RNNLayer *layer);
230
231 static void FreeConvWorkspace(TWorkspace * workspace);
232 static void FreePoolDropoutWorkspace(TWorkspace * workspace);
233 static void FreeRNNWorkspace(TWorkspace *workspace);
234
235 // tensor initialization for recurrent networks
236 static void InitializeRNNTensors(RNNLayer_t *layer) { InitializeRecurrentTensors<RNNLayer_t>(layer); }
237 static void InitializeLSTMTensors(LSTMLayer_t *layer) { InitializeRecurrentTensors<LSTMLayer_t>(layer); }
238 static void InitializeGRUTensors(GRULayer_t *layer) { InitializeRecurrentTensors<GRULayer_t>(layer); }
239 template <typename RNNLayer>
240 static void InitializeRecurrentTensors(RNNLayer *layer);
241
242 //____________________________________________________________________________
243 //
244 // Propagation
245 //____________________________________________________________________________
246
247 /** @name Forward Propagation
248 * Low-level functions required for the forward propagation of activations
249 * through the network.
250 */
251 ///@{
252 /** Matrix-multiply \p input with the transpose of \p weights and
253 * write the results into \p output. */
254 static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
255
256 /** Add the vectors biases row-wise to the matrix output */
257 static void AddRowWise(Tensor_t &output,const Matrix_t &biases);
258
259 /** @name Backward Propagation (Dense Layers)
260 * Low-level functions required for the backward propagation of gradients
261 * through the network.
262 */
263 ///@{
264 /** Perform the complete backward propagation step. If the provided
265 * \p activationGradientsBackward matrix is not empty, compute the
266 * gradients of the objective function with respect to the activations
267 * of the previous layer (backward direction).
268 * Also compute the weight and the bias gradients. Modifies the values
269 * in \p df and thus produces a valid result only if it is applied the
270 * first time after the corresponding forward propagation has been
271 * performed. */
272 static void Backward(Tensor_t & activationGradientsBackward,
273 Matrix_t & weightGradients,
274 Matrix_t & biasGradients,
275 Tensor_t & df,
276 const Tensor_t & activationGradients,
277 const Matrix_t & weights,
278 const Tensor_t & activationBackward);
279
280 /** Above functions extended to vectors */
281 static void ScaleAdd(Tensor_t & A, const Tensor_t & B,
282 Scalar_t alpha = 1.0,
283 Scalar_t beta = 1.0);
284
285 /** Deep copy from B to A. */
286 static void Copy(Tensor_t & A, const Tensor_t & B);
287
288 // copy from another tensor
289 template<typename ATensor_t>
290 static void CopyDiffArch(Tensor_t & A,
291 const ATensor_t & B);
292
293 template <typename ATensor_t>
294 static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);
295
296 //template<>
297 static void CopyDiffArch(Tensor_t A, const Tensor_t & B ) { Copy(A,B); }
298
299 // copy from vector of matrices of different types
300 template<typename AMatrix_t>
301 static void CopyDiffArch(std::vector<Tensor_t> & A,
302 const std::vector<AMatrix_t> & B);
303
304
305 //____________________________________________________________________________
306 //
307 // Activation Functions
308 //____________________________________________________________________________
309
310 /** @name Activation Functions
311 * For each activation function, the low-level interface contains two routines.
312 * One that applies the activation function to a matrix and one that evaluates
313 * the derivatives of the activation function at the elements of a given matrix
314 * and writes the results into the result matrix.
315 */
316 ///@{
317 static void Identity(Tensor_t & X) {}
318 static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
319 Tensor_t & Y, Tensor_t & dY,
320 ActivationDescriptor_t activationDescr,
321 const AFloat alpha = 1,
322 const AFloat beta = 1) {}
323
324 static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
325 const ActivationDescriptor_t activationDescr,
326 const double coef = 0.0, const AFloat alpha = 1,
327 const AFloat beta = 0);
328
329 // same as above but using different input/output tensors
330 static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t & X, EActivationFunction activFunct,
331 const ActivationDescriptor_t activationDescr, const double coef = 0.0,
332 const AFloat alpha = 1, const AFloat beta = 0);
333
334 /** Computes the gradient of the activation function */
335 static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
336 const Tensor_t & dY, const Tensor_t & X,
337 EActivationFunction activFunct,
338 const ActivationDescriptor_t activationDescr,
339 const AFloat alpha = 1,
340 const AFloat beta = 0);
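 // Typical call pattern (sketch; assumes a ReLU activation and tensors X, Y,
 // dX, dY of this architecture):
 //
 //   ActivationDescriptor_t reluDescr;
 //   InitializeActivationDescriptor(reluDescr, EActivationFunction::kRelu);
 //   ActivationFunctionForward(Y, X, EActivationFunction::kRelu, reluDescr);      // Y = relu(X)
 //   ActivationFunctionBackward(dX, Y, dY, X, EActivationFunction::kRelu, reluDescr);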
341
342 //
343 // No cudnn implementation for the following activation functions
344 //
345 //static void SymmetricRelu(Tensor_t & B);
346
347 // implementations not used by Cudnn
348 static void Relu(Tensor_t &) {}
349 static void Sigmoid(Tensor_t &) {}
350 static void Tanh(Tensor_t &) {}
351 static void FastTanh(Tensor_t &) {}
352 static void SymmetricRelu(Tensor_t &) {}
353 static void SoftSign(Tensor_t &) {}
354 static void Gauss(Tensor_t &) {}
355
356 static void IdentityDerivative(Tensor_t &, const Tensor_t &) {}
357 static void ReluDerivative(Tensor_t &, const Tensor_t &) {}
358 static void SigmoidDerivative(Tensor_t &, const Tensor_t &) {}
359 static void TanhDerivative(Tensor_t &, const Tensor_t &) {}
360 static void FastTanhDerivative(Tensor_t &, const Tensor_t &) {}
361 static void SymmetricReluDerivative(Tensor_t & , const Tensor_t & ) {}
362 static void SoftSignDerivative(Tensor_t & , const Tensor_t & ) {}
363 static void GaussDerivative(Tensor_t & , const Tensor_t & ) {}
364 ///@}
365
366 //____________________________________________________________________________
367 //
368 // Loss Functions
369 //____________________________________________________________________________
370
371 /** @name Loss Functions
372 * Loss functions compute a scalar value given the \p output of the network
373 * for a given training input and the expected network prediction \p Y that
374 * quantifies the quality of the prediction. For each function a routine
375 * that computes the gradients (suffixed by Gradients) must also be provided
376 * to start the backpropagation algorithm.
377 */
378 ///@{
379
380 static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
381 const Matrix_t &weights);
382 static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
383 const Matrix_t &output, const Matrix_t &weights);
384
385 /** Sigmoid transformation is implicitly applied, thus \p output should
386 * hold the linear activations of the last layer in the net. */
387 static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
388 const Matrix_t &weights);
389
390 static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
391 const Matrix_t &output, const Matrix_t &weights);
392
393 /** Softmax transformation is implicitly applied, thus \p output should
394 * hold the linear activations of the last layer in the net. */
395 static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
396 const Matrix_t &weights);
397 static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
398 const Matrix_t &output, const Matrix_t &weights);
399 ///@}
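 // Sketch of how a loss and its gradients are typically evaluated together
 // (tensor names are illustrative):
 //
 //   Scalar_t loss = CrossEntropy(Y, output, weights);
 //   CrossEntropyGradients(dY, Y, output, weights);   // dY seeds the backward pass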
400
401 //____________________________________________________________________________
402 //
403 // Output Functions
404 //____________________________________________________________________________
405
406 /** @name Output Functions
407 * Output functions transform the activations \p output of the
408 * output layer in the network to a valid prediction \p YHat for
409 * the desired usage of the network, e.g. the identity function
410 * for regression or the sigmoid transformation for two-class
411 * classification.
412 */
413 ///@{
414 static void Sigmoid(Matrix_t &YHat,
415 const Matrix_t & );
416 static void Softmax(Matrix_t &YHat,
417 const Matrix_t & );
418 ///@}
419
420
421
422 //____________________________________________________________________________
423 //
424 // Dropout
425 //____________________________________________________________________________
426
427 /** @name Dropout
428 */
429 ///@{
430
431 /** Apply dropout with activation probability \p p to the given
432 * tensor \p A and scale the result by reciprocal of \p p. */
433 static void DropoutForward(Tensor_t & A,
434 TDescriptors * descriptors,
435 TWorkspace * workspace,
436 Scalar_t p);
437
438 static void DropoutBackward(Tensor_t & A,
439 TDescriptors * descriptors,
440 TWorkspace * workspace);
441
442 ///@}
443
444 //____________________________________________________________________________
445 //
446 // Batch Normalization
447 //____________________________________________________________________________
448
449 /** @name Batch Normalization Layer Propagation
450 */
451 ///@{
452
453 /** The inputs from each batch are normalized during training to have zero mean and unit variance,
454 * and they are then scaled by two parameters, different for each input variable:
455 * - a scale factor \f$\gamma\f$
456 * - an offset \f$\beta\f$ */
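 // Schematically, for every input variable the training-time transformation is
 //   y = gamma * (x - mean) / sqrt(variance + epsilon) + beta,
 // while the running means and variances are updated with the given momentum.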
457
458 static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
459 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
460 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
461 Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
462
463 /** During inference the inputs are not normalized using the batch mean and variance but the
464 * running mean and variance computed during training */
465
466 static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
467 Tensor_t &y, const Matrix_t &runningMeans,
468 const Matrix_t &runningVars, Scalar_t epsilon,
469 const TensorDescriptor_t &);
470
471 static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
472 Matrix_t &gamma, // Matrix_t &beta, (not needed)
473 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
474 const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
475
476 //____________________________________________________________________________
477 //
478 // Regularization
479 //____________________________________________________________________________
480
481 /** @name Regularization
482 * For each regularization type two functions are required, one named
483 * <tt><Type>Regularization</tt> that evaluates the corresponding
484 * regularization functional for a given weight matrix and the
485 * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
486 * component in the gradients to the provided matrix.
487 */
488
489 static Scalar_t L1Regularization(const Matrix_t &W)
490 {
491 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
492 return TCuda<AFloat>::L1Regularization(mW);
493 }
494 static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
495 {
496 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
497 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
498 return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
499 }
500
501 static Scalar_t L2Regularization(const Matrix_t &W)
502 {
503 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
504 return TCuda<AFloat>::L2Regularization(mW);
505 }
506 static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
507 {
508 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
509 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
510 return TCuda<AFloat>::AddL2RegularizationGradients(mA, mW, weightDecay);
511 }
512 ///@}
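 // Sketch of a typical use in a training step (weight matrix W, gradient dW
 // and weightDecay are illustrative):
 //
 //   Scalar_t penalty = weightDecay * L2Regularization(W);   // added to the loss
 //   AddL2RegularizationGradients(dW, W, weightDecay);       // added to the gradients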
513
514 //____________________________________________________________________________
515 //
516 // Initialization
517 //____________________________________________________________________________
518
519 /** @name Initialization
520 * For each initialization method, one function in the low-level interface
521 * is provided. The naming scheme is <tt>Initialize<Type></tt> for a given
522 * initialization method Type.
523 */
524 ///@{
525
526 static void InitializeGauss(Matrix_t &A);
527 static void InitializeUniform(Matrix_t &A);
528 static void InitializeIdentity(Matrix_t &A);
529 static void InitializeZero(Matrix_t &A);
530 static void InitializeGlorotNormal(Matrix_t &A);
531 static void InitializeGlorotUniform(Matrix_t &A);
532
533 // return static instance of random generator used for initialization
534 // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
535 static TRandom &GetRandomGenerator();
536 // set random seed for the static generator
537 // if the static generator does not exist, it is created
538 static void SetRandomSeed(size_t seed);
539 ///@}
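 // Sketch (illustrative): fixing the seed before initializing a weight matrix W
 // makes the initialization reproducible.
 //
 //   SetRandomSeed(1234);
 //   InitializeGlorotUniform(W);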
540
541 //____________________________________________________________________________
542 //
543 // Dropout
544 //____________________________________________________________________________
545
546 /** @name Dropout
547 */
548 ///@{
549
550 /** Apply dropout with activation probability \p p to the given
551 * tensor \p A and scale the result by reciprocal of \p p. */
552 static void Dropout(Tensor_t &A, Scalar_t p) {}
553
554 ///@}
555
556 //____________________________________________________________________________
557 //
558 // Convolutional Layer Propagation
559 //____________________________________________________________________________
560
561 /** @name Forward Propagation in Convolutional Layer
562 */
563 ///@{
564
565 /** Add the biases in the Convolutional Layer. */
566 static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
567 ///@}
568
569 /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
570 static void PrepareInternals(Tensor_t &) {}
571
572 /** Forward propagation in the Convolutional layer */
573 static void ConvLayerForward(Tensor_t &output,
574 Tensor_t &inputActivationFunc, // this is output conv w/o activ func.
575 const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases,
576 const DNN::CNN::TConvParams &params, EActivationFunction activFunc,
577 Tensor_t & /* inputPrime */, const ConvDescriptors_t &descriptors,
578 ConvWorkspace_t &workspace);
579 // const AFloat alpha = 1,
580 // const AFloat beta = 1);
581
582 /** @name Backward Propagation in Convolutional Layer
583 */
584 ///@{
585
586 /** Perform the complete backward propagation step in a Convolutional Layer.
587 * If the provided \p activationGradientsBackward matrix is not empty, compute the
588 * gradients of the objective function with respect to the activations
589 * of the previous layer (backward direction).
590 * Also compute the weight and the bias gradients. Modifies the values
591 * in \p df and thus produces a valid result only if it is applied the
592 * first time after the corresponding forward propagation has been
593 * performed. */
594 static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
595 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
596 const Matrix_t &weights, const Tensor_t &activationBackward,
597 const Tensor_t &outputTensor, EActivationFunction activFunc,
598 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace, size_t /*batchSize*/,
599 size_t /*inputHeight*/, size_t /*inputWidth*/, size_t /*depth*/, size_t /*height*/,
600 size_t /*width*/, size_t /*filterDepth*/, size_t /*filterHeight*/,
601 size_t /*filterWidth*/, size_t /*nLocalViews*/);
602
603 ///@}
604
605 //____________________________________________________________________________
606 //
607 // Max Pooling Layer Propagation
608 //____________________________________________________________________________
609 /** @name Forward Propagation in Max Pooling Layer
610 */
611 ///@{
612
613 /** Downsample the matrix \p C to the matrix \p A, using max
614 * operation, such that the winning indices are stored in matrix
615 * \p B. No winning indices needed for cuDNN. */
616 static void Downsample(Tensor_t &A, Tensor_t & /*B*/, const Tensor_t &C, const PoolingDescriptors_t &descriptors,
617 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
618 size_t fltWidth, size_t strideRows, size_t strideCols);
619
620 ///@}
621
622 /** @name Backward Propagation in Max Pooling Layer
623 */
624 ///@{
625 /** Perform the complete backward propagation step in a Pooling Layer. Based on the
626 * input to and output from the MaxPoolLayer, the gradients for the winning pixels
627 * are computed. */
628 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
629 const Tensor_t & /*indexMatrix*/, const Tensor_t &inputActivation,
630 const Tensor_t &outputTensor, const PoolingDescriptors_t &descriptors,
631 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
632 size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews);
633
634 ///@}
635
636 //____________________________________________________________________________
637 //
638 // Reshape Layer Propagation
639 //____________________________________________________________________________
640 /** @name Forward and Backward Propagation in Reshape Layer
641 */
642 ///@{
643
644 /** Transform the matrix \p B to a matrix with different dimensions \p A */
645 // static void Reshape(Matrix_t &A, const Matrix_t &B);
646
647 /** Flattens the tensor \p B, such that each matrix is stretched into
648 * one row, resulting in the matrix \p A. */
649 static void Flatten(Tensor_t &A, const Tensor_t &B);
650
651 /** Transforms each row of \p B to a matrix and stores it in the
652 * tensor \p A. */
653 static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
654
655 /** Rearrange data according to time: fill the B x T x D output with the T x B x D input matrix */
656 static void Rearrange(Tensor_t &out, const Tensor_t &in);
657
658 // RNN functions
659 static void RNNForward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &weights,
660 Tensor_t &y, Tensor_t &hy, Tensor_t &cy, const RNNDescriptors_t &descr,
661 RNNWorkspace_t &workspace, bool isTraining);
662
663 static void RNNBackward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &y, const Tensor_t &dy,
664 const Tensor_t &dhy, const Tensor_t &dcy, const Tensor_t &weights, Tensor_t &dx, Tensor_t &dhx,
665 Tensor_t &dcx, Tensor_t &dw, const RNNDescriptors_t &desc, RNNWorkspace_t &workspace);
666
667
668 // Backward-pass functions for recurrent networks, used by other architectures
669 //******************************************************************************************
670 static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
671 Matrix_t & /* input_weight_gradients */,
672 Matrix_t & /* state_weight_gradients */, Matrix_t & /* bias_gradients */,
673 Matrix_t & /* df */, // DxH
674 const Matrix_t & /* state */, // BxH
675 const Matrix_t & /* weights_input */, // HxD
676 const Matrix_t & /* weights_state */, // HxH
677 const Matrix_t & /* input */, // BxD
678 Matrix_t & /* input_gradient */)
679 {
680 return state_gradients_backward;
681 }
682 static Matrix_t &LSTMLayerBackward(
683 Matrix_t & state_gradients_backward , Matrix_t & /*cell_gradients_backward*/,
684 Matrix_t & /*input_weight_gradients*/, Matrix_t & /*forget_weight_gradients*/,
685 Matrix_t & /*candidate_weight_gradients*/, Matrix_t & /*output_weight_gradients*/,
686 Matrix_t & /*input_state_weight_gradients*/, Matrix_t & /*forget_state_weight_gradients*/,
687 Matrix_t & /*candidate_state_weight_gradients*/,
688 Matrix_t & /*output_state_weight_gradients*/, Matrix_t & /*input_bias_gradients*/,
689 Matrix_t & /*forget_bias_gradients*/, Matrix_t & /*candidate_bias_gradients*/,
690 Matrix_t & /*output_bias_gradients*/, Matrix_t & /*di*/, Matrix_t & /*df*/,
691 Matrix_t & /*dc*/, Matrix_t & /*dout*/,
692 const Matrix_t & /*precStateActivations*/, const Matrix_t & /*precCellActivations*/,
693 const Matrix_t & /*fInput*/, const Matrix_t & /*fForget*/,
694 const Matrix_t & /*fCandidate*/, const Matrix_t & /*fOutput*/,
695 const Matrix_t & /*weights_input*/, const Matrix_t & /*weights_forget*/,
696 const Matrix_t & /*weights_candidate*/, const Matrix_t & /*weights_output*/,
697 const Matrix_t & /*weights_input_state*/, const Matrix_t & /*weights_forget_state*/,
698 const Matrix_t & /*weights_candidate_state*/, const Matrix_t & /*weights_output_state*/,
699 const Matrix_t & /*input*/, Matrix_t & /*input_gradient*/,
700 Matrix_t & /*cell_gradient*/, Matrix_t & /*cell_tanh*/)
701 {
702 return state_gradients_backward;
703 }
704
705 /** Backward pass for GRU Network */
706 static Matrix_t &GRULayerBackward(
707 Matrix_t & state_gradients_backward, Matrix_t & /*reset_weight_gradients*/,
708 Matrix_t & /*update_weight_gradients*/, Matrix_t & /*candidate_weight_gradients*/,
709 Matrix_t & /*reset_state_weight_gradients*/, Matrix_t & /*update_state_weight_gradients*/,
710 Matrix_t & /*candidate_state_weight_gradients*/, Matrix_t & /*reset_bias_gradients*/,
711 Matrix_t & /*update_bias_gradients*/, Matrix_t & /*candidate_bias_gradients*/,
712 Matrix_t & /*dr*/, Matrix_t & /*du*/, Matrix_t & /*dc*/,
713 const Matrix_t & /*precStateActivations*/, const Matrix_t & /*fReset*/,
714 const Matrix_t & /*fUpdate*/, const Matrix_t & /*fCandidate*/,
715 const Matrix_t & /*weights_reset*/, const Matrix_t & /*weights_update*/,
716 const Matrix_t & /*weights_candidate*/, const Matrix_t & /*weights_reset_state*/,
717 const Matrix_t & /*weights_update_state*/, const Matrix_t & /*weights_candidate_state*/,
718 const Matrix_t & /*input*/, Matrix_t & /*input_gradient*/, bool)
719 {
720 return state_gradients_backward;
721 }
722
723 ///@}
724
725 //____________________________________________________________________________
726 //
727 // Additional Arithmetic Functions
728 //____________________________________________________________________________
729
730 /** @name Additional Arithmetic Functions
731 *
732 * Additional arithmetic on CUDA matrices used to implement the low-level
733 * interface.
734 */
735
736 /** In-place Hadamard (element-wise) product of matrices \p A and \p B
737 * with the result being written into \p A.
738 */
739 static void Hadamard(Tensor_t &A, const Tensor_t &B)
740 {
741 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
742 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
743 assert(A.GetSize() == B.GetSize());
744 TCuda<AFloat>::Hadamard(tmpA, tmpB);
745 }
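 // Note: like several of the helpers below, this views the tensor device
 // buffers as 1 x N TCudaMatrix objects and delegates the element-wise work
 // to the plain CUDA backend TCuda<AFloat>.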
746 // static void Hadamard(Matrix_t &A,
747 // const Matrix_t &B);*/
748 // {
749 // Tensor_t tA(A);
750 // Hadamard( tA, Tensor_t(B));
751 // }
752
753
754 /** Compute the sum of all elements in \p A */
755 static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
756
757 /** Check two matrices for equality, taking floating point arithmetic errors into account. */
758 //static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
759
760 /** Add the constant \p beta to all the elements of matrix \p A and write the
761 * result into \p A.
762 */
763 static void ConstAdd(Matrix_t &A, Scalar_t beta) {
764 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
765 TCuda<AFloat>::ConstAdd(tmp,beta);
766 }
767
768 /** Multiply all the elements of matrix \p A by the constant \p beta and write the
769 * result into \p A.
770 */
771 static void ConstMult(Matrix_t &A, Scalar_t beta) {
772 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
773 TCuda<AFloat>::ConstMult(tmp,beta);
774 }
775
776 /** Take the reciprocal of each element of the matrix \p A and write the result into
777 * \p A
778 */
779 static void ReciprocalElementWise(Matrix_t &A) {
780 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
781 TCuda<AFloat>::ReciprocalElementWise(tmp);
782 }
783
784 /** Square each element of the matrix \p A and write the result into
785 * \p A
786 */
787 static void SquareElementWise(Matrix_t &A) {
788 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
789 TCuda<AFloat>::SquareElementWise(tmp);
790 }
791
792 /** Take the square root of each element of the matrix \p A and write the result into
793 * \p A
794 */
795 //static void SqrtElementWise(Matrix_t &A, Scalar_t alpha = 1, Scalar_t beta = 0, Scalar_t gamma = 0) {
796 static void SqrtElementWise(Matrix_t &A) {
797 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
798 TCuda<AFloat>::SqrtElementWise(tmp);
799 }
800
801 // optimizer functions
802 static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
803 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
804 TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
805 TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
806 TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV,alpha, eps);
807 }
808 static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
809 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
810 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
811 TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
812 }
813 static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
814 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
815 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
816 TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
817 }
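 // Sketch of a single Adam step assembled from the helpers above (W are the
 // weights, G the gradients, M and V the moment matrices; beta1, beta2, alpha
 // and eps are the usual Adam hyper-parameters - names are illustrative):
 //
 //   AdamUpdateFirstMom(M, G, beta1);   // update of the first moment with the gradient
 //   AdamUpdateSecondMom(V, G, beta2);  // update of the second moment
 //   AdamUpdate(W, M, V, alpha, eps);   // apply the step to the weights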
818
819 // printing of tensor
820 static void PrintTensor( const Tensor_t & A, const std::string name = "tensor", bool = false);
821
822 static void PrintTensor4dDescriptor(TensorDescriptor_t descriptor);
823 static void PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int n = 10);
824
825 ///////////////////////////////////////////////////////////////////////////////
826 /// extra functions defined only for CPU architecture !!!
827 //////////////////////////////////////////////////////////////////////////////
828
829 /** Sum rows of (m x n) matrix \p A and write the results into the first
830 * m elements in \p B.
831 */
832 static void SumRows(Matrix_t &B, const Matrix_t &A);
833};
834
835
836//____________________________________________________________________________
837template <typename AFloat>
838template <typename ATensor>
839void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B,
840 const ATensor &A)
841{
842
843 // should add a static assert that A is not the same type as B
844
845 // this copies tensors between different architectures
846 if (B.GetLayout() == GetTensorLayout()) {
847 if ( B.GetShape().size() == 4) {
848 assert(B.GetShape().size() == 4);
849 size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
850 for (size_t i = 0; i < firstSize; ++i) {
851 TMatrixT<AFloat> matIn = A.At(i).GetMatrix(); // this converts the tensor (B,D,HW) into the matrices (D,HW)_i
852 // TMatrixT has the correct layout (row-major), no need to transpose in this case
853 TCudaTensor<AFloat> tmpOut = B.At(i); // matrix (D,HW)
854 // copy will copy the buffer
855 TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
856 Copy(tmpOut, tmpIn);
857 }
858 }
859 else {
860 // for RNN weights
861 TMatrixT<AFloat> tmp = A; // convert first to a host TMatrixT (as in CopyWeightsDiffArch below)
862 TCudaMatrix<AFloat> tmp2(tmp);
863 TCudaTensor<AFloat> tA(tmp2);
864 Copy(B, tA);
865 }
866 } else {
867 // case of same layout (column major)
868 TMatrixT<AFloat> tmp = A; // convert first to a host TMatrixT (as in CopyWeightsDiffArch below)
869 TCudaMatrix<AFloat> tmp2(tmp);
870 TCudaTensor<AFloat> tA(tmp2);
871 Copy(B, tA);
872 }
873}
874
875//____________________________________________________________________________
876template <typename AFloat>
877template <typename AMatrix>
878void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
879{
880 // copy from another architecture using the reference one
881 // this is not very efficient since it creates temporary objects
882 TMatrixT<AFloat> tmp = A; // .GetMatrix();
883 // we need to transpose for a different layout
884 if (B.GetLayout() == GetTensorLayout() ) {
885 // this is for CNN weights that are in row-major formats
886 //assert(B.GetShape().size() == 4); // weights shape should be 4
887 tmp.T();
888 }
889 TCudaMatrix<AFloat> tmp2(tmp);
890 TCudaTensor<AFloat> tA(tmp2);
891 Copy(B, tA);
892}
893
894//____________________________________________________________________________
895template <typename AFloat>
896template <typename AMatrix_t>
897void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B,
898 const std::vector<AMatrix_t> &A)
899{
900 for (size_t i = 0; i < B.size(); ++i) {
901 CopyWeightsDiffArch(B[i], A[i]);
902 }
903}
904
905template <typename AFloat>
906void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t & A, const std::string name, bool truncate )
907{
908 std::cout << name << " size = " << A.GetSize() << " shape = { ";
909 auto shape = A.GetShape();
910 for (size_t k = 0; k < shape.size()-1; ++k)
911 std::cout << shape[k] << " , ";
912 std::cout << shape.back() << " } ";
913 std::cout << " strides = { ";
914 auto strides = A.GetStrides();
915 for (size_t k = 0; k < strides.size()-1; ++k)
916 std::cout << strides[k] << " , ";
917 std::cout << strides.back() << " }\n ";
918
919 if (A.GetShape().size() == 2 ) {
920 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
921 std::cout << "{ ";
922 size_t n = A.GetShape()[1];
923 if (truncate) n = std::min(n,size_t(10));
924 for (size_t j = 0; j < n; ++j) {
925 std::cout << A(i,j) << " ";
926
927 }
928 if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
929 std::cout << " } " << std::endl;
930 }
931 } else if (A.GetShape().size() == 3 ) {
932 for (size_t i = 0; i < A.GetFirstSize(); ++i) {
933 std::cout << "{ ";
934 for (size_t j = 0; j < A.GetHSize(); ++j) {
935 std::cout << "{ ";
936 size_t n = A.GetWSize();
937 if (truncate) n = std::min(n,size_t(10));
938 for (size_t k = 0; k < n; ++k) {
939 std::cout << A(i,j,k) << " ";
940 }
941 if (truncate && n < A.GetWSize()) std::cout << " ...... ";
942 std::cout << " } " << std::endl;
943 }
944 std::cout << " } " << std::endl;
945 }
946 } else if (A.GetShape().size() == 4 ) {
947 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
948 std::cout << "{ ";
949 for (size_t j = 0; j < A.GetShape()[1]; ++j) {
950 std::cout << "{ ";
951 for (size_t k = 0; k < A.GetShape()[2]; ++k) {
952 size_t n = A.GetShape()[3];
953 if (truncate) n = std::min(n,size_t(10));
954 for (size_t l = 0; l < n; ++l) {
955 std::cout << A(i,j,k,l) << " ";
956 }
957 if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
958 std::cout << " } " << std::endl;
959 }
960 std::cout << " } " << std::endl;
961 }
962 std::cout << " } " << std::endl;
963 }
964 }
965 else {
966 for (size_t l = 0; l < A.GetSize(); ++l) {
967 std::cout << A.GetData()[l] << " ";
968 }
969 std::cout << "\n";
970 }
971}
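// Example (illustrative): PrintTensor(t, "activations", true) prints the shape,
// the strides and a truncated view of the contents of the tensor t.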
972
973template <typename AFloat>
974void TCudnn<AFloat>::PrintTensor4dDescriptor(TensorDescriptor_t descriptor) {
975 int n = 0, c = 0, h = 0, w = 0;
976 int s1 = 0, s2 = 0, s3 = 0, s4 = 0;
977 cudnnDataType_t dataType;
978 cudnnGetTensor4dDescriptor(descriptor, &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
979 std::cout << "Descriptor for 4d tensor of shape { " << n << " , " << c << " , " << h << " , " << w << " }"
980 << " and strides { " << s1 << " , " << s2 << " , " << s3 << " , " << s4 << " }" << std::endl;
981}
982template <typename AFloat>
983void TCudnn<AFloat>::PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int ndim)
984{
985 int n = 0;
986 std::vector<int> dims(ndim);
987 std::vector<int> strides(ndim);
988 cudnnDataType_t dataType;
989 cudnnGetTensorNdDescriptor(descriptor, ndim, &dataType, &n, dims.data(), strides.data());
990 dims.resize(n);
991 strides.resize(n);
992 std::cout << "Descriptor for Nd tensor of dim = " << n << " shape { ";
993 for (auto d : dims)
994 std::cout << d << " , ";
995 std::cout << "} and strides { ";
996 for (auto s : strides)
997 std::cout << s << " , ";
998 std::cout << " }" << std::endl;
999}
1000
1001// initialize the CNN options
1002// possible options for forward (from 0 to 7)
1003//
1004// 0 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
1005// 1 : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
1006// 6 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
1007// 7 : CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; (lots of memory)
1008
1009// for backward data (from 0 to 5)
1010// 1 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
1011 // 5 : CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
1012
1013template <typename AFloat>
1014int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
1015template <typename AFloat>
1016int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
1017template <typename AFloat>
1018int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
1019template <typename AFloat>
1020 Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1; // -1 lets cuDNN use its defaults
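// Usage sketch (illustrative): the static options above can be set before the
// network is built to force a specific cuDNN algorithm or to cap the workspace
// memory, e.g.
//
//   TMVA::DNN::TCudnn<float>::CNNOptions::ConvFwdAlgorithm = 1;                 // IMPLICIT_PRECOMP_GEMM
//   TMVA::DNN::TCudnn<float>::CNNOptions::ConvMaxWorkspaceSize = 256*1024*1024; // assumed to be in bytes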
1021
1022} // namespace DNN
1023} // namespace TMVA
1024
1025#endif
1026#endif