Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Propagate.cu
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12 //////////////////////////////////////////////////////////////////
13 // Implementation of the functions required for the forward and //
14 // backward propagation of activations through a neural network //
15 // for CUDA architectures using cuDNN library. //
16 //////////////////////////////////////////////////////////////////
17
19
21
23
24#include "TRandom.h"
25
26// #include "TMVA/DNN/Architectures/Cuda/Device.h"
27// #include "Kernels.cuh"*/
28// #include <math.h>
29
30// for std::numeric_limits<T>::max()
31#include <limits>
32
33namespace TMVA {
34namespace DNN {
35
36
37//____________________________________________________________________________
38template<typename AFloat>
39void TCudnn<AFloat>::MultiplyTranspose(TCudaTensor<AFloat> &output,
40 const TCudaTensor<AFloat> &input,
41 const TCudaTensor<AFloat> &weights)
42{
43 //PrintTensor(input,"input to MultTrans");
44 //PrintTensor(weights,"dense layer weights");
45 TCuda<AFloat>::MultiplyTranspose(output, input, weights.GetMatrix());
46 //PrintTensor(input,"output of MultTrans");
47}
48
49//____________________________________________________________________________
50template<typename AFloat>
51void TCudnn<AFloat>::AddRowWise(TCudaTensor<AFloat> &output,
52 const TCudaTensor<AFloat> &biases)
53{
54 TCuda<AFloat>::AddRowWise( output, biases.GetMatrix());
55}
56
57//____________________________________________________________________________
58template<typename AFloat>
59void TCudnn<AFloat>::Backward(TCudaTensor<AFloat> & activation_gradients_backward,
60 TCudaTensor<AFloat> & weight_gradients,
61 TCudaTensor<AFloat> & bias_gradients,
62 TCudaTensor<AFloat> & df,
63 const TCudaTensor<AFloat> & activation_gradients,
64 const TCudaTensor<AFloat> & weights,
65 const TCudaTensor<AFloat> & activation_backward)
66{
67 // use implentation from TCuda
68
69 //std::cout << "\n\n ------ Backward--------\n";
70 //PrintTensor(activation_backward,"input to backward");
71 //PrintTensor(weights,"dense layer weights");
72 //PrintTensor(activation_gradients,"input dy");
73 //PrintTensor(activation_gradients,"df");
74
75 TCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix();
76 TCudaMatrix<AFloat> biasGradMatrix = bias_gradients.GetMatrix();
77
78 TCuda<AFloat>::Backward(activation_gradients_backward,
79 weightGradMatrix,
80 biasGradMatrix,
81 df,
82 activation_gradients,
83 weights.GetMatrix(),
84 activation_backward);
85
86 //PrintTensor(activation_gradients_backward,"computed dx");
87 //PrintTensor(weight_gradients,"computed dw");
88}
89
90//____________________________________________________________________________
91template<typename AFloat>
92void TCudnn<AFloat>::Copy(Tensor_t & B, const Tensor_t & A)
93{
94 size_t nElements = A.GetSize();
95 R__ASSERT(nElements == B.GetSize());
96
97 cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
98 nElements * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0);
99}
100
101
102
103///////////////////////////////////////////////////////////////////////////////
104// Initialization of the cuDNN objects for the different layers
105// ...
106///////////////////////////////////////////////////////////////////////////////
107template<typename AFloat>
108void TCudnn<AFloat>::InitializeBNormDescriptors(TDescriptors * & descriptors, typename TCudnn<AFloat>::BNormLayer_t *L)
109{
110 auto bnormDescriptors = new BNormDescriptors_t ();
111
112 // reshaped tensors of bnorm layer - look if output tensor has right shape
113 Tensor_t &outputTensor = L->GetOutput();
114 Tensor_t &data = L->GetReshapedData();
115 if (L->GetNormAxis() == -1 && L->GetBatchSize() == outputTensor.GetShape()[0] && L->GetDepth() == 1 && L->GetHeight() == 1 ) {
116 // case of dense layer before - need to reshape the data
117 R__ASSERT(outputTensor.GetLayout() != GetTensorLayout()); // has to be output column major
118 // case of convolutions layer before
119 Tensor_t &data = L->GetReshapedData();
120 // it is not important which buffer we use. Important is the shape and layout of tensor
121 data = Tensor_t(outputTensor.GetDeviceBuffer(), {1, L->GetWidth(), 1, L->GetBatchSize()}, GetTensorLayout(), 0, 0);
122 } else if (L->GetNormAxis() == 1 ) {
123 // case of convolutional layer before
124 outputTensor.PrintShape("output");
125 Tensor_t tmp( {L->GetBatchSize() , L->GetDepth(), L->GetHeight(), L->GetWidth()}, GetTensorLayout(), 0, 0);
126 tmp.PrintShape("tmp");
127 data = Tensor_t(outputTensor.GetDeviceBuffer(), {L->GetBatchSize() , L->GetDepth(), L->GetHeight(), L->GetWidth() }, GetTensorLayout(), 0, 0);
128
129
130 // reshape output tensor and activation gradient tensor of pool layer
131 outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),
132 {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
133 GetTensorLayout(), 0, 0 );
134
135 Tensor_t &activationGradients = L->GetActivationGradients();
136 activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),
137 outputTensor.GetShape(), GetTensorLayout(), 0, 0);
138 outputTensor.PrintShape("output2");
139
140 }
141
142 outputTensor.PrintShape("output bnorm");
143 data.PrintShape("reshaped data");
144
145 // Tensor_t &activationGradients = L->GetActivationGradients();
146 // activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(), outputTensor.GetShape(), GetTensorLayout(), 0, 0);
147
148 CUDNNCHECK(cudnnCreateTensorDescriptor(&bnormDescriptors->HelperDescriptor));
149
150 cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
151
152 // CUDNN_BATCHNORM_PER_ACTIVATION;
153 // if (L->GetNormAxis() == 1)
154 // bnMode = CUDNN_BATCHNORM_SPATIAL;
155
156 CUDNNCHECK(cudnnDeriveBNTensorDescriptor(bnormDescriptors->HelperDescriptor,
157 data.GetTensorDescriptor(),
158 bnMode) );
159
160 descriptors = bnormDescriptors;
161}
162
163template <typename AFloat>
164void TCudnn<AFloat>::InitializeActivationDescriptor(TCudnn<AFloat>::ActivationDescriptor_t &descriptor,
165 EActivationFunction activFunc, double coef)
166{
167 cudnnActivationMode_t activationMode;
168 bool isIdentity = false;
169 switch (activFunc) {
171 isIdentity = true;
172 break; // Identity activation only works for cudnnConvolutionBiasActivationForward()
174 activationMode = CUDNN_ACTIVATION_RELU;
175 break;
177 activationMode = CUDNN_ACTIVATION_SIGMOID;
178 break;
180 activationMode = CUDNN_ACTIVATION_TANH;
181 break;
183 activationMode = CUDNN_ACTIVATION_TANH;
184 break;
185 // The activations otherwise used are not supported by cuDNN
186 default:
187 activationMode = CUDNN_ACTIVATION_RELU;
188 };
189
190 CUDNNCHECK(cudnnCreateActivationDescriptor(&descriptor));
191
192 // Dont set activation function descriptor for identity function
193 if (!isIdentity) CUDNNCHECK(cudnnSetActivationDescriptor(descriptor, activationMode, CUDNN_PROPAGATE_NAN, coef));
194}
195
196template<typename AFloat>
197void TCudnn<AFloat>::InitializeConvDescriptors(TDescriptors * & descriptors, ConvLayer_t *L) {
198
199 auto convDescriptors = new CNN::TCNNDescriptors<typename TCudnn<AFloat>::ConvLayer_t> ();
200
201 //FIXME: Move this to constructor
202 cudnnDataType_t cudnnDataType;
203 if (std::is_same<AFloat, double>::value) { cudnnDataType = CUDNN_DATA_DOUBLE;}
204 else if (std::is_same<AFloat, float>::value) { cudnnDataType = CUDNN_DATA_FLOAT;}
205
206 double coef = 0.0; // this is a coefficient which can be used for activation (e.g. relu) . it is not yet supported
207 InitializeActivationDescriptor(convDescriptors->HelperDescriptor, L->GetActivationFunction(), coef);
208
209 CUDNNCHECK(cudnnCreateConvolutionDescriptor(&convDescriptors->LayerDescriptor));
210 CUDNNCHECK(cudnnCreateFilterDescriptor(&convDescriptors->WeightsDescriptor));
211
212 // Set the convolution parameters
213 CUDNNCHECK(cudnnSetConvolution2dDescriptor(convDescriptors->LayerDescriptor,
214 L->GetPaddingHeight(),
215 L->GetPaddingWidth(),
216 L->GetStrideRows(),
217 L->GetStrideCols(),
218 1, //Dilation height
219 1, //Dilation width
220 CUDNN_CROSS_CORRELATION,
221 cudnnDataType));
222
223 // Set the filter parameters
224 CUDNNCHECK(cudnnSetFilter4dDescriptor(convDescriptors->WeightsDescriptor,
225 cudnnDataType,
226 CUDNN_TENSOR_NCHW,
227 L->GetDepth(),
228 L->GetInputDepth(),
229 L->GetFilterHeight(),
230 L->GetFilterWidth()));
231
232 descriptors = convDescriptors;
233}
234
235//____________________________________________________________________________
236template <typename AFloat>
237void TCudnn<AFloat>::InitializePoolDescriptors(TDescriptors * & descriptors,
238 PoolingLayer_t *L) {
239 auto poolDescriptors = new CNN::TCNNDescriptors<typename TCudnn<AFloat>::PoolingLayer_t> ();
240 CUDNNCHECK(cudnnCreatePoolingDescriptor(&poolDescriptors->LayerDescriptor));
241
242 CUDNNCHECK(cudnnCreateDropoutDescriptor(&poolDescriptors->HelperDescriptor));
243
244 CUDNNCHECK(cudnnSetPooling2dDescriptor(poolDescriptors->LayerDescriptor,
245 CUDNN_POOLING_MAX,
246 CUDNN_PROPAGATE_NAN,
247 L->GetFilterHeight(),
248 L->GetFilterWidth(),
249 0,//L->GetPaddingHeight()
250 0,//L->GetPaddingWidth()
251 L->GetStrideRows(),
252 L->GetStrideCols()));
253
254
255
256 descriptors = poolDescriptors;
257
258 // reshape output tensor and activation gradient tensor of pool layer
259 Tensor_t &outputTensor = L->GetOutput();
260 outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),
261 {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
262 GetTensorLayout(), 0, 0);
263
264 Tensor_t &activationGradients = L->GetActivationGradients();
265 activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),
266 outputTensor.GetShape(), GetTensorLayout(), 0, 0);
267}
268
269//____________________________________________________________________________
270template<typename AFloat>
271void TCudnn<AFloat>::ReleaseConvDescriptors(TDescriptors * descriptors) {
272 auto convDescriptors = static_cast<ConvDescriptors_t *>(descriptors);
273 ReleaseDescriptor(convDescriptors->LayerDescriptor);
274 ReleaseDescriptor(convDescriptors->HelperDescriptor);
275 ReleaseDescriptor(convDescriptors->WeightsDescriptor);
276}
277
278//____________________________________________________________________________
279template <typename AFloat>
280void TCudnn<AFloat>::ReleasePoolDescriptors(TDescriptors * descriptors) {
281 auto poolDescriptors = static_cast<PoolingDescriptors_t *>(descriptors);
282 ReleaseDescriptor(poolDescriptors->LayerDescriptor);
283 ReleaseDescriptor(poolDescriptors->HelperDescriptor);
284 ReleaseDescriptor(poolDescriptors->WeightsDescriptor);
285}
286
287//____________________________________________________________________________
288template <typename AFloat>
289void TCudnn<AFloat>::ReleaseBNormDescriptors(TDescriptors * descriptors) {
290 auto bnormDescriptors = static_cast<BNormDescriptors_t *>(descriptors);
291 ReleaseDescriptor(bnormDescriptors->HelperDescriptor); // it is a tensor descriptor
292}
293
294//____________________________________________________________________________
295template <typename AFloat>
296void TCudnn<AFloat>::ReleaseDescriptor(ActivationDescriptor_t & activationDescr) {
297 CUDNNCHECK(cudnnDestroyActivationDescriptor(activationDescr));
298}
299
300//____________________________________________________________________________
301template <typename AFloat>
302void TCudnn<AFloat>::ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr) {
303 CUDNNCHECK(cudnnDestroyConvolutionDescriptor(convolutionDescr));
304}
305
306//____________________________________________________________________________
307template <typename AFloat>
308void TCudnn<AFloat>::ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr) {
309 CUDNNCHECK(cudnnDestroyDropoutDescriptor(dropoutDescr));
310}
311//____________________________________________________________________________
312template <typename AFloat>
313void TCudnn<AFloat>::ReleaseDescriptor(TensorDescriptor_t & tensorDescr) {
314 CUDNNCHECK(cudnnDestroyTensorDescriptor(tensorDescr));
315}
316
317//____________________________________________________________________________
318template <typename AFloat>
319void TCudnn<AFloat>::ReleaseDescriptor(FilterDescriptor_t & filterDescr) {
320 CUDNNCHECK(cudnnDestroyFilterDescriptor(filterDescr));
321}
322
323//____________________________________________________________________________
324template <typename AFloat>
325void TCudnn<AFloat>::ReleaseDescriptor(PoolingDescriptor_t & poolingDescr) {
326 CUDNNCHECK(cudnnDestroyPoolingDescriptor(poolingDescr));
327}
328
329
330//____________________________________________________________________________
331template <typename AFloat>
332void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
333 TDescriptors * & descriptors,
334 const DNN::CNN::TConvParams & /*params*/,
335 ConvLayer_t *L) {
336 auto convWorkspace = new ConvWorkspace_t();
337 size_t memLimit = (CNNOptions::ConvMaxWorkspaceSize > 0) ? static_cast<size_t>(CNNOptions::ConvMaxWorkspaceSize) : 0;
338 auto convDescriptors = static_cast<ConvDescriptors_t *>(descriptors);
339 // can we do the following and substitute below???
340 // auto weightsDescriptor{convDescriptors->WeightsDescriptor};
341 // auto convDescriptor{convDescriptors->LayerDescriptor};
342
343#if (CUDNN_VERSION >= 8000)
344 enum algoPreference { no_workspace, fastest, workspace_limit };
345 algoPreference algoChoice;
346 // C++11 lambdas cannot be templated, so we have to do this HORRIBLE stuff...
347 class LocalPerf {
348 public:
349 LocalPerf(cudnnConvolutionFwdAlgoPerf_t * fwd) {m_fwd = fwd;}
350 LocalPerf(cudnnConvolutionBwdFilterAlgoPerf_t * bwdFilter) {m_bwdFilter = bwdFilter;}
351 LocalPerf(cudnnConvolutionBwdDataAlgoPerf_t * bwdData) {m_bwdData = bwdData;}
352 size_t getMemory(int i) {return m_fwd != nullptr ? m_fwd[i].memory : m_bwdFilter != nullptr ? m_bwdFilter[i].memory : m_bwdData != nullptr ? m_bwdData[i].memory : 0;}
353 float getTime(int i) {return m_fwd != nullptr ? m_fwd[i].time : m_bwdFilter != nullptr ? m_bwdFilter[i].time : m_bwdData != nullptr ? m_bwdData[i].time : 0;}
354 cudnnStatus_t getStatus(int i) {return m_fwd != nullptr ? m_fwd[i].status : m_bwdFilter != nullptr ? m_bwdFilter[i].status : m_bwdData != nullptr ? m_bwdData[i].status : CUDNN_STATUS_BAD_PARAM;}
355 int getIdx(algoPreference const & algoPref, int const algoCount, size_t memLim = std::numeric_limits<size_t>::max()) {
356 int algoIdx{0};
357 if (algoPref == algoPreference::fastest) { // prefer fastest
358 float temp_runtime{std::numeric_limits<float>::max()};
359 for (int i = 0; i < algoCount; ++i) {
360 if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime) {
361 temp_runtime = getTime(i);
362 algoIdx = i;
363 }
364 }
365 } else if (algoPref == algoPreference::workspace_limit) { // constrain to workspace size
366 float temp_runtime{std::numeric_limits<float>::max()};
367 for (int i = 0; i < algoCount; ++i) {
368 if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime && getMemory(i) <= memLim) {
369 temp_runtime = getTime(i);
370 algoIdx = i;
371 }
372 }
373 } else { // prefer smallest workspace size
374 size_t temp_memsize{std::numeric_limits<size_t>::max()};
375 for (int i = 0; i < algoCount; ++i) {
376 if (getStatus(i) == CUDNN_STATUS_SUCCESS && getMemory(i) < temp_memsize) {
377 temp_memsize = getMemory(i);
378 algoIdx = i;
379 }
380 }
381 }
382 return algoIdx;
383 };
384 private:
385 LocalPerf();
386 // these three type are absolutely equivalent
387 // and one can access them as they wish to get info
388 cudnnConvolutionFwdAlgoPerf_t *m_fwd = nullptr;
389 cudnnConvolutionBwdFilterAlgoPerf_t *m_bwdFilter = nullptr;
390 cudnnConvolutionBwdDataAlgoPerf_t * m_bwdData = nullptr;
391 };
392#else
393 // More detailed alternative: cudnnFindConvolutionForwardAlgorithm (only option in newer cuDNN versions)
394 cudnnConvolutionFwdPreference_t preferenceFwd;
395 cudnnConvolutionBwdDataPreference_t preferenceBwdData;
396 cudnnConvolutionBwdFilterPreference_t preferenceBwdFilter;
397#endif
398 // decide on algorithm preference early
399 if (CNNOptions::ConvMaxWorkspaceSize < 0) {
400#if (CUDNN_VERSION >= 8000)
401 // fastest overall
402 algoChoice = fastest;
403#else
404 preferenceFwd = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
405 preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
406 preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
407#endif
408
409 } else if (CNNOptions::ConvMaxWorkspaceSize == 0) {
410 // no workspace case
411#if (CUDNN_VERSION >= 8000)
412 algoChoice = no_workspace;
413#else
414 preferenceFwd = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
415 preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
416 preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
417#endif
418
419 } else {
420 // fastest in memory limit
421#if (CUDNN_VERSION >= 8000)
422 algoChoice = workspace_limit;
423#else
424 preferenceFwd = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
425 preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
426 preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
427#endif
428 }
429 // fix the weight tensor shapes
430 // by default the weights are columnmajor, set them to be row major . At this points
431 // they are not yet initialized
432 Tensor_t & filters = L->GetWeightsAt(0);
433 filters = Tensor_t(filters.GetDeviceBuffer(), {L->GetDepth(), L->GetInputDepth(), L->GetFilterHeight(), L->GetFilterWidth()}, MemoryLayout::RowMajor, 0, 0);
434 // PrintTensor(L->GetWeightsAt(0));
435 Tensor_t & biases = L->GetBiasesAt(0);
436 biases = Tensor_t(biases.GetDeviceBuffer(), {1, L->GetDepth(), 1, 1}, GetTensorLayout(), 0, 0);
437
438 Tensor_t & outputTensor = L->GetOutput();
439 outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(), {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()}, GetTensorLayout(), 0, 0);
440 Tensor_t & inputActivation = L->GetInputActivation();
441 inputActivation = Tensor_t(inputActivation.GetDeviceBuffer(),outputTensor.GetShape() ,GetTensorLayout(), 0, 0);
442
443 Tensor_t & activationGradients = L->GetActivationGradients();
444 activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),outputTensor.GetShape(), GetTensorLayout(), 0, 0);
445
446 Tensor_t & weightGradients = L->GetWeightGradientsAt(0);
447 weightGradients = Tensor_t(weightGradients.GetDeviceBuffer(), filters.GetShape(), GetTensorLayout(), 0, 0);
448
449 Tensor_t & biasGradients = L->GetBiasGradientsAt(0);
450 biasGradients = Tensor_t(biasGradients.GetDeviceBuffer(), biases.GetShape(), GetTensorLayout(), 0, 0);
451
452
453 // FIXME: Use descriptors instead (Tensor device memory is otherwise allocated during initialization)
454 // Tensor_t inputTensor ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
455 cudnnTensorDescriptor_t inputTensorDescriptor;
456 CUDNNCHECK(cudnnCreateTensorDescriptor(&inputTensorDescriptor) );
457 CUDNNCHECK(cudnnSetTensor4dDescriptor(inputTensorDescriptor,
458 CUDNN_TENSOR_NCHW,// Layout of the tensor in memory
459 Tensor_t::GetDataType(),
460 (int)L->GetBatchSize(),
461 (int)L->GetInputDepth(),
462 (int)L->GetInputHeight(),
463 (int)L->GetInputWidth() ) );
464
465
466 // size_t outputHeight = ConvLayer_t::calculateDimension(L->GetInputHeight(), L->GetFilterHeight(), L->GetPaddingHeight(), L->GetStrideRows());
467 // size_t outputWidth = ConvLayer_t::calculateDimension(L->GetInputWidth(), L->GetFilterWidth(), L->GetPaddingWidth(), L->GetStrideCols());
468 //Tensor_t outputTensor ({L->GetBatchSize(), L->GetDepth(), outputHeight, outputWidth}, MemoryLayout::RowMajor, 0, 0);
469
470 // Get access to cudnn library handle, which is static for the CudaTensor class
471 cudnnHandle_t cudnnHandle = outputTensor.GetCudnnHandle();
472
473 // cuDNN decides which algorithm to use
474#if (CUDNN_VERSION >= 8000)
475 /**
476 * I'm sure there may be a faster way, but this works
477 */
478 int convRequestedAlgoCount{0}; // requestedAlgoCount is setting how many algorithms to try
479 CUDNNCHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount)) // ask cuDNN how much it can try
480
481 int algoCount;
482 cudnnConvolutionFwdAlgoPerf_t convFwdPerfResults[convRequestedAlgoCount]; // this will store metrics to choose convolution algorithm
483 CUDNNCHECK(
484 cudnnFindConvolutionForwardAlgorithm(
485 cudnnHandle,
486 inputTensorDescriptor,
487 convDescriptors->WeightsDescriptor,
488 convDescriptors->LayerDescriptor,
489 outputTensor.GetTensorDescriptor(),
490 convRequestedAlgoCount,
491 &algoCount,
492 convFwdPerfResults
493 )
494 );
495 // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
496 // i.e.
497 // create an input tensor before the inputTensorDescriptor
498 // and get the descriptor from there
499 // Tensor_t inputTensor({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
500 // CUDNNCHECK(cudnnFindConvolutionForwardAlgorithmEx(
501 // cudnnHandle,
502 // inputTensor.GetTensorDescriptor(),
503 // &inputTensor,
504 // convDescriptors->WeightsDescriptor,
505 // &filters,
506 // convDescriptors->LayerDescriptor,
507 // outputTensor.GetTensorDescriptor(),
508 // &outputTensor,
509 // convRequestedAlgoCount,
510 // &algoCount,
511 // &convFwdPerfResults,
512 // &convWorkspace,
513 // memLimit)); // use memLimit for workspace size
514 // instead choose either fastest or lowest memory algo as per preference
515 LocalPerf fwdPerfResults{convFwdPerfResults};
516 convWorkspace->AlgorithmForward = convFwdPerfResults[fwdPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
517#else
518 CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
519 cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
520 outputTensor.GetTensorDescriptor(), preferenceFwd,
521 memLimit, // Memory limit in bytes for mode CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
522 &convWorkspace->AlgorithmForward));
523#endif
524
525 // Allocate memory for the convolution
526 //size_t workSpaceSizeInBytes = 0;
527
528 std::cout << "CONV FWD Algo used for convolution of input shape { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
529 <<L->GetInputHeight() << " , " << L->GetInputWidth() << " } is "
530 << convWorkspace->AlgorithmForward << std::endl;
531
532 // convWorkspace->AlgorithmForward = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
533 // convWorkspace->AlgorithmForward = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
534 // convWorkspace->AlgorithmForward = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
535 // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
536 if (CNNOptions::ConvFwdAlgorithm > 0) {
537 convWorkspace->AlgorithmForward = (cudnnConvolutionFwdAlgo_t) CNNOptions::ConvFwdAlgorithm;
538 std::cout << " but force using " << convWorkspace->AlgorithmForward << std::endl;
539 }
540
541
542 cudnnMathType_t math_type = CUDNN_TENSOR_OP_MATH; // : CUDNN_DEFAULT_MATH);
543 // if using tensor math (cudnn version > 7)
544 CUDNNCHECK(cudnnSetConvolutionMathType(convDescriptors->LayerDescriptor, math_type));
545
546 CUDNNCHECK(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
547 inputTensorDescriptor,
548 convDescriptors->WeightsDescriptor,
549 convDescriptors->LayerDescriptor,
550 outputTensor.GetTensorDescriptor(),
551 convWorkspace->AlgorithmForward,
552 &convWorkspace->ForwardWorkspaceSize));
553
554 if (convWorkspace->ForwardWorkspaceSize) cudaMalloc(&convWorkspace->ForwardWorkspace, convWorkspace->ForwardWorkspaceSize*sizeof(AFloat));
555 if (convWorkspace->ForwardWorkspaceSize > 0 && convWorkspace->ForwardWorkspace == nullptr ) {
556 std::cerr << "Error allocating FWD CONV workspace of size " << convWorkspace->ForwardWorkspaceSize << " - probably running out of memory on the GPU"
557 << std::endl;
558 std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
559 <<L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
560 //inputTensor.PrintShape("inputTensor");
561 R__ASSERT(false);
562 }
563 //
564 // Backward Algorithm
565 //
566
567 //Tensor_t activationGradients ({L->GetBatchSize(), L->GetDepth(), outputHeight, outputWidth}, MemoryLayout::RowMajor, 0, 0);
568 //Tensor_t activationGradientsBackward ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
569 cudnnTensorDescriptor_t activationGradientsBackwardDescriptor = inputTensorDescriptor;
570
571 cudnnHandle = activationGradients.GetCudnnHandle();
572 // dx : Activation gradient to be computed -> activationGradients [in place op]
573 // dy : Gradient of activation from the following layer (backpropagation)-> activationGradients
574
575#if (CUDNN_VERSION >= 8000)
576 /**
577 * I'm sure there may be a faster way, but this works
578 */
579 CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount)) // ask cuDNN how much it can try
580 cudnnConvolutionBwdDataAlgoPerf_t convBwdDataPerfResults[convRequestedAlgoCount]; // this will store metrics to choose convolution algorithm
581 CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(
582 cudnnHandle,
583 convDescriptors->WeightsDescriptor,
584 activationGradients.GetTensorDescriptor(),
585 convDescriptors->LayerDescriptor,
586 activationGradientsBackwardDescriptor,
587 convRequestedAlgoCount,
588 &algoCount,
589 convBwdDataPerfResults));
590 // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
591 // i.e.
592 // CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
593 // cudnnHandle,
594 // convDescriptors->WeightsDescriptor,
595 // &filters,
596 // activationGradients.GetTensorDescriptor(),
597 // &activationGradients,
598 // convDescriptors->LayerDescriptor,
599 // activationGradientsBackwardDescriptor,
600 // &inputTensor,
601 // convRequestedAlgoCount,
602 // &algoCount,
603 // &convBwdDataPerfResults,
604 // &convWorkspace,
605 // memLimit)); // use memLimit for workspace size
606 // instead choose either fastest or lowest memory algo as per preference
607 LocalPerf bwdDataPerfResults{convBwdDataPerfResults};
608 convWorkspace->AlgorithmBackward = convBwdDataPerfResults[bwdDataPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
609#else
610 CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
611 convDescriptors->WeightsDescriptor,
612 activationGradients.GetTensorDescriptor(),
613 convDescriptors->LayerDescriptor,
614 activationGradientsBackwardDescriptor,
615 preferenceBwdData, memLimit,
616 &convWorkspace->AlgorithmBackward));
617#endif
618
619 std::cout << "CONV BWD Data Algo used is " << convWorkspace->AlgorithmBackward << std::endl;
620 //CUDNNCHECK(cudnnSetConvolutionMathType(convDescriptors->LayerDescriptor, CUDNN_TENSOR_OP_MATH));
621
622
623 if (CNNOptions::ConvBwdDataAlgorithm > 0) {
624 convWorkspace->AlgorithmBackward = (cudnnConvolutionBwdDataAlgo_t)CNNOptions::ConvBwdDataAlgorithm;
625 std::cout << " but force using " << convWorkspace->AlgorithmBackward << std::endl;
626 }
627
628 CUDNNCHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
629 convDescriptors->WeightsDescriptor,
630 activationGradients.GetTensorDescriptor(),
631 convDescriptors->LayerDescriptor,
632 activationGradientsBackwardDescriptor,
633 convWorkspace->AlgorithmBackward,
634 &convWorkspace->BackwardWorkspaceSize));
635
636 if (convWorkspace->BackwardWorkspaceSize) cudaMalloc(&convWorkspace->BackwardWorkspace, convWorkspace->BackwardWorkspaceSize*sizeof(AFloat));
637 if (convWorkspace->BackwardWorkspaceSize > 0 && convWorkspace->BackwardWorkspace == nullptr ) {
638 std::cerr << "Error allocating BACKW DATA CONV workspace of size " << convWorkspace->BackwardWorkspaceSize << " - probably running out of memory on the GPU"
639 << std::endl;
640 std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
641 <<L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
642 //inputTensor.PrintShape("inputTensor");
643 R__ASSERT(false);
644 }
645 // Filter gradient
646 //Tensor_t activationBackward ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
647 // here should be able to use inputTensorDescriptor
648 cudnnTensorDescriptor_t activationBackwardDescriptor = inputTensorDescriptor;
649
650#if (CUDNN_VERSION >= 8000)
651 /**
652 * I'm sure there may be a faster way, but this works
653 */
654 CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount)) // ask cuDNN how much it can try
655 cudnnConvolutionBwdFilterAlgoPerf_t convBwdFilterPerfResults[convRequestedAlgoCount]; // this will store metrics to choose convolution algorithm
656 CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
657 cudnnHandle,
658 activationBackwardDescriptor,
659 activationGradients.GetTensorDescriptor(),
660 convDescriptors->LayerDescriptor,
661 convDescriptors->WeightsDescriptor,
662 convRequestedAlgoCount,
663 &algoCount,
664 convBwdFilterPerfResults));
665 // we could also do it with the expert mode (cudnnFindConvolutionForwardAlgorithmEx),
666 // i.e.
667 // CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
668 // cudnnHandle,
669 // activationBackwardDescriptor,
670 // &inputTensor,
671 // activationGradients.GetTensorDescriptor(),
672 // &activationGradients,
673 // convDescriptors->LayerDescriptor,
674 // convDescriptors->WeightsDescriptor,
675 // &filters,
676 // convRequestedAlgoCount,
677 // &algoCount,
678 // &convBwdFilterPerfResults,
679 // &convWorkspace,
680 // memLimit)); // use memLimit for workspace size
681 // instead choose either fastest or lowest memory algo as per preference
682 LocalPerf bwdFilterPerfResults{convBwdFilterPerfResults};
683 convWorkspace->HelperAlgorithm = convBwdFilterPerfResults[bwdFilterPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
684#else
685 CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
686 activationBackwardDescriptor,
687 activationGradients.GetTensorDescriptor(),
688 convDescriptors->LayerDescriptor,
689 convDescriptors->WeightsDescriptor,
690 preferenceBwdFilter,
691 memLimit,
692 &convWorkspace->HelperAlgorithm));
693#endif
694
695 std::cout << "CONV BWD Filter Algo used is " << convWorkspace->HelperAlgorithm << std::endl;
696
697 if (CNNOptions::ConvBwdFilterAlgorithm > 0) {
698 convWorkspace->HelperAlgorithm = (cudnnConvolutionBwdFilterAlgo_t)CNNOptions::ConvBwdFilterAlgorithm;
699 std::cout << " but force using " << convWorkspace->HelperAlgorithm << std::endl;
700 }
701
702 //CUDNNCHECK(cudnnSetConvolutionMathType(convDescriptors->LayerDescriptor, CUDNN_TENSOR_OP_MATH));
703
704 CUDNNCHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
705 cudnnHandle, activationBackwardDescriptor, activationGradients.GetTensorDescriptor(),
706 convDescriptors->LayerDescriptor, convDescriptors->WeightsDescriptor, convWorkspace->HelperAlgorithm,
707 &convWorkspace->HelperWorkspaceSize));
708
709 if (convWorkspace->HelperWorkspaceSize)
710 cudaMalloc(&convWorkspace->HelperWorkspace, convWorkspace->HelperWorkspaceSize * sizeof(AFloat));
711
712 if (convWorkspace->HelperWorkspaceSize > 0 && convWorkspace->HelperWorkspace == nullptr) {
713 std::cerr << "Error allocating BACKW FILTER CONV workspace of size " << convWorkspace->BackwardWorkspaceSize
714 << " - probably running out of memory on the GPU" << std::endl;
715 std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
716 << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
717 filters.PrintShape("filterTensor");
718 R__ASSERT(false);
719 }
720
721 /// allocate workspace and descriptor for reduction operation
722 // used to compiute bias gradients
723 // try reducing the tensor
724
725 CUDNNCHECK(cudnnCreateReduceTensorDescriptor(&convWorkspace->fReduceTensorDesc));
726
727 auto reduceTensorDesc = convWorkspace->fReduceTensorDesc;
728 CUDNNCHECK(cudnnSetReduceTensorDescriptor(reduceTensorDesc, CUDNN_REDUCE_TENSOR_ADD, Tensor_t::GetDataType(),
729 CUDNN_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
730
731 CUDNNCHECK(cudnnGetReductionWorkspaceSize(cudnnHandle, reduceTensorDesc, activationGradients.GetTensorDescriptor(),
732 biasGradients.GetTensorDescriptor(),
733 &convWorkspace->fReductionWorkspaceSize));
734 if (convWorkspace->fReductionWorkspaceSize > 0)
735 cudaMalloc(&convWorkspace->fReductionWorkspace, convWorkspace->fReductionWorkspaceSize);
736
737
738 // size_t isizeInBytes;
739 // void *iSpace = nullptr;
740 // CUDNNCHECK(cudnnGetReductionIndicesSize(cudnnHandle, reduceTensorDesc, activationGradients.GetTensorDescriptor(),
741 // biasGradients.GetTensorDescriptor(), &isizeInBytes));
742
743 // if (isizeInBytes > 0)
744 // cudaMalloc(&convWorkspace->fIndiceWorkspace, isizeInBytes);
745
746 workspace = convWorkspace;
747
748 CUDNNCHECK(cudnnDestroyTensorDescriptor(inputTensorDescriptor));
749}
750
751
752//____________________________________________________________________________
753template <typename AFloat>
754void TCudnn<AFloat>::InitializePoolDropoutWorkspace(TWorkspace * & workspace,
755 TDescriptors * & descriptors,
756 const DNN::CNN::TConvParams & /*params*/,
757 PoolingLayer_t *L) {
758
759 auto poolWorkspace = new PoolingWorkspace_t ();
760 auto poolDescriptors = static_cast<PoolingDescriptors_t *>(descriptors);
761
762 //Tensor_t inputTensor ({L->GetBatchSize(), L->GetInputDepth(), L->GetInputHeight(), L->GetInputWidth()}, MemoryLayout::RowMajor, 0, 0);
763 cudnnHandle_t cudnnHandle = L->GetOutput().GetCudnnHandle();
764
765 // create tensor descriptors
766 cudnnTensorDescriptor_t inputTensorDescriptor;
767 CUDNNCHECK(cudnnCreateTensorDescriptor(&inputTensorDescriptor) );
768 CUDNNCHECK(cudnnSetTensor4dDescriptor(inputTensorDescriptor,
769 CUDNN_TENSOR_NCHW,// Layout of the tensor in memory
770 Tensor_t::GetDataType(),
771 (int)L->GetBatchSize(),
772 (int)L->GetInputDepth(),
773 (int)L->GetInputHeight(),
774 (int)L->GetInputWidth() ) );
775
776
777 // Space needed to execute forward and backward dropout pass
778 CUDNNCHECK(cudnnDropoutGetReserveSpaceSize(inputTensorDescriptor,
779 &poolWorkspace->HelperWorkspaceSize));
780
781 if (poolWorkspace->HelperWorkspaceSize) {
782 cudaMalloc(&poolWorkspace->HelperWorkspace, poolWorkspace->HelperWorkspaceSize * sizeof(AFloat));
783 if (poolWorkspace->HelperWorkspace == nullptr) {
784 std::cerr << "Error allocating POOL reserved droput workspace of size " << poolWorkspace->HelperWorkspaceSize
785 << " probably running out of memory on the GPU"
786 << std::endl;
787 std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
788 << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
789 R__ASSERT(false);
790 }
791 }
792
793 // Space that contain random pass
794 CUDNNCHECK(cudnnDropoutGetStatesSize(cudnnHandle,
795 &poolWorkspace->ForwardWorkspaceSize));
796
797 if (poolWorkspace->ForwardWorkspaceSize) {
798 cudaMalloc(&poolWorkspace->ForwardWorkspace, poolWorkspace->ForwardWorkspaceSize * sizeof(AFloat));
799 if (poolWorkspace->ForwardWorkspace == nullptr) {
800 std::cerr << "Error allocating POOL droput state of size " << poolWorkspace->ForwardWorkspaceSize <<
801 " probably running out of memory on the GPU" << std::endl;
802 std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
803 << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
804 R__ASSERT(false);
805 }
806 }
807
808 // Fill the dropout workspace with random numbers and copy to device
809 TRandom & rand = TCudnn<AFloat>::GetRandomGenerator();
810 // create a 64 bit seed using 2 32 bits integers
811 unsigned long long seed = (unsigned long long) rand.Integer(UINT_MAX) << 32 + rand.Integer(UINT_MAX);
812 // Reset the descriptor at every forward pass, so that random states get newly initialized?
813 CUDNNCHECK(cudnnSetDropoutDescriptor(poolDescriptors->HelperDescriptor,
814 cudnnHandle,
815 L->GetDropoutProbability(),
816 poolWorkspace->ForwardWorkspace,
817 poolWorkspace->ForwardWorkspaceSize,
818 seed));
819
820 workspace = poolWorkspace;
821
822 CUDNNCHECK(cudnnDestroyTensorDescriptor(inputTensorDescriptor));
823}
824
825//____________________________________________________________________________
826template <typename AFloat>
827void TCudnn<AFloat>::FreeConvWorkspace(TWorkspace * workspace) {
828 if (!workspace) return;
829 auto convWorkspace = static_cast<ConvWorkspace_t *>(workspace);
830
831 if(convWorkspace->ForwardWorkspace) cudaFree(convWorkspace->ForwardWorkspace);
832 if(convWorkspace->BackwardWorkspace) cudaFree(convWorkspace->BackwardWorkspace);
833 if(convWorkspace->HelperWorkspace) cudaFree(convWorkspace->HelperWorkspace);
834
835 CUDNNCHECK(cudnnDestroyReduceTensorDescriptor(convWorkspace->fReduceTensorDesc));
836
837 if (convWorkspace->fReductionWorkspace)
838 cudaFree(convWorkspace->fReductionWorkspace);
839
840}
841
842//____________________________________________________________________________
843template <typename AFloat>
844void TCudnn<AFloat>::FreePoolDropoutWorkspace(TWorkspace * workspace) {
845 if (!workspace) return;
846 auto poolWorkspace = static_cast<PoolingWorkspace_t *>(workspace);
847
848 if(poolWorkspace->ForwardWorkspace) cudaFree(poolWorkspace->ForwardWorkspace);
849 if(poolWorkspace->BackwardWorkspace) cudaFree(poolWorkspace->BackwardWorkspace);
850 if(poolWorkspace->HelperWorkspace) cudaFree(poolWorkspace->HelperWorkspace);
851}
852
853//____________________________________________________________________________
854template <typename AFloat>
855void TCudnn<AFloat>::BatchNormLayerForwardTraining(int axis, const Tensor_t &x,
856 Tensor_t & y,
857 Matrix_t &gamma, Matrix_t &beta,
858 Matrix_t & mean, Matrix_t &, Matrix_t & iVariance,
859 Matrix_t & runningMeans, Matrix_t & runningVars,
860 Scalar_t nTrainedBatches, Scalar_t momentum, Scalar_t epsilon,
861 const TensorDescriptor_t & bnParDescriptor )
862
863{
864 AFloat a = 1.0;
865 AFloat b = 0.0;
866 //cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_PER_ACTIVATION;
867 //if (axis == 1) bnMode = CUDNN_BATCHNORM_SPATIAL;
868 cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
869
870 //x.PrintShape("x");
871 //y.PrintShape("y");
872
873 // the factor is defined in Cudnn as 1-momentum
874 double exponentialAverageFactor = (momentum < 0.) ? 1. / (1 + nTrainedBatches) : 1. - momentum;
875 CUDNNCHECK(cudnnBatchNormalizationForwardTraining(x.GetCudnnHandle(), bnMode,
876 &a, &b,
877 x.GetTensorDescriptor(), x.GetDataPointer(),
878 y.GetTensorDescriptor(), y.GetDataPointer(),
879 bnParDescriptor,
880 gamma.GetDataPointer(), beta.GetDataPointer(),
881 exponentialAverageFactor,
882 runningMeans.GetDataPointer(),
883 runningVars.GetDataPointer(),
884 epsilon, mean.GetDataPointer(), iVariance.GetDataPointer() ) );
885
886}
887
888//____________________________________________________________________________
889template <typename AFloat>
890void TCudnn<AFloat>::BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
891 Tensor_t &y, const Matrix_t &runningMeans,
892 const Matrix_t &runningVars, Scalar_t epsilon,
893 const TensorDescriptor_t & bnParDescriptor)
894
895{
896 AFloat a = 1.0;
897 AFloat b = 0.0;
898
899 cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
900
901
902 CUDNNCHECK(cudnnBatchNormalizationForwardInference(x.GetCudnnHandle(), bnMode,
903 &a, &b,
904 x.GetTensorDescriptor(), x.GetDataPointer(),// pass y as descriptor
905 y.GetTensorDescriptor(), y.GetDataPointer(),
906 bnParDescriptor,
907 gamma.GetDataPointer(), beta.GetDataPointer(),
908 runningMeans.GetDataPointer(),
909 runningVars.GetDataPointer(),
910 epsilon) );
911
912}
913
914//____________________________________________________________________________
915template <typename AFloat>
916void TCudnn<AFloat>::BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
917 Matrix_t &gamma, // Matrix_t &beta, (not needed)
918 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean,
919 const Matrix_t &variance, const Matrix_t &iVariance,
920 Scalar_t epsilon, const TensorDescriptor_t & bnParDescriptor)
921{
922 AFloat a = 1.0;
923 AFloat b = 0.0;
924 cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
925
926 CUDNNCHECK(cudnnBatchNormalizationBackward(x.GetCudnnHandle(), bnMode,
927 &a, &b, &a, &b,
928 x.GetTensorDescriptor(), x.GetDataPointer(),
929 dy.GetTensorDescriptor(), dy.GetDataPointer(),
930 dx.GetTensorDescriptor(), dx.GetDataPointer(),
931 bnParDescriptor, gamma.GetDataPointer(),
932 dgamma.GetDataPointer(), dbeta.GetDataPointer(),
933 epsilon, mean.GetDataPointer(), iVariance.GetDataPointer() ) );
934
935}
936
937
938template <typename AFloat>
939void TCudnn<AFloat>::ConvLayerForward(Tensor_t & outputTensor,
940 Tensor_t & inputActivation,
941 const Tensor_t & input,
942 const Matrix_t & weights, const Matrix_t & biases,
943 const DNN::CNN::TConvParams & params,
944 EActivationFunction activFunc,
945 Tensor_t & inputPrime,
946 const ConvDescriptors_t & descriptors,
947 ConvWorkspace_t & workspace)
948// const AFloat alpha,
949// const AFloat beta)
950{
951 //((Tensor_t & )input).Reshape( {params.batchSize, params.inputDepth, params.inputHeight, params.inputWidth});
952
953 assert( input.GetLayout() == GetTensorLayout());
954
955 //size_t outputHeight = DNN::CNN::TConvLayer<TCudnn<AFloat>>::calculateDimension(params.inputHeight, params.filterHeight, params.paddingHeight, params.strideRows);
956 //size_t outputWidth = DNN::CNN::TConvLayer<TCudnn<AFloat>>::calculateDimension(params.inputWidth, params.filterWidth, params.paddingWidth, params.strideCols);
957
958 // PrintTensor(input,"input");
959 // PrintTensor(outputTensor,"output");
960 // PrintTensor(weights,"weights");
961 // PrintTensor(biases,"biases");
962 //((Tensor_t & )weights).Reshape( { params.numberFilters, params.inputDepth, params.filterHeight, params.filterWidth } );
963 //((Tensor_t & )biases).Reshape( { 1,params.numberFilters, 1, 1});
964 //biases.Reshape ( { 1,params.numberFilters, 1, 1});
965
966 AFloat alpha = 1.0;
967 AFloat beta = 0.0;
968 cudnnHandle_t cudnnHandle = input.GetCudnnHandle();
969
970 // check descriptors
971#ifndef NDEBUG
972 int n,c,h,w = 0;
973 int s1,s2,s3,s4 = 0;
974 cudnnDataType_t dataType;
975 cudnnGetTensor4dDescriptor( input.GetTensorDescriptor(), &dataType,&n,&c,&h,&w,&s1,&s2,&s3,&s4 );
976 std::vector<size_t> shape_input = {size_t(n), size_t(c) , size_t(h), size_t(w) };
977 assert (shape_input == input.GetShape());
978
979 cudnnGetTensor4dDescriptor( outputTensor.GetTensorDescriptor(), &dataType,&n,&c,&h,&w,&s1,&s2,&s3,&s4 );
980 std::vector<size_t> shape_output = {size_t(n), size_t(c) , size_t(h), size_t(w) };
981 assert (shape_output == outputTensor.GetShape());
982#endif
983
984
985 // Perform convolution
986 cudnnStatus_t status = cudnnConvolutionForward(cudnnHandle,
987 &alpha,
988 input.GetTensorDescriptor(),
989 input.GetDataPointer(),
990 descriptors.WeightsDescriptor,
991 weights.GetDataPointer(),
992 descriptors.LayerDescriptor,
993 workspace.AlgorithmForward,
994 workspace.ForwardWorkspace,
995 workspace.ForwardWorkspaceSize,
996 &beta,
997 inputActivation.GetTensorDescriptor(),
998 inputActivation.GetDataPointer());
999
1000 // Apply biases
1001 assert(status == CUDNN_STATUS_SUCCESS);
1002 CUDNNCHECK(status);
1003
1004 //PrintTensor(biases,"biases");
1005 //PrintTensor(outputTensor,"tensor before biases");
1006 AddConvBiases(inputActivation, biases);
1007
1008 //PrintTensor(outputTensor,"tensor after biases");
1009
1010 // Store the conv output before application of activation to use in the backward pass
1011 //TCudnn<AFloat>::Copy(outputTensor,inputActivation);
1012
1013 // Apply activation
1014 TCudnn<AFloat>::ActivationFunctionForward(outputTensor, inputActivation, activFunc, descriptors.HelperDescriptor, 0.0, 1.0, 0.0);
1015
1016
1017 //perform convolution + biases + activation in one single call
1018 // could use an extra vector z but here we just pass a dummy tensor
1019 // AFloat alpha2 = 0;
1020 // CUDNNCHECK(cudnnConvolutionBiasActivationForward(cudnnHandle,
1021 // &alpha,
1022 // input.GetTensorDescriptor(),
1023 // input.GetDataPointer(),
1024 // descriptors.WeightsDescriptor,
1025 // weights.GetDataPointer(),
1026 // descriptors.LayerDescriptor,
1027 // workspace.AlgorithmForward,
1028 // workspace.ForwardWorkspace,
1029 // workspace.ForwardWorkspaceSize,
1030 // &alpha2,
1031 // outputTensor.GetTensorDescriptor(), // z : not used
1032 // outputTensor.GetDataPointer(),
1033 // biases.GetDescriptor(),
1034 // biases.GetDataPointer(),
1035 // descriptors.HelperDescriptor,
1036 // outputTensor.GetTensorDescriptor(),
1037 // outputTensor.GetDataPointer()));
1038
1039 //TCudnn<AFloat>::PrintTensor(outputTensor, "after activation");
1040
1041}
1042
1043//____________________________________________________________________________
1044template<typename AFloat>
1045void TCudnn<AFloat>::ConvLayerBackward(Tensor_t &activationGradientsBackward,
1046 Matrix_t &weightGradients, Matrix_t &biasGradients,
1047 Tensor_t &inputActivation,
1048 Tensor_t &activationGradients,
1049 const Matrix_t &weights,
1050 const Tensor_t &activationBackward,
1051 const Tensor_t &outputTensor,
1052 EActivationFunction activFunc,
1053 const ConvDescriptors_t & descriptors,
1054 ConvWorkspace_t & workspace,
1055 size_t /*batchSize*/, size_t /*inputHeight*/,
1056 size_t /*inputWidth*/, size_t /*depth*/,
1057 size_t /*height*/, size_t /*width*/,
1058 size_t /*filterDepth*/, size_t /*filterHeight*/,
1059 size_t /*filterWidth*/, size_t /*nLocalViews*/)
1060{
1061 // activationGradients.Reshape( outputTensor.GetShape());
1062 // weightGradients.Reshape( weights.GetShape());
1063 // biasGradients.Reshape({ 1, outputTensor.GetShape()[1], 1, 1}); // second output dimension is number of filters
1064 // // activationGradientsBackward.Reshape()
1065 // activationBackward.Reshape
1066
1067 //--------------------------------------------------------------------------
1068 // Activation function gradient
1069 //--------------------------------------------------------------------------
1070
1071 // x : Output of previous layer without activation function -> inputActivation
1072 // dx : Activation gradient to be computed -> activationGradients [in place op]
1073 // y : Ouput of this layer (activation applied) -> outputTensor
1074 // dy : Gradient of activation from the following layer (backpropagation)-> activationGradients
1075
1076 //if (descriptors.HelperDescriptor)
1077 ActivationFunctionBackward(activationGradients, outputTensor, activationGradients, inputActivation,
1078 activFunc, descriptors.HelperDescriptor); //y dy x dx
1079
1080 //--------------------------------------------------------------------------
1081 // Network Activation gradient
1082 //--------------------------------------------------------------------------
1083 const AFloat alpha = 1.0;
1084 const AFloat beta = 0.0;
1085
1086 cudnnHandle_t cudnnHandle = outputTensor.GetCudnnHandle();
1087
1088 //cudnnMathType_t math_type = CUDNN_TENSOR_OP_MATH; // : CUDNN_DEFAULT_MATH);
1089 // if using tensor math (cudnn version > 7)
1090 //CUDNNCHECK(cudnnSetConvolutionMathType(descriptors.LayerDescriptor, math_type));
1091
1092 // do not compute activation gradients for first layer (i.e. when input activationGradientBackward is a dummy empty tensor)
1093 if (activationGradientsBackward.GetSize() > 0)
1094 CUDNNCHECK(cudnnConvolutionBackwardData(cudnnHandle,
1095 &alpha,
1096 descriptors.WeightsDescriptor,
1097 weights.GetDataPointer(),
1098 activationGradients.GetTensorDescriptor(),
1099 activationGradients.GetDataPointer(),
1100 descriptors.LayerDescriptor,
1101 workspace.AlgorithmBackward,
1102 workspace.BackwardWorkspace,
1103 workspace.BackwardWorkspaceSize,
1104 &beta,
1105 activationGradientsBackward.GetTensorDescriptor(),
1106 activationGradientsBackward.GetDataPointer()));
1107
1108 //--------------------------------------------------------------------------
1109 // Filter gradient
1110 //--------------------------------------------------------------------------
1111
1112 //CUDNNCHECK(cudnnSetConvolutionMathType(descriptors.LayerDescriptor, CUDNN_TENSOR_OP_MATH));
1113
1114 CUDNNCHECK(cudnnConvolutionBackwardFilter(
1115 cudnnHandle, &alpha, activationBackward.GetTensorDescriptor(), activationBackward.GetDataPointer(),
1116 activationGradients.GetTensorDescriptor(), activationGradients.GetDataPointer(), descriptors.LayerDescriptor,
1117 workspace.HelperAlgorithm, workspace.HelperWorkspace, workspace.HelperWorkspaceSize, &beta,
1118 descriptors.WeightsDescriptor, weightGradients.GetDataPointer()));
1119
1120 //--------------------------------------------------------------------------
1121 // Bias gradient
1122 //--------------------------------------------------------------------------
1123#if 0
1124 CUDNNCHECK(cudnnConvolutionBackwardBias(cudnnHandle, &alpha, activationGradients.GetTensorDescriptor(),
1125 activationGradients.GetDataPointer(), &beta,
1126 biasGradients.GetTensorDescriptor(), biasGradients.GetDataPointer()));
1127
1128 //PrintTensor(biasGradients,"computed gradient biases");
1129#endif
1130#if 0
1131// try trandforming the activation gradient tensor and reduce it
1132 auto shape = activationGradients.GetShape();
1133 Tensor_t actGradTransf({shape[1], shape[0], shape[2], shape[3]}, activationGradients.GetLayout());
1134 CUDNNCHECK(cudnnTransformTensor(cudnnHandle, &alpha, activationGradients.GetTensorDescriptor(),
1135 activationGradients.GetDataPointer(), &beta, actGradTransf.GetTensorDescriptor(),
1136 actGradTransf.GetDataPointer()));
1137 // now make the operation
1138 TCudaMatrix<AFloat> actGradMatrix(actGradTransf.GetDeviceBuffer(), shape[0] * shape[2] * shape[3], shape[1]);
1139 TCudaMatrix<AFloat> temp(biasGradients.GetDeviceBuffer(), biasGradients.GetShape()[1], 1);
1140 TCuda<AFloat>::SumColumns(temp, actGradMatrix);
1141#endif
1142
1143 // to compute the bias gradients is more efficient to reduce the activation gradient tensor in B,H,W dimensions and
1144 // adding their elements.
1145 // we create the descriptor for reduction and necessary workspaces in the initialization workspace function for the
1146 // convolution layer. Note that the indices workspace is not needed in case of addition operation
1147
1148 CUDNNCHECK(cudnnReduceTensor(cudnnHandle, workspace.fReduceTensorDesc, nullptr, 0, workspace.fReductionWorkspace,
1149 workspace.fReductionWorkspaceSize, &alpha, activationGradients.GetTensorDescriptor(),
1150 activationGradients.GetDataPointer(), &beta, biasGradients.GetTensorDescriptor(),
1151 biasGradients.GetDataPointer()));
1152
1153
1154#if 0 // this is very slow for large batches
1155 biasGradients.Zero();
1156 TCudaMatrix<AFloat> temp(biasGradients.GetShape()[1], 1);
1157 TCudaMatrix<AFloat> biasGradMatrix(biasGradients.GetDeviceBuffer(), biasGradients.GetShape()[1], 1);
1158 size_t batchSize = activationGradients.GetFirstSize();
1159 for (size_t event = 0; event < batchSize; event++) {
1160 TCudaTensor<AFloat> actGrad = activationGradients.At(event); // remember tihs is a rowwise tensor
1161 TCudaMatrix<AFloat> actGradMatrix(actGrad.GetDeviceBuffer(),
1162 activationGradients.GetShape()[2] * activationGradients.GetShape()[3],
1163 activationGradients.GetShape()[0]);
1164 TCuda<AFloat>::SumColumns(temp, actGradMatrix);
1165 TCuda<AFloat>::ScaleAdd(biasGradMatrix, temp);
1166 }
1167#endif
1168}
1169
1170//____________________________________________________________________________
1171template<typename AFloat>
1172void TCudnn<AFloat>::AddConvBiases(Tensor_t &output,
1173 const Tensor_t &biases)
1174{
1175 TCudnn<AFloat>::ScaleAdd(output, biases);
1176}
1177
1178
1179//____________________________________________________________________________
1180//////////////////////////////////////////////////////////////////////////////////////////////
1181/// \brief Downsampling function used as the forward propagation step of a
1182/// Max-Pooling layer.
1183///
1184/// \param[out] A The output matrix. Each row corresponds to a slice and each element
1185/// is the max within a receptive field.
1186/// \param[out] B The winning indices matrix. Each element is the index of the max element.
1187/// \param[in] C The input matrix. Each row is a slice.
1188/// \param[in] imgHeight The heigh of the input.
1189/// \param[in] imgWidth The output of the input.
1190/// \param[in] fltHeight Height of the kernel.
1191/// \param[in] fltWidth Width of the kernel.
1192/// \param[in] strideRows stride size in the horizontal dimension.
1193/// \param[in] strideCols stride size in the vertical dimension.
1194///
1195/// Each output element is the maximum of the receptive field. We also save the winning
1196/// indices to facilitate back-propagation - we need to know which input element influenced
1197/// the output and only apply the derivative correction to this particular element.
1198/// The slicing process is the same as in a convolutional layer, however padding is set to 0.
1199///////////////////////////////////////////////////////////////////////////////////////////////
1200template<typename AFloat>
1201void TCudnn<AFloat>::Downsample(Tensor_t &A,
1202 Tensor_t &/*B*/,
1203 const Tensor_t &C,
1204 const PoolingDescriptors_t & descriptors,
1205 PoolingWorkspace_t & workspace,
1206 size_t imgHeight,
1207 size_t imgWidth,
1208 size_t fltHeight,
1209 size_t fltWidth,
1210 size_t strideRows,
1211 size_t strideCols)
1212{
1213 const AFloat alpha = 1.0;
1214 const AFloat beta = 0.0;
1215
1216 CUDNNCHECK(cudnnPoolingForward(C.GetCudnnHandle(),
1217 descriptors.LayerDescriptor,
1218 &alpha,
1219 C.GetTensorDescriptor(),
1220 C.GetDataPointer(),
1221 &beta,
1222 A.GetTensorDescriptor(),
1223 A.GetDataPointer()));
1224}
1225
1226//____________________________________________________________________________
1227template<typename AFloat>
1228void TCudnn<AFloat>::MaxPoolLayerBackward(Tensor_t & activationGradientsBackward, //dx
1229 const Tensor_t & activationGradients, // dy
1230 const Tensor_t & indexMatrix,
1231 const Tensor_t & activationBackward, //X
1232 const Tensor_t & outputTensor, //Y
1233 const PoolingDescriptors_t & descriptors,
1234 PoolingWorkspace_t & workspace,
1235 size_t imgHeight,
1236 size_t imgWidth,
1237 size_t fltHeight,
1238 size_t fltWidth,
1239 size_t strideRows,
1240 size_t strideCols,
1241 size_t /* nLocalViews */)
1242{
1243 const AFloat alpha = 1.0;
1244 const AFloat beta = 0.0;
1245 // x : Output of previous layer without -> inputActivation
1246 // dx : Activation gradient to be computed -> activationGradientsBackward
1247 // y : Ouput of this layer (activation applied) -> outputTensor
1248 // dy : Gradient of activation from the following layer (backpropagation)-> activationGradients
1249 CUDNNCHECK(cudnnPoolingBackward(outputTensor.GetCudnnHandle(),
1250 descriptors.LayerDescriptor,
1251 &alpha,
1252 outputTensor.GetTensorDescriptor(),
1253 outputTensor.GetDataPointer(),
1254 activationGradients.GetTensorDescriptor(),
1255 activationGradients.GetDataPointer(),
1256 activationBackward.GetTensorDescriptor(),
1257 activationBackward.GetDataPointer(),
1258 &beta,
1259 activationGradientsBackward.GetTensorDescriptor(),
1260 activationGradientsBackward.GetDataPointer()));
1261}
1262
1263//____________________________________________________________________________
1264/*template<typename AFloat>
1265void TCudnn<AFloat>::Reshape(TCudaTensor<AFloat> &A, const TCudaTensor<AFloat> &B)
1266{
1267 dim3 blockDims = TDevice::BlockDims2D();
1268 dim3 gridDims = TDevice::GridDims2D(A);
1269 cudaStream_t s = A.GetComputeStream();
1270
1271 ::TMVA::DNN::Cuda::Reshape<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(),
1272 A.GetNrows(), A.GetNcols(), B.GetNrows(), B.GetNcols());
1273}*/
1274
1275//______________________________________________________________________________
1276/*template <typename AReal>
1277void TCudnn<AReal>::Rearrange(std::vector<TCudaTensor<AReal>> &out, const std::vector<TCudaTensor<AReal>> &in)
1278{
1279 // B x T x D out --- T x B x D in*/
1280 /*size_t B = out.size();
1281 size_t T = out[0].GetNrows();
1282 size_t D = out[0].GetNcols();
1283 if ((T != in.size()) || (B != in[0].GetNrows())
1284 || (D != in[0].GetNcols())) {
1285 std::cout << "Incompatible Dimensions\n"
1286 << in.size() << "x" << in[0].GetNrows() << "x" << in[0].GetNcols()
1287 << " --> " << B << "x" << T << "x" << D << "\n";
1288 return;
1289 }
1290 for (size_t i = 0; i < B; ++i) {
1291 for (size_t j = 0; j < T; ++j) {
1292 for (size_t k = 0; k < D; ++k) {
1293 out[i](j, k) = in[j](i, k);
1294 }
1295 }
1296 }
1297 return;
1298}*/
1299
1300//____________________________________________________________________________
1301////////////////////////////////////////////////////////////////////////////////
1302/// \brief Flatten a vector of matrices into a single matrix.
1303///
1304/// \param[out] A Output matrix.
1305/// \param[in] B Input vector. Each element is a matrix to be concatenated.
1306///
1307//////////////////////////////////////////////////////////////////////////////////
1308template<typename AFloat>
1309void TCudnn<AFloat>::Flatten(TCudaTensor<AFloat> &A,
1310 const TCudaTensor<AFloat> &B)
1311{
1313}
1314
1315//_________________________________________TCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix(); ____________________________
1316///////////////////////////////////////////TCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix(); //////////////////////////////
1317/// \brief Deflatten a matrix into a vectorTCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix(); rices.
1318///
1319/// \param[out] A Output matrices. Each eleTCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix(); ll be a part of the input.
1320/// \param[in] B Input flat matrix.
1321///
1322//////////////////////////////////////////////////////////////////////////////////
1323template<typename AFloat>
1324void TCudnn<AFloat>::Deflatten(TCudaTensor<AFloat> &A,
1325 const TCudaTensor<AFloat> &B)
1326{
1328}
1329
1330} // namespace DNN
1331} // namespace TMVA
#define b(i)
Definition RSha256.hxx:100
#define c(i)
Definition RSha256.hxx:101
#define a(i)
Definition RSha256.hxx:99
#define s1(x)
Definition RSha256.hxx:91
#define h(i)
Definition RSha256.hxx:106
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
const char * filters[]
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
static void Backward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients, const Tensor_t &df, const Tensor_t &activationGradients, const Matrix_t &weights, const Tensor_t &activationBackward)
Perform the complete backward propagation step.
static void Flatten(Tensor_t &A, const Tensor_t &B)
Flattens the tensor B, such that each matrix, is stretched in one row, resulting with a matrix A.
static void AddRowWise(Matrix_t &output, const Matrix_t &biases)
Add the vectors biases row-wise to the matrix output.
static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha=1.0, Scalar_t beta=0.)
Sum columns of (m x n) matrix A and write the results into the first m elements in A.
static void Deflatten(Tensor_t &A, const Tensor_t &B)
Transforms each row of B to a matrix and stores it in the tensor B.
static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights)
Matrix-multiply input with the transpose of weights and write the results into output.
static void ScaleAdd(Matrix_t &A, const Matrix_t &B, Scalar_t beta=1.0)
Adds a the elements in matrix B scaled by c to the elements in the matrix A.
This is the base class for the ROOT Random number generators.
Definition TRandom.h:27
virtual UInt_t Integer(UInt_t imax)
Returns a random integer uniformly distributed on the interval [ 0, imax-1 ].
Definition TRandom.cxx:361
double beta(double x, double y)
Calculates the beta function.
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
double gamma(double x)
RooArgList L(Args_t &&... args)
Definition RooArgList.h:156
EActivationFunction
Enum that represents layer activation functions.
Definition Functions.h:32
create variable transformations
constexpr Double_t C()
Velocity of light in .
Definition TMath.h:114
static void output()