// Matrix-multiply input with the transpose of weights and write the result into output.
template<typename AFloat>
void TCudnn<AFloat>::MultiplyTranspose(TCudaTensor<AFloat> & output,
                                       const TCudaTensor<AFloat> & input,
                                       const TCudaTensor<AFloat> & weights)
{
   // ...
}
// Add the vector of biases row-wise to the output matrix.
template<typename AFloat>
void TCudnn<AFloat>::AddRowWise(TCudaTensor<AFloat> & output,
                                const TCudaTensor<AFloat> & biases)
{
   // ...
}
// Perform the complete backward propagation step for a dense layer.
template<typename AFloat>
void TCudnn<AFloat>::Backward(TCudaTensor<AFloat> & activation_gradients_backward,
                              TCudaTensor<AFloat> & weight_gradients,
                              TCudaTensor<AFloat> & bias_gradients,
                              TCudaTensor<AFloat> & df,
                              const TCudaTensor<AFloat> & activation_gradients,
                              const TCudaTensor<AFloat> & weights,
                              const TCudaTensor<AFloat> & activation_backward)
{
   // ...
   TCudaMatrix<AFloat> weightGradMatrix = weight_gradients.GetMatrix();
   TCudaMatrix<AFloat> biasGradMatrix   = bias_gradients.GetMatrix();
   // ...
}
// Copy tensor A into tensor B on the device.
template<typename AFloat>
void TCudnn<AFloat>::Copy(Tensor_t & B, const Tensor_t & A)
{
   size_t nElements = A.GetSize();
   // ...
   cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
                   nElements * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0);
}
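// The copy above runs asynchronously on the legacy default stream (the last
// argument, 0), so it is implicitly ordered with all other default-stream work.
// A minimal sketch of the same device-to-device copy on an explicit stream
// (the stream handle and buffer names below are illustrative, not part of this file):
//
//    cudaStream_t stream;
//    cudaStreamCreate(&stream);
//    cudaMemcpyAsync(dst, src, nBytes, cudaMemcpyDeviceToDevice, stream);
//    cudaStreamSynchronize(stream);   // block only until this stream's work is done
//    cudaStreamDestroy(stream);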
template<typename AFloat>
void TCudnn<AFloat>::InitializeBNormDescriptors(TDescriptors * & descriptors,
                                                typename TCudnn<AFloat>::BNormLayer_t * L)
{
   auto bnormDescriptors = new BNormDescriptors_t();
   // ...
   Tensor_t & outputTensor = L->GetOutput();
   Tensor_t & data = L->GetReshapedData();
   if (L->GetNormAxis() == -1 && L->GetBatchSize() == outputTensor.GetShape()[0] &&
       L->GetDepth() == 1 && L->GetHeight() == 1) {
      // batch normalization after a dense layer: the output is a column-major
      // B x S matrix, which is viewed here as a 4D tensor for cuDNN
      R__ASSERT(outputTensor.GetLayout() != GetTensorLayout());
      // ...
      data = Tensor_t(outputTensor.GetDeviceBuffer(), {1, L->GetWidth(), 1, L->GetBatchSize()},
                      GetTensorLayout(), 0, 0);
   }
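   // With this reshape, a dense-layer output of shape (batch, width) is presented
   // to cuDNN as a 4D tensor {N = 1, C = width, H = 1, W = batch}. Since
   // CUDNN_BATCHNORM_SPATIAL (set further below) normalizes per channel C over the
   // N, H and W dimensions, each feature ends up normalized over the batch, which
   // is exactly what batch normalization after a dense layer requires.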
   else if (L->GetNormAxis() == 1) {
      // batch normalization after a convolutional layer: use the NCHW output shape
      outputTensor.PrintShape("output");   // debug printout
      Tensor_t tmp({L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
                   GetTensorLayout(), 0, 0);
      tmp.PrintShape("tmp");
      data = Tensor_t(outputTensor.GetDeviceBuffer(),
                      {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
                      GetTensorLayout(), 0, 0);
      // ...
      outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),
                              {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
                              GetTensorLayout(), 0, 0);
      // ...
      Tensor_t & activationGradients = L->GetActivationGradients();
      activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),
                                     outputTensor.GetShape(), GetTensorLayout(), 0, 0);
      outputTensor.PrintShape("output2");
   }
   // ...
   outputTensor.PrintShape("output bnorm");
   data.PrintShape("reshaped data");
   // ...
   CUDNNCHECK(cudnnCreateTensorDescriptor(&bnormDescriptors->HelperDescriptor));
   // ...
   cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
   // ...
   CUDNNCHECK(cudnnDeriveBNTensorDescriptor(bnormDescriptors->HelperDescriptor,
                                            data.GetTensorDescriptor(),
                                            bnMode));
   // ...
   descriptors = bnormDescriptors;
}
template <typename AFloat>
void TCudnn<AFloat>::InitializeActivationDescriptor(TCudnn<AFloat>::ActivationDescriptor_t & descriptor,
                                                    EActivationFunction activFunc, double coef)
{
   cudnnActivationMode_t activationMode;
   bool isIdentity = false;
   switch (activFunc) {
   case EActivationFunction::kIdentity: isIdentity = true; break; // no cuDNN activation is set
   case EActivationFunction::kRelu:     activationMode = CUDNN_ACTIVATION_RELU; break;
   case EActivationFunction::kSigmoid:  activationMode = CUDNN_ACTIVATION_SIGMOID; break;
   case EActivationFunction::kTanh:     activationMode = CUDNN_ACTIVATION_TANH; break;
   case EActivationFunction::kFastTanh: activationMode = CUDNN_ACTIVATION_TANH; break;
   default:                             activationMode = CUDNN_ACTIVATION_RELU; // others unsupported by cuDNN
   }

   CUDNNCHECK(cudnnCreateActivationDescriptor(&descriptor));
   // do not set an activation function for the identity
   if (!isIdentity) CUDNNCHECK(cudnnSetActivationDescriptor(descriptor, activationMode, CUDNN_PROPAGATE_NAN, coef));
}
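// A sketch of how cuDNN later consumes this descriptor (handle, tensor descriptors
// and device pointers below are placeholders, not names from this file): the forward
// activation is computed as y = alpha * act(x) + beta * y.
//
//    const float one = 1.f, zero = 0.f;
//    CUDNNCHECK(cudnnActivationForward(handle, descriptor,
//                                      &one,  xDesc, x,
//                                      &zero, yDesc, y));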
template<typename AFloat>
void TCudnn<AFloat>::InitializeConvDescriptors(TDescriptors * & descriptors, ConvLayer_t * L) {
   auto convDescriptors = new CNN::TCNNDescriptors<typename TCudnn<AFloat>::ConvLayer_t>();
   // ...
   cudnnDataType_t cudnnDataType;
   if (std::is_same<AFloat, double>::value)     { cudnnDataType = CUDNN_DATA_DOUBLE; }
   else if (std::is_same<AFloat, float>::value) { cudnnDataType = CUDNN_DATA_FLOAT; }

   double coef = 0.0;   // activation coefficient (e.g. for clipped ReLU)
   InitializeActivationDescriptor(convDescriptors->HelperDescriptor, L->GetActivationFunction(), coef);

   CUDNNCHECK(cudnnCreateConvolutionDescriptor(&convDescriptors->LayerDescriptor));
   CUDNNCHECK(cudnnCreateFilterDescriptor(&convDescriptors->WeightsDescriptor));
   // ...
   CUDNNCHECK(cudnnSetConvolution2dDescriptor(convDescriptors->LayerDescriptor,
                                              L->GetPaddingHeight(),
                                              L->GetPaddingWidth(),
                                              L->GetStrideRows(),
                                              L->GetStrideCols(),
                                              1, 1,   // dilation
                                              CUDNN_CROSS_CORRELATION,
                                              cudnnDataType));
   // ...
   CUDNNCHECK(cudnnSetFilter4dDescriptor(convDescriptors->WeightsDescriptor,
                                         cudnnDataType,
                                         CUDNN_TENSOR_NCHW,
                                         L->GetDepth(),        // output channels
                                         L->GetInputDepth(),   // input channels
                                         L->GetFilterHeight(),
                                         L->GetFilterWidth()));

   descriptors = convDescriptors;
}
template <typename AFloat>
void TCudnn<AFloat>::InitializePoolDescriptors(TDescriptors * & descriptors, PoolingLayer_t * L) {
   auto poolDescriptors = new CNN::TCNNDescriptors<typename TCudnn<AFloat>::PoolingLayer_t>();
   CUDNNCHECK(cudnnCreatePoolingDescriptor(&poolDescriptors->LayerDescriptor));
   CUDNNCHECK(cudnnCreateDropoutDescriptor(&poolDescriptors->HelperDescriptor));
   // ...
   CUDNNCHECK(cudnnSetPooling2dDescriptor(poolDescriptors->LayerDescriptor,
                                          CUDNN_POOLING_MAX,
                                          CUDNN_PROPAGATE_NAN,
                                          L->GetFilterHeight(),
                                          L->GetFilterWidth(),
                                          0, 0,   // padding
                                          L->GetStrideRows(),
                                          L->GetStrideCols()));
   // ...
   descriptors = poolDescriptors;

   // reshape the output and activation-gradient tensors to NCHW
   Tensor_t & outputTensor = L->GetOutput();
   outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),
                           {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
                           GetTensorLayout(), 0, 0);

   Tensor_t & activationGradients = L->GetActivationGradients();
   activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(),
                                  outputTensor.GetShape(), GetTensorLayout(), 0, 0);
}
template<typename AFloat>
void TCudnn<AFloat>::ReleaseConvDescriptors(TDescriptors * descriptors) {
   auto convDescriptors = static_cast<ConvDescriptors_t *>(descriptors);
   ReleaseDescriptor(convDescriptors->LayerDescriptor);
   ReleaseDescriptor(convDescriptors->HelperDescriptor);
   ReleaseDescriptor(convDescriptors->WeightsDescriptor);
}

template <typename AFloat>
void TCudnn<AFloat>::ReleasePoolDescriptors(TDescriptors * descriptors) {
   auto poolDescriptors = static_cast<PoolingDescriptors_t *>(descriptors);
   ReleaseDescriptor(poolDescriptors->LayerDescriptor);
   ReleaseDescriptor(poolDescriptors->HelperDescriptor);
   ReleaseDescriptor(poolDescriptors->WeightsDescriptor);
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseBNormDescriptors(TDescriptors * descriptors) {
   auto bnormDescriptors = static_cast<BNormDescriptors_t *>(descriptors);
   ReleaseDescriptor(bnormDescriptors->HelperDescriptor);
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(ActivationDescriptor_t & activationDescr) {
   CUDNNCHECK(cudnnDestroyActivationDescriptor(activationDescr));
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr) {
   CUDNNCHECK(cudnnDestroyConvolutionDescriptor(convolutionDescr));
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr) {
   CUDNNCHECK(cudnnDestroyDropoutDescriptor(dropoutDescr));
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(TensorDescriptor_t & tensorDescr) {
   CUDNNCHECK(cudnnDestroyTensorDescriptor(tensorDescr));
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(FilterDescriptor_t & filterDescr) {
   CUDNNCHECK(cudnnDestroyFilterDescriptor(filterDescr));
}

template <typename AFloat>
void TCudnn<AFloat>::ReleaseDescriptor(PoolingDescriptor_t & poolingDescr) {
   CUDNNCHECK(cudnnDestroyPoolingDescriptor(poolingDescr));
}
template <typename AFloat>
void TCudnn<AFloat>::InitializeConvWorkspace(TWorkspace * & workspace,
                                             TDescriptors * & descriptors,
                                             const DNN::CNN::TConvParams & /*params*/,
                                             ConvLayer_t * L) {
   auto convWorkspace = new ConvWorkspace_t();
   size_t memLimit = (CNNOptions::ConvMaxWorkspaceSize > 0)
                        ? static_cast<size_t>(CNNOptions::ConvMaxWorkspaceSize)
                        : 0;
   auto convDescriptors = static_cast<ConvDescriptors_t *>(descriptors);
#if (CUDNN_VERSION >= 8000)
   enum algoPreference { no_workspace, fastest, workspace_limit };
   algoPreference algoChoice;
   // helper struct to index the cudnn*AlgoPerf_t arrays returned by the
   // cudnnFind*Algorithm calls in a uniform way
   struct LocalPerf {
      LocalPerf(cudnnConvolutionFwdAlgoPerf_t * fwd) { m_fwd = fwd; }
      LocalPerf(cudnnConvolutionBwdFilterAlgoPerf_t * bwdFilter) { m_bwdFilter = bwdFilter; }
      LocalPerf(cudnnConvolutionBwdDataAlgoPerf_t * bwdData) { m_bwdData = bwdData; }
      size_t getMemory(int i) {
         return m_fwd != nullptr ? m_fwd[i].memory
              : m_bwdFilter != nullptr ? m_bwdFilter[i].memory
              : m_bwdData != nullptr ? m_bwdData[i].memory : 0;
      }
      float getTime(int i) {
         return m_fwd != nullptr ? m_fwd[i].time
              : m_bwdFilter != nullptr ? m_bwdFilter[i].time
              : m_bwdData != nullptr ? m_bwdData[i].time : 0;
      }
      cudnnStatus_t getStatus(int i) {
         return m_fwd != nullptr ? m_fwd[i].status
              : m_bwdFilter != nullptr ? m_bwdFilter[i].status
              : m_bwdData != nullptr ? m_bwdData[i].status : CUDNN_STATUS_BAD_PARAM;
      }
      int getIdx(algoPreference const & algoPref, int const algoCount,
                 size_t memLim = std::numeric_limits<size_t>::max()) {
         int algoIdx = 0;
         if (algoPref == algoPreference::fastest) {
            // pick the fastest algorithm that completed successfully
            float temp_runtime{std::numeric_limits<float>::max()};
            for (int i = 0; i < algoCount; ++i) {
               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime) {
                  temp_runtime = getTime(i);
                  algoIdx = i;
               }
            }
         } else if (algoPref == algoPreference::workspace_limit) {
            // fastest successful algorithm whose workspace fits into memLim
            float temp_runtime{std::numeric_limits<float>::max()};
            for (int i = 0; i < algoCount; ++i) {
               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getTime(i) < temp_runtime &&
                   getMemory(i) <= memLim) {
                  temp_runtime = getTime(i);
                  algoIdx = i;
               }
            }
         } else {
            // no_workspace: successful algorithm with the smallest memory footprint
            size_t temp_memsize{std::numeric_limits<size_t>::max()};
            for (int i = 0; i < algoCount; ++i) {
               if (getStatus(i) == CUDNN_STATUS_SUCCESS && getMemory(i) < temp_memsize) {
                  temp_memsize = getMemory(i);
                  algoIdx = i;
               }
            }
         }
         return algoIdx;
      }
      cudnnConvolutionFwdAlgoPerf_t * m_fwd = nullptr;
      cudnnConvolutionBwdFilterAlgoPerf_t * m_bwdFilter = nullptr;
      cudnnConvolutionBwdDataAlgoPerf_t * m_bwdData = nullptr;
   };
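   // Usage pattern (it appears three times below, once per convolution pass):
   // wrap the perf-results array and pick the entry matching the preference, e.g.
   //
   //    LocalPerf fwdPerfResults{convFwdPerfResults};
   //    convWorkspace->AlgorithmForward =
   //       convFwdPerfResults[fwdPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;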
#else
   // cuDNN < 8 selects algorithms through preference enums instead
   cudnnConvolutionFwdPreference_t preferenceFwd;
   cudnnConvolutionBwdDataPreference_t preferenceBwdData;
   cudnnConvolutionBwdFilterPreference_t preferenceBwdFilter;
#endif

   // translate the configured workspace size into an algorithm-selection strategy
   if (CNNOptions::ConvMaxWorkspaceSize < 0) {
#if (CUDNN_VERSION >= 8000)
      algoChoice = fastest;
#else
      preferenceFwd = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
#endif
   } else if (CNNOptions::ConvMaxWorkspaceSize == 0) {
#if (CUDNN_VERSION >= 8000)
      algoChoice = no_workspace;
#else
      preferenceFwd = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
#endif
   } else {
#if (CUDNN_VERSION >= 8000)
      algoChoice = workspace_limit;
#else
      preferenceFwd = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
      preferenceBwdData = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
      preferenceBwdFilter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
#endif
   }
   // reshape the layer tensors to NCHW (the filters stay row-major)
   Tensor_t & filters = L->GetWeightsAt(0);
   filters = Tensor_t(filters.GetDeviceBuffer(),
                      {L->GetDepth(), L->GetInputDepth(), L->GetFilterHeight(), L->GetFilterWidth()},
                      MemoryLayout::RowMajor, 0, 0);

   Tensor_t & biases = L->GetBiasesAt(0);
   biases = Tensor_t(biases.GetDeviceBuffer(), {1, L->GetDepth(), 1, 1}, GetTensorLayout(), 0, 0);

   Tensor_t & outputTensor = L->GetOutput();
   outputTensor = Tensor_t(outputTensor.GetDeviceBuffer(),
                           {L->GetBatchSize(), L->GetDepth(), L->GetHeight(), L->GetWidth()},
                           GetTensorLayout(), 0, 0);

   Tensor_t & inputActivation = L->GetInputActivation();
   inputActivation = Tensor_t(inputActivation.GetDeviceBuffer(), outputTensor.GetShape(),
                              GetTensorLayout(), 0, 0);

   Tensor_t & activationGradients = L->GetActivationGradients();
   activationGradients = Tensor_t(activationGradients.GetDeviceBuffer(), outputTensor.GetShape(),
                                  GetTensorLayout(), 0, 0);

   Tensor_t & weightGradients = L->GetWeightGradientsAt(0);
   weightGradients = Tensor_t(weightGradients.GetDeviceBuffer(), filters.GetShape(),
                              GetTensorLayout(), 0, 0);

   Tensor_t & biasGradients = L->GetBiasGradientsAt(0);
   biasGradients = Tensor_t(biasGradients.GetDeviceBuffer(), biases.GetShape(),
                            GetTensorLayout(), 0, 0);
   // descriptor for the input tensor of the layer
   cudnnTensorDescriptor_t inputTensorDescriptor;
   CUDNNCHECK(cudnnCreateTensorDescriptor(&inputTensorDescriptor));
   CUDNNCHECK(cudnnSetTensor4dDescriptor(inputTensorDescriptor,
                                         CUDNN_TENSOR_NCHW,
                                         Tensor_t::GetDataType(),
                                         (int) L->GetBatchSize(),
                                         (int) L->GetInputDepth(),
                                         (int) L->GetInputHeight(),
                                         (int) L->GetInputWidth()));
   // determine the convolution forward algorithm
   cudnnHandle_t cudnnHandle = outputTensor.GetCudnnHandle();

#if (CUDNN_VERSION >= 8000)
   // cuDNN >= 8: query the available algorithms and choose with LocalPerf::getIdx
   int convRequestedAlgoCount{0};
   CUDNNCHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount));
   int algoCount{0};
   cudnnConvolutionFwdAlgoPerf_t convFwdPerfResults[convRequestedAlgoCount];
   CUDNNCHECK(cudnnFindConvolutionForwardAlgorithm(cudnnHandle,
                                                   inputTensorDescriptor,
                                                   convDescriptors->WeightsDescriptor,
                                                   convDescriptors->LayerDescriptor,
                                                   outputTensor.GetTensorDescriptor(),
                                                   convRequestedAlgoCount,
                                                   &algoCount,
                                                   convFwdPerfResults));
   // ...
   LocalPerf fwdPerfResults{convFwdPerfResults};
   convWorkspace->AlgorithmForward =
      convFwdPerfResults[fwdPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
#else
   CUDNNCHECK(cudnnGetConvolutionForwardAlgorithm(
      cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor, convDescriptors->LayerDescriptor,
      outputTensor.GetTensorDescriptor(), preferenceFwd, memLimit,
      &convWorkspace->AlgorithmForward));
#endif
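   // Note: convFwdPerfResults[convRequestedAlgoCount] above is a variable-length
   // array, a GCC/Clang extension rather than standard C++. A portable sketch of
   // the same query:
   //
   //    std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(convRequestedAlgoCount);
   //    CUDNNCHECK(cudnnFindConvolutionForwardAlgorithm(
   //       cudnnHandle, inputTensorDescriptor, convDescriptors->WeightsDescriptor,
   //       convDescriptors->LayerDescriptor, outputTensor.GetTensorDescriptor(),
   //       convRequestedAlgoCount, &algoCount, perf.data()));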
   std::cout << "CONV FWD Algo used for convolution of input shape { " << L->GetBatchSize() << " , "
             << L->GetInputDepth() << " , " << L->GetInputHeight() << " , " << L->GetInputWidth()
             << " } is " << convWorkspace->AlgorithmForward << std::endl;

   // allow the user options to override the chosen algorithm
   if (CNNOptions::ConvFwdAlgorithm > 0) {
      convWorkspace->AlgorithmForward = (cudnnConvolutionFwdAlgo_t) CNNOptions::ConvFwdAlgorithm;
      std::cout << " but force using " << convWorkspace->AlgorithmForward << std::endl;
   }
   cudnnMathType_t math_type = CUDNN_TENSOR_OP_MATH;
   CUDNNCHECK(cudnnSetConvolutionMathType(convDescriptors->LayerDescriptor, math_type));

   CUDNNCHECK(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                                      inputTensorDescriptor,
                                                      convDescriptors->WeightsDescriptor,
                                                      convDescriptors->LayerDescriptor,
                                                      outputTensor.GetTensorDescriptor(),
                                                      convWorkspace->AlgorithmForward,
                                                      &convWorkspace->ForwardWorkspaceSize));

   if (convWorkspace->ForwardWorkspaceSize)
      cudaMalloc(&convWorkspace->ForwardWorkspace, convWorkspace->ForwardWorkspaceSize * sizeof(AFloat));
   if (convWorkspace->ForwardWorkspaceSize > 0 && convWorkspace->ForwardWorkspace == nullptr) {
      std::cerr << "Error allocating FWD CONV workspace of size " << convWorkspace->ForwardWorkspaceSize
                << " - probably running out of memory on the GPU" << std::endl;
      std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
                << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
      // ...
   }
   //
   // backward-data algorithm (gradient w.r.t. the layer input)
   //
   cudnnTensorDescriptor_t activationGradientsBackwardDescriptor = inputTensorDescriptor;

   cudnnHandle = activationGradients.GetCudnnHandle();

#if (CUDNN_VERSION >= 8000)
   CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount));
   cudnnConvolutionBwdDataAlgoPerf_t convBwdDataPerfResults[convRequestedAlgoCount];
   CUDNNCHECK(cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle,
                                                        convDescriptors->WeightsDescriptor,
                                                        activationGradients.GetTensorDescriptor(),
                                                        convDescriptors->LayerDescriptor,
                                                        activationGradientsBackwardDescriptor,
                                                        convRequestedAlgoCount,
                                                        &algoCount,
                                                        convBwdDataPerfResults));
   LocalPerf bwdDataPerfResults{convBwdDataPerfResults};
   convWorkspace->AlgorithmBackward =
      convBwdDataPerfResults[bwdDataPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
#else
   CUDNNCHECK(cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle,
                                                       convDescriptors->WeightsDescriptor,
                                                       activationGradients.GetTensorDescriptor(),
                                                       convDescriptors->LayerDescriptor,
                                                       activationGradientsBackwardDescriptor,
                                                       preferenceBwdData, memLimit,
                                                       &convWorkspace->AlgorithmBackward));
#endif
   std::cout << "CONV BWD Data Algo used is " << convWorkspace->AlgorithmBackward << std::endl;

   if (CNNOptions::ConvBwdDataAlgorithm > 0) {
      convWorkspace->AlgorithmBackward = (cudnnConvolutionBwdDataAlgo_t) CNNOptions::ConvBwdDataAlgorithm;
      std::cout << " but force using " << convWorkspace->AlgorithmBackward << std::endl;
   }
   CUDNNCHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
                                                           convDescriptors->WeightsDescriptor,
                                                           activationGradients.GetTensorDescriptor(),
                                                           convDescriptors->LayerDescriptor,
                                                           activationGradientsBackwardDescriptor,
                                                           convWorkspace->AlgorithmBackward,
                                                           &convWorkspace->BackwardWorkspaceSize));

   if (convWorkspace->BackwardWorkspaceSize)
      cudaMalloc(&convWorkspace->BackwardWorkspace, convWorkspace->BackwardWorkspaceSize * sizeof(AFloat));
   if (convWorkspace->BackwardWorkspaceSize > 0 && convWorkspace->BackwardWorkspace == nullptr) {
      std::cerr << "Error allocating BACKW DATA CONV workspace of size " << convWorkspace->BackwardWorkspaceSize
                << " - probably running out of memory on the GPU" << std::endl;
      std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
                << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
      // ...
   }
   //
   // backward-filter algorithm (gradient w.r.t. the weights)
   //
   cudnnTensorDescriptor_t activationBackwardDescriptor = inputTensorDescriptor;

#if (CUDNN_VERSION >= 8000)
   CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle, &convRequestedAlgoCount));
   cudnnConvolutionBwdFilterAlgoPerf_t convBwdFilterPerfResults[convRequestedAlgoCount];
   CUDNNCHECK(cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle,
                                                          activationBackwardDescriptor,
                                                          activationGradients.GetTensorDescriptor(),
                                                          convDescriptors->LayerDescriptor,
                                                          convDescriptors->WeightsDescriptor,
                                                          convRequestedAlgoCount,
                                                          &algoCount,
                                                          convBwdFilterPerfResults));
   LocalPerf bwdFilterPerfResults{convBwdFilterPerfResults};
   convWorkspace->HelperAlgorithm =
      convBwdFilterPerfResults[bwdFilterPerfResults.getIdx(algoChoice, algoCount, memLimit)].algo;
#else
   CUDNNCHECK(cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle,
                                                         activationBackwardDescriptor,
                                                         activationGradients.GetTensorDescriptor(),
                                                         convDescriptors->LayerDescriptor,
                                                         convDescriptors->WeightsDescriptor,
                                                         preferenceBwdFilter, memLimit,
                                                         &convWorkspace->HelperAlgorithm));
#endif
   std::cout << "CONV BWD Filter Algo used is " << convWorkspace->HelperAlgorithm << std::endl;

   if (CNNOptions::ConvBwdFilterAlgorithm > 0) {
      convWorkspace->HelperAlgorithm = (cudnnConvolutionBwdFilterAlgo_t) CNNOptions::ConvBwdFilterAlgorithm;
      std::cout << " but force using " << convWorkspace->HelperAlgorithm << std::endl;
   }
   CUDNNCHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
      cudnnHandle, activationBackwardDescriptor, activationGradients.GetTensorDescriptor(),
      convDescriptors->LayerDescriptor, convDescriptors->WeightsDescriptor, convWorkspace->HelperAlgorithm,
      &convWorkspace->HelperWorkspaceSize));

   if (convWorkspace->HelperWorkspaceSize)
      cudaMalloc(&convWorkspace->HelperWorkspace, convWorkspace->HelperWorkspaceSize * sizeof(AFloat));
   if (convWorkspace->HelperWorkspaceSize > 0 && convWorkspace->HelperWorkspace == nullptr) {
      std::cerr << "Error allocating BACKW FILTER CONV workspace of size " << convWorkspace->HelperWorkspaceSize
                << " - probably running out of memory on the GPU" << std::endl;
      std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
                << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
      filters.PrintShape("filterTensor");
      // ...
   }
   // descriptor and workspace for the ADD reduction used to compute bias gradients
   CUDNNCHECK(cudnnCreateReduceTensorDescriptor(&convWorkspace->fReduceTensorDesc));

   auto reduceTensorDesc = convWorkspace->fReduceTensorDesc;
   CUDNNCHECK(cudnnSetReduceTensorDescriptor(reduceTensorDesc, CUDNN_REDUCE_TENSOR_ADD, Tensor_t::GetDataType(),
                                             CUDNN_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));

   CUDNNCHECK(cudnnGetReductionWorkspaceSize(cudnnHandle, reduceTensorDesc, activationGradients.GetTensorDescriptor(),
                                             biasGradients.GetTensorDescriptor(),
                                             &convWorkspace->fReductionWorkspaceSize));
   if (convWorkspace->fReductionWorkspaceSize > 0)
      cudaMalloc(&convWorkspace->fReductionWorkspace, convWorkspace->fReductionWorkspaceSize);

   workspace = convWorkspace;

   CUDNNCHECK(cudnnDestroyTensorDescriptor(inputTensorDescriptor));
}
template <typename AFloat>
void TCudnn<AFloat>::InitializePoolDropoutWorkspace(TWorkspace * & workspace,
                                                    TDescriptors * & descriptors,
                                                    const DNN::CNN::TConvParams & /*params*/,
                                                    PoolingLayer_t * L) {
   auto poolWorkspace = new PoolingWorkspace_t();
   auto poolDescriptors = static_cast<PoolingDescriptors_t *>(descriptors);

   cudnnHandle_t cudnnHandle = L->GetOutput().GetCudnnHandle();

   // descriptor for the layer input
   cudnnTensorDescriptor_t inputTensorDescriptor;
   CUDNNCHECK(cudnnCreateTensorDescriptor(&inputTensorDescriptor));
   CUDNNCHECK(cudnnSetTensor4dDescriptor(inputTensorDescriptor,
                                         CUDNN_TENSOR_NCHW,
                                         Tensor_t::GetDataType(),
                                         (int) L->GetBatchSize(),
                                         (int) L->GetInputDepth(),
                                         (int) L->GetInputHeight(),
                                         (int) L->GetInputWidth()));

   // per-activation reserve space needed by dropout
   CUDNNCHECK(cudnnDropoutGetReserveSpaceSize(inputTensorDescriptor,
                                              &poolWorkspace->HelperWorkspaceSize));
   if (poolWorkspace->HelperWorkspaceSize) {
      cudaMalloc(&poolWorkspace->HelperWorkspace, poolWorkspace->HelperWorkspaceSize * sizeof(AFloat));
      if (poolWorkspace->HelperWorkspace == nullptr) {
         std::cerr << "Error allocating POOL reserved dropout workspace of size "
                   << poolWorkspace->HelperWorkspaceSize
                   << " - probably running out of memory on the GPU" << std::endl;
         std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
                   << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
         // ...
      }
   }
   // RNG state buffer required by the dropout descriptor
   CUDNNCHECK(cudnnDropoutGetStatesSize(cudnnHandle, &poolWorkspace->ForwardWorkspaceSize));
   if (poolWorkspace->ForwardWorkspaceSize) {
      cudaMalloc(&poolWorkspace->ForwardWorkspace, poolWorkspace->ForwardWorkspaceSize * sizeof(AFloat));
      if (poolWorkspace->ForwardWorkspace == nullptr) {
         std::cerr << "Error allocating POOL dropout state of size " << poolWorkspace->ForwardWorkspaceSize
                   << " - probably running out of memory on the GPU" << std::endl;
         std::cout << " layer input shape is { " << L->GetBatchSize() << " , " << L->GetInputDepth() << " , "
                   << L->GetInputHeight() << " , " << L->GetInputWidth() << " } " << std::endl;
         // ...
      }
   }

   // build a 64-bit seed from two 32-bit random integers; the parentheses around
   // the shift are required because + binds more tightly than <<
   TRandom & rand = TCudnn<AFloat>::GetRandomGenerator();
   unsigned long long seed =
      ((unsigned long long) rand.Integer(UINT_MAX) << 32) + rand.Integer(UINT_MAX);

   CUDNNCHECK(cudnnSetDropoutDescriptor(poolDescriptors->HelperDescriptor,
                                        cudnnHandle,
                                        L->GetDropoutProbability(),
                                        poolWorkspace->ForwardWorkspace,
                                        poolWorkspace->ForwardWorkspaceSize,
                                        seed));

   workspace = poolWorkspace;

   CUDNNCHECK(cudnnDestroyTensorDescriptor(inputTensorDescriptor));
}
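// A quick worked example of the precedence pitfall fixed in the seed construction
// above: the shift operator binds more loosely than addition, so without the
// parentheses the expression shifts by (32 + lo) instead of combining the halves.
//
//    unsigned long long hi = 0x1, lo = 0x2;
//    auto wrong = hi << 32 + lo;    // parsed as hi << 34
//    auto right = (hi << 32) + lo;  // 0x100000002, the intended 64-bit value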
template <typename AFloat>
void TCudnn<AFloat>::FreeConvWorkspace(TWorkspace * workspace) {
   if (!workspace) return;
   auto convWorkspace = static_cast<ConvWorkspace_t *>(workspace);

   if (convWorkspace->ForwardWorkspace)  cudaFree(convWorkspace->ForwardWorkspace);
   if (convWorkspace->BackwardWorkspace) cudaFree(convWorkspace->BackwardWorkspace);
   if (convWorkspace->HelperWorkspace)   cudaFree(convWorkspace->HelperWorkspace);

   CUDNNCHECK(cudnnDestroyReduceTensorDescriptor(convWorkspace->fReduceTensorDesc));

   if (convWorkspace->fReductionWorkspace)
      cudaFree(convWorkspace->fReductionWorkspace);
}
template <typename AFloat>
void TCudnn<AFloat>::FreePoolDropoutWorkspace(TWorkspace * workspace) {
   if (!workspace) return;
   auto poolWorkspace = static_cast<PoolingWorkspace_t *>(workspace);

   if (poolWorkspace->ForwardWorkspace)  cudaFree(poolWorkspace->ForwardWorkspace);
   if (poolWorkspace->BackwardWorkspace) cudaFree(poolWorkspace->BackwardWorkspace);
   if (poolWorkspace->HelperWorkspace)   cudaFree(poolWorkspace->HelperWorkspace);
}
template <typename AFloat>
void TCudnn<AFloat>::BatchNormLayerForwardTraining(int axis, const Tensor_t & x, Tensor_t & y,
                                                   Matrix_t & gamma, Matrix_t & beta,
                                                   Matrix_t & mean, Matrix_t & /*variance*/, Matrix_t & iVariance,
                                                   Matrix_t & runningMeans, Matrix_t & runningVars,
                                                   Scalar_t nTrainedBatches, Scalar_t momentum, Scalar_t epsilon,
                                                   const TensorDescriptor_t & bnParDescriptor)
{
   AFloat a = 1.0;   // blending factors: y = a * result + b * y
   AFloat b = 0.0;
   cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;
   // ...
   double exponentialAverageFactor = (momentum < 0.) ? 1. / (1 + nTrainedBatches) : 1. - momentum;
   CUDNNCHECK(cudnnBatchNormalizationForwardTraining(x.GetCudnnHandle(), bnMode,
                                                     &a, &b,
                                                     x.GetTensorDescriptor(), x.GetDataPointer(),
                                                     y.GetTensorDescriptor(), y.GetDataPointer(),
                                                     bnParDescriptor,
                                                     gamma.GetDataPointer(), beta.GetDataPointer(),
                                                     exponentialAverageFactor,
                                                     runningMeans.GetDataPointer(),
                                                     runningVars.GetDataPointer(),
                                                     epsilon, mean.GetDataPointer(), iVariance.GetDataPointer()));
}
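// With f = exponentialAverageFactor, cuDNN updates the running statistics as
//
//    runningMean = (1 - f) * runningMean + f * batchMean
//    runningVar  = (1 - f) * runningVar  + f * batchVar
//
// so f = 1 / (1 + nTrainedBatches) accumulates a plain average over the batches
// seen so far, while f = 1 - momentum yields an exponential moving average.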
template <typename AFloat>
void TCudnn<AFloat>::BatchNormLayerForwardInference(int axis, const Tensor_t & x, Matrix_t & gamma, Matrix_t & beta,
                                                    Tensor_t & y, const Matrix_t & runningMeans,
                                                    const Matrix_t & runningVars, Scalar_t epsilon,
                                                    const TensorDescriptor_t & bnParDescriptor)
{
   AFloat a = 1.0;
   AFloat b = 0.0;
   cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;

   // use the running mean and variance accumulated during training
   CUDNNCHECK(cudnnBatchNormalizationForwardInference(x.GetCudnnHandle(), bnMode,
                                                      &a, &b,
                                                      x.GetTensorDescriptor(), x.GetDataPointer(),
                                                      y.GetTensorDescriptor(), y.GetDataPointer(),
                                                      bnParDescriptor,
                                                      gamma.GetDataPointer(), beta.GetDataPointer(),
                                                      runningMeans.GetDataPointer(),
                                                      runningVars.GetDataPointer(),
                                                      epsilon));
}
template <typename AFloat>
void TCudnn<AFloat>::BatchNormLayerBackward(int axis, const Tensor_t & x, const Tensor_t & dy, Tensor_t & dx,
                                            Matrix_t & gamma,
                                            Matrix_t & dgamma, Matrix_t & dbeta, const Matrix_t & mean,
                                            const Matrix_t & variance, const Matrix_t & iVariance,
                                            Scalar_t epsilon, const TensorDescriptor_t & bnParDescriptor)
{
   AFloat a = 1.0;
   AFloat b = 0.0;
   cudnnBatchNormMode_t bnMode = CUDNN_BATCHNORM_SPATIAL;

   CUDNNCHECK(cudnnBatchNormalizationBackward(x.GetCudnnHandle(), bnMode,
                                              &a, &b, &a, &b,   // data and parameter blending factors
                                              x.GetTensorDescriptor(), x.GetDataPointer(),
                                              dy.GetTensorDescriptor(), dy.GetDataPointer(),
                                              dx.GetTensorDescriptor(), dx.GetDataPointer(),
                                              bnParDescriptor,
                                              gamma.GetDataPointer(),
                                              dgamma.GetDataPointer(), dbeta.GetDataPointer(),
                                              epsilon, mean.GetDataPointer(), iVariance.GetDataPointer()));
}
template <typename AFloat>
void TCudnn<AFloat>::ConvLayerForward(Tensor_t & outputTensor,
                                      Tensor_t & inputActivation,
                                      const Tensor_t & input,
                                      const Matrix_t & weights, const Matrix_t & biases,
                                      const DNN::CNN::TConvParams & params,
                                      EActivationFunction activFunc,
                                      Tensor_t & inputPrime,
                                      const ConvDescriptors_t & descriptors,
                                      ConvWorkspace_t & workspace)
{
   // ...
   assert(input.GetLayout() == GetTensorLayout());
   // ...
   cudnnHandle_t cudnnHandle = input.GetCudnnHandle();
   // ...
   // consistency checks: the descriptor shapes must match the tensor shapes
   int n, c, h, w, s1, s2, s3, s4;
   cudnnDataType_t dataType;
   cudnnGetTensor4dDescriptor(input.GetTensorDescriptor(), &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
   std::vector<size_t> shape_input = {size_t(n), size_t(c), size_t(h), size_t(w)};
   assert(shape_input == input.GetShape());

   cudnnGetTensor4dDescriptor(outputTensor.GetTensorDescriptor(), &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
   std::vector<size_t> shape_output = {size_t(n), size_t(c), size_t(h), size_t(w)};
   assert(shape_output == outputTensor.GetShape());

   // convolution: inputActivation = alpha * conv(input, weights) + beta * inputActivation
   const AFloat alpha = 1.0;
   const AFloat beta  = 0.0;
   cudnnStatus_t status = cudnnConvolutionForward(cudnnHandle,
                                                  &alpha,
                                                  input.GetTensorDescriptor(),
                                                  input.GetDataPointer(),
                                                  descriptors.WeightsDescriptor,
                                                  weights.GetDataPointer(),
                                                  descriptors.LayerDescriptor,
                                                  workspace.AlgorithmForward,
                                                  workspace.ForwardWorkspace,
                                                  workspace.ForwardWorkspaceSize,
                                                  &beta,
                                                  inputActivation.GetTensorDescriptor(),
                                                  inputActivation.GetDataPointer());
   assert(status == CUDNN_STATUS_SUCCESS);

   // add the biases, then apply the activation function
   AddConvBiases(inputActivation, biases);
   // ...
   TCudnn<AFloat>::ActivationFunctionForward(outputTensor, inputActivation, activFunc,
                                             descriptors.HelperDescriptor, 0.0, 1.0, 0.0);
}
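// Taken together, the forward pass computes
//
//    inputActivation = conv(input, weights) + biases
//    outputTensor    = activFunc(inputActivation)
//
// The pre-activation values are kept in inputActivation because the backward pass
// (ConvLayerBackward below) needs them to evaluate the activation derivative.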
template<typename AFloat>
void TCudnn<AFloat>::ConvLayerBackward(Tensor_t & activationGradientsBackward,
                                       Matrix_t & weightGradients, Matrix_t & biasGradients,
                                       Tensor_t & inputActivation,
                                       Tensor_t & activationGradients,
                                       const Matrix_t & weights,
                                       const Tensor_t & activationBackward,
                                       const Tensor_t & outputTensor,
                                       EActivationFunction activFunc,
                                       const ConvDescriptors_t & descriptors,
                                       ConvWorkspace_t & workspace /* , ... unused geometry parameters ... */)
{
   // ...
   // gradient of the activation, computed in place into activationGradients
   ActivationFunctionBackward(activationGradients, outputTensor, activationGradients, inputActivation,
                              activFunc, descriptors.HelperDescriptor);
   // ...
   const AFloat alpha = 1.0;
   const AFloat beta  = 0.0;

   cudnnHandle_t cudnnHandle = outputTensor.GetCudnnHandle();

   // 1. gradient w.r.t. the input activations (skipped when there is no previous layer)
   if (activationGradientsBackward.GetSize() > 0)
      CUDNNCHECK(cudnnConvolutionBackwardData(cudnnHandle,
                                              &alpha,
                                              descriptors.WeightsDescriptor,
                                              weights.GetDataPointer(),
                                              activationGradients.GetTensorDescriptor(),
                                              activationGradients.GetDataPointer(),
                                              descriptors.LayerDescriptor,
                                              workspace.AlgorithmBackward,
                                              workspace.BackwardWorkspace,
                                              workspace.BackwardWorkspaceSize,
                                              &beta,
                                              activationGradientsBackward.GetTensorDescriptor(),
                                              activationGradientsBackward.GetDataPointer()));

   // 2. gradient w.r.t. the filter weights
   CUDNNCHECK(cudnnConvolutionBackwardFilter(
      cudnnHandle, &alpha, activationBackward.GetTensorDescriptor(), activationBackward.GetDataPointer(),
      activationGradients.GetTensorDescriptor(), activationGradients.GetDataPointer(), descriptors.LayerDescriptor,
      workspace.HelperAlgorithm, workspace.HelperWorkspace, workspace.HelperWorkspaceSize, &beta,
      descriptors.WeightsDescriptor, weightGradients.GetDataPointer()));

   // 3. gradient w.r.t. the biases
   CUDNNCHECK(cudnnConvolutionBackwardBias(cudnnHandle, &alpha, activationGradients.GetTensorDescriptor(),
                                           activationGradients.GetDataPointer(), &beta,
                                           biasGradients.GetTensorDescriptor(), biasGradients.GetDataPointer()));
   // Alternative bias-gradient computations kept in this listing for reference
   // (each would replicate the cudnnConvolutionBackwardBias result above):
   // (a) transform NCHW -> CNHW and reduce with matrix helpers
   auto shape = activationGradients.GetShape();
   Tensor_t actGradTransf({shape[1], shape[0], shape[2], shape[3]}, activationGradients.GetLayout());
   CUDNNCHECK(cudnnTransformTensor(cudnnHandle, &alpha, activationGradients.GetTensorDescriptor(),
                                   activationGradients.GetDataPointer(), &beta, actGradTransf.GetTensorDescriptor(),
                                   actGradTransf.GetDataPointer()));
   TCudaMatrix<AFloat> actGradMatrix(actGradTransf.GetDeviceBuffer(), shape[0] * shape[2] * shape[3], shape[1]);
   TCudaMatrix<AFloat> temp(biasGradients.GetDeviceBuffer(), biasGradients.GetShape()[1], 1);
   // ...
   // (b) a single ADD reduction over the N, H and W dimensions
   CUDNNCHECK(cudnnReduceTensor(cudnnHandle, workspace.fReduceTensorDesc, nullptr, 0, workspace.fReductionWorkspace,
                                workspace.fReductionWorkspaceSize, &alpha, activationGradients.GetTensorDescriptor(),
                                activationGradients.GetDataPointer(), &beta, biasGradients.GetTensorDescriptor(),
                                biasGradients.GetDataPointer()));
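   // The reduction computes, for each output channel c,
   //
   //    biasGradients[c] = sum over n,h,w of activationGradients[n,c,h,w]
   //
   // cudnnReduceTensor sums over exactly those dimensions where the output
   // descriptor has extent 1, i.e. from shape {N,C,H,W} down to {1,C,1,1}.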
   // (c) an explicit loop over the events in the batch using matrix views
   biasGradients.Zero();
   TCudaMatrix<AFloat> temp(biasGradients.GetShape()[1], 1);
   TCudaMatrix<AFloat> biasGradMatrix(biasGradients.GetDeviceBuffer(), biasGradients.GetShape()[1], 1);
   size_t batchSize = activationGradients.GetFirstSize();
   for (size_t event = 0; event < batchSize; event++) {
      TCudaTensor<AFloat> actGrad = activationGradients.At(event);
      TCudaMatrix<AFloat> actGradMatrix(actGrad.GetDeviceBuffer(),
                                        activationGradients.GetShape()[2] * activationGradients.GetShape()[3],
                                        activationGradients.GetShape()[0]);
      // ...
   }
}
template<typename AFloat>
void TCudnn<AFloat>::AddConvBiases(Tensor_t & output, const Tensor_t & biases)
{
   // ScaleAdd adds the (scaled) bias elements to the output tensor
   TCudnn<AFloat>::ScaleAdd(output, biases);
}
template<typename AFloat>
void TCudnn<AFloat>::Downsample(Tensor_t & A, Tensor_t & /*B*/, const Tensor_t & C,
                                const PoolingDescriptors_t & descriptors,
                                PoolingWorkspace_t & workspace /* , ... pooling geometry parameters ... */)
{
   // ...
   const AFloat alpha = 1.0;
   const AFloat beta  = 0.0;
   // downsample the input C into the output A
   CUDNNCHECK(cudnnPoolingForward(C.GetCudnnHandle(),
                                  descriptors.LayerDescriptor,
                                  &alpha,
                                  C.GetTensorDescriptor(),
                                  C.GetDataPointer(),
                                  &beta,
                                  A.GetTensorDescriptor(),
                                  A.GetDataPointer()));
}
template<typename AFloat>
void TCudnn<AFloat>::MaxPoolLayerBackward(Tensor_t & activationGradientsBackward,
                                          const Tensor_t & activationGradients,
                                          const Tensor_t & indexMatrix,
                                          const Tensor_t & activationBackward,
                                          const Tensor_t & outputTensor,
                                          const PoolingDescriptors_t & descriptors,
                                          PoolingWorkspace_t & workspace /* , ... pooling geometry parameters ... */)
{
   // ...
   const AFloat alpha = 1.0;
   const AFloat beta  = 0.0;
   // ...
   CUDNNCHECK(cudnnPoolingBackward(outputTensor.GetCudnnHandle(),
                                   descriptors.LayerDescriptor,
                                   &alpha,
                                   outputTensor.GetTensorDescriptor(),
                                   outputTensor.GetDataPointer(),
                                   activationGradients.GetTensorDescriptor(),
                                   activationGradients.GetDataPointer(),
                                   activationBackward.GetTensorDescriptor(),
                                   activationBackward.GetDataPointer(),
                                   &beta,
                                   activationGradientsBackward.GetTensorDescriptor(),
                                   activationGradientsBackward.GetDataPointer()));
}
// Flattens the tensor B such that each matrix is stretched into one row,
// resulting in the matrix A.
template<typename AFloat>
void TCudnn<AFloat>::Flatten(TCudaTensor<AFloat> & A,
                             const TCudaTensor<AFloat> & B)
{
   // ...
}

// Transforms each row of B back into a matrix and stores it in the tensor A
// (the inverse of Flatten).
template<typename AFloat>
void TCudnn<AFloat>::Deflatten(TCudaTensor<AFloat> & A,
                               const TCudaTensor<AFloat> & B)
{
   // ...
}
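// Shape sketch (illustrative; the exact TCudaTensor conventions are defined
// elsewhere in TMVA): for a tensor holding n matrices of size rows x cols,
//
//    Flatten   : B {n, rows, cols}  ->  A {n, rows * cols}   (one row per matrix)
//    Deflatten : A {n, rows * cols} ->  B {n, rows, cols}    (inverse mapping)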