Propagation.cu
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh                                *
6 * All rights reserved.                                                  *
7 *                                                                       *
8 * For the licensing terms see $ROOTSYS/LICENSE.                         *
9 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
10 *************************************************************************/
11
12 //////////////////////////////////////////////////////////////////
13 // Implementation of the functions required for the forward and //
14 // backward propagation of activations through a neural network //
15 // for CUDA architectures. //
16 //////////////////////////////////////////////////////////////////
17
18#include "TMVA/DNN/Architectures/Cuda.h"
19#include "TMVA/DNN/Architectures/Cuda/Device.h"
20#include "Kernels.cuh"
21#include <math.h>
22
23namespace TMVA {
24namespace DNN {
25
26//____________________________________________________________________________
27template<>
28void TCuda<float>::MultiplyTranspose(TCudaMatrix<float> &output,
29 const TCudaMatrix<float> &input,
30 const TCudaMatrix<float> &Weights)
31{
32 int m, n, k;
33 k = input.GetNcols();
34 m = input.GetNrows();
35 n = Weights.GetNrows();
36 float alpha = 1.0, beta = 0.0;
37
38 // Compute C = beta * C + alpha * (A * B^T)
39 cudaStream_t s = output.GetComputeStream();
40 cublasSetStream(input.GetCublasHandle(), s);
41 cublasSgemm(input.GetCublasHandle(),
42 CUBLAS_OP_N, CUBLAS_OP_T,
43 m, n, k, & alpha,
44 input.GetDataPointer(), m, // *A, lda
45 Weights.GetDataPointer(), n, // *B, ldb
46 & beta, // beta
47 output.GetDataPointer(), m); // *C, ldc
48}
49
50//____________________________________________________________________________
51template<>
52void TCuda<double>::MultiplyTranspose(TCudaMatrix<double> &output,
53 const TCudaMatrix<double> &input,
54 const TCudaMatrix<double> &Weights)
55{
56 int m, n, k;
57 k = input.GetNcols();
58 m = input.GetNrows();
59 n = Weights.GetNrows();
60 double alpha = 1.0, beta = 0.0;
61
62 // Compute C = beta * C + alpha * (A * B^T)
63 cudaStream_t s = output.GetComputeStream();
64 cublasSetStream(input.GetCublasHandle(), s);
65 cublasDgemm(input.GetCublasHandle(),
66 CUBLAS_OP_N, CUBLAS_OP_T,
67 m, n, k, & alpha,
68 input.GetDataPointer(), m, // *A, lda
69 Weights.GetDataPointer(), n, // *B, ldb
70 & beta, // beta
71 output.GetDataPointer(), m); // *C, ldc
72}
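// Editor's note: an illustrative sketch, not part of the original file, of the shape
// convention used by MultiplyTranspose above. With input of shape (nEvents x nFeatures)
// and Weights of shape (nOutputs x nFeatures), the cuBLAS GEMM computes
// output = input * Weights^T of shape (nEvents x nOutputs). The function name and the
// concrete sizes below are hypothetical example values.
template<typename AFloat>
void ExampleMultiplyTransposeShapes()
{
   TCudaMatrix<AFloat> input(32, 16);   // 32 events, 16 input features each
   TCudaMatrix<AFloat> weights(8, 16);  // 8 output units, each with 16 weights
   TCudaMatrix<AFloat> output(32, 8);   // receives input * weights^T
   TCuda<AFloat>::MultiplyTranspose(output, input, weights);
}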
73
74//____________________________________________________________________________
75template<typename AFloat>
76void TCuda<AFloat>::AddRowWise(TCudaMatrix<AFloat> &Weights,
77 const TCudaMatrix<AFloat> &theta)
78{
79 dim3 blockDims = TDevice::BlockDims2D();
80 dim3 gridDims = TDevice::GridDims2D(Weights);
81 cudaStream_t s = Weights.GetComputeStream();
82 ::TMVA::DNN::Cuda::AddRowWise<<<gridDims, blockDims, 0, s>>>(
83 Weights.GetDataPointer(),
84 theta.GetDataPointer(),
85 Weights.GetNrows(),
86 Weights.GetNcols());
87}
88
89//____________________________________________________________________________
90template<typename AFloat>
91void TCuda<AFloat>::Backward(TCudaTensor<AFloat> & activation_gradients_backward,
92 TCudaMatrix<AFloat> & weight_gradients,
93 TCudaMatrix<AFloat> & bias_gradients,
94 const TCudaTensor<AFloat> & df,
95 const TCudaTensor<AFloat> & activation_gradients,
96 const TCudaMatrix<AFloat> & weights,
97 const TCudaTensor<AFloat> & activation_backward)
98{
99 // Compute element-wise product.
100 //Matrix_t df_m = df.GetMatrix();
101
102 // df is the output of ActivationBackward
103 //TCuda<AFloat>::Hadamard(df, activation_gradients);
104 //TCuda<AFloat>::Hadamard(df_m, activation_gradients.GetMatrix());
105
106 Matrix_t df_m = df.GetMatrix();
107
108 // Activation gradients.
109 if (activation_gradients_backward.GetSize() > 0) {
110
111 Matrix_t activation_gradients_backward_m = activation_gradients_backward.GetMatrix();
112 TCuda<AFloat>::Multiply(activation_gradients_backward_m, df_m, weights);
113 }
114
115 // Weight gradients.
116 if (weight_gradients.GetNoElements() > 0) {
117 TCuda<AFloat>::TransposeMultiply(weight_gradients, df_m, activation_backward.GetMatrix());
118 }
119
120 // Bias gradients.
121 if (bias_gradients.GetNoElements() > 0) {
122 TCuda<AFloat>::SumColumns(bias_gradients, df_m);
123 }
124
125}
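// Editor's note: a hedged summary (editor's addition) of the dense-layer backward pass
// above. Writing X for the forward input (activation_backward), W for the weights and
// dY for df (the incoming gradient already combined with the activation derivative),
// the three guarded branches compute
//
//    dX = dY * W       via Multiply           (gradient w.r.t. the layer input)
//    dW = dY^T * X     via TransposeMultiply  (gradient w.r.t. the weights)
//    db = sum of dY over the batch, via SumColumns (gradient w.r.t. the biases)
//
// and each product is skipped when the corresponding output matrix is empty.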
126
127//____________________________________________________________________________
128template<typename AFloat>
129void TCuda<AFloat>::Copy(TCudaMatrix<AFloat> & B,
130 const TCudaMatrix<AFloat> & A)
131{
132 size_t m = B.GetNrows();
133 size_t n = B.GetNcols();
134 cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
135 m * n * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0);
136}
137
138//____________________________________________________________________________
139template<typename AFloat>
140void TCuda<AFloat>::Copy(TCudaTensor<AFloat> & B,
141 const TCudaTensor<AFloat> & A)
142{
143 size_t n = B.GetSize();
144 //assert (A.GetSize() >= B.GetSize());
145 cudaMemcpyAsync(B.GetDataPointer(), A.GetDataPointer(),
146 n * sizeof(AFloat), cudaMemcpyDeviceToDevice, 0);
147}
148
149//____________________________________________________________________________
150template<typename AFloat>
151size_t TCuda<AFloat>::calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
152{
153 size_t temp = imgDim - fltDim + 2 * padding;
154 if (temp % stride || temp + stride <= 0) {
155 Fatal("calculateDimension", "Incompatible hyper parameters for layer - (imageDim, filterDim, padding, stride)"
156 " %zu , %zu , %zu , %zu", imgDim, fltDim, padding, stride);
157 }
158 return temp / stride + 1;
159}
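// Editor's note: a worked example with editor-chosen numbers for the formula above.
// For imgDim = 32, fltDim = 5, padding = 2 and stride = 1: temp = 32 - 5 + 2*2 = 31,
// 31 % 1 == 0, so the output dimension is 31 / 1 + 1 = 32, i.e. "same" padding keeps
// the image size. With stride = 3 instead, 31 % 3 != 0 and Fatal() is raised.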
160
161
162///////////////////////////////////////////////////////////////////////////////////
163/// \brief A helper for image operations that rearranges image regions into
164/// column vectors.
165///
166/// \param[out] A The output matrix. Each row corresponds to a receptive field.
167/// \param[in] B The input matrix. Each row corresponds to a row in the image view.
168/// \param[in] imgHeight The height of the input.
169/// \param[in] imgWidth The width of the input.
170/// \param[in] fltHeight Height of the kernel.
171/// \param[in] fltWidth Width of the kernel.
172/// \param[in] strideRows Stride size in the vertical dimension.
173/// \param[in] strideCols Stride size in the horizontal dimension.
174/// \param[in] zeroPaddingHeight The padding in the vertical dimension.
175/// \param[in] zeroPaddingWidth The padding in the horizontal dimension.
176///
177/// This transformation allows us to express a 2D convolution as a matrix
178/// multiplication. We can therefore harness the finely tuned GEMM
179/// implementation of cuBLAS to achieve maximum performance. This function
180/// can greatly speed-up propagation in TConvLayer.
181///////////////////////////////////////////////////////////////////////////////////
182template<typename AFloat>
183void TCuda<AFloat>::Im2col(TCudaMatrix<AFloat> &A,
184 const TCudaMatrix<AFloat> &B,
185 size_t imgHeight,
186 size_t imgWidth,
187 size_t fltHeight,
188 size_t fltWidth,
189 size_t strideRows,
190 size_t strideCols,
191 size_t zeroPaddingHeight,
192 size_t zeroPaddingWidth)
193{
194 size_t depth = B.GetNrows();
195
196 dim3 blockDims = TDevice::BlockDims2D();
197 dim3 gridDims = TDevice::GridDims2D(A);
198 cudaStream_t s = A.GetComputeStream();
199
200 ::TMVA::DNN::Cuda::Im2Col<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(), B.GetDataPointer(), depth, imgHeight, imgWidth,
201 fltHeight, fltWidth, strideRows, strideCols,
202 zeroPaddingHeight, zeroPaddingWidth);
203}
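// Editor's note: an illustrative sketch, not part of the original file, of how Im2col
// turns a convolution into a single GEMM, mirroring the per-event loop of
// ConvLayerForward below. The function name and all concrete sizes (3-channel 32x32
// image, sixteen 5x5 filters, stride 1, padding 2) are hypothetical example values.
template<typename AFloat>
void ExampleIm2colConvolution(const TCudaMatrix<AFloat> &image,   // 3 x (32*32), one row per channel
                              const TCudaMatrix<AFloat> &weights, // 16 x (3*5*5), one row per filter
                              TCudaMatrix<AFloat> &output)        // 16 x (32*32), one row per output channel
{
   const size_t nLocalViews = 32 * 32;        // output height * width for stride 1, padding 2
   const size_t nLocalViewPixels = 3 * 5 * 5; // filter depth * filter height * filter width
   TCudaMatrix<AFloat> imagePrime(nLocalViews, nLocalViewPixels);
   // Each row of imagePrime becomes one receptive field of the zero-padded image.
   TCuda<AFloat>::Im2col(imagePrime, image, 32, 32, 5, 5, 1, 1, 2, 2);
   // The convolution is then a single matrix product: output = weights * imagePrime^T.
   TCuda<AFloat>::MultiplyTranspose(output, weights, imagePrime);
}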
204
205//____________________________________________________________________________
206template<typename AFloat>
207void TCuda<AFloat>::RotateWeights(TCudaMatrix<AFloat> &A,
208 const TCudaMatrix<AFloat> &B,
209 size_t filterDepth,
210 size_t filterHeight,
211 size_t filterWidth,
212 size_t numFilters)
213{
214 dim3 blockDims = TDevice::BlockDims2D();
215 dim3 gridDims = TDevice::GridDims2D(B);
216 cudaStream_t s = B.GetComputeStream();
217
218 ::TMVA::DNN::Cuda::RotateWeights<<<gridDims, blockDims, 0, s>>>(A.GetDataPointer(), B.GetDataPointer(), filterDepth,
219 filterHeight, filterWidth, numFilters);
220}
221
222#if 0
223template <typename AFloat>
224void TCuda<AFloat>::PrepareInternals(TCudaTensor<AFloat> & inputPrime)
225{
226 // I don't think this is needed when using tensors
227 // for (size_t event = 0; event < inputPrime.size(); event++) {
228 // cudaStream_t s;
229 // cudaStreamCreate(&s);
230 // inputPrime[event].SetComputeStream(s);
231 // }
232}
233#endif
234
235template <typename AFloat>
236void TCuda<AFloat>::ConvLayerForward(TCudaTensor<AFloat> & output,
237 TCudaTensor<AFloat> & inputActivationFunc,
238 const TCudaTensor<AFloat> & input,
239 const TCudaMatrix<AFloat> &weights, const TCudaMatrix<AFloat> & biases,
240 const DNN::CNN::TConvParams & params, EActivationFunction activFunc,
241 TCudaTensor<AFloat> & inputPrime,
242 const ConvDescriptors_t & /*descriptors*/,
243 ConvWorkspace_t & /*workspace*/)
244{
245 size_t height = calculateDimension(params.inputHeight, params.filterHeight, params.paddingHeight, params.strideRows);
246 size_t width = calculateDimension(params.inputWidth, params.filterWidth, params.paddingWidth, params.strideCols);
247
248 // for(size_t event = 0; event < input.size(); event++) {
249 // cudaStream_t s = inputPrime[event].GetComputeStream();
250 // output[event].SetComputeStream(s);
251 // derivatives[event].SetComputeStream(s);
252 // }
253
254 for(size_t event = 0; event < input.GetFirstSize(); event++) {
255 Matrix_t inputPrime_m = inputPrime.At(event).GetMatrix();
256 Matrix_t output_m = output.At(event).GetMatrix();
257
258 Im2col(inputPrime_m, input.At(event).GetMatrix(), params.inputHeight, params.inputWidth, params.filterHeight, params.filterWidth,
259 params.strideRows, params.strideCols, params.paddingHeight, params.paddingWidth);
260
261 MultiplyTranspose(output_m, weights, inputPrime_m);
262 AddConvBiases(output_m, biases);
263 }
264
265 //evaluateDerivative<TCuda<AFloat>>(derivatives, activFunc, output);
266 //evaluate<TCuda<AFloat>>(output, activFunc);
267
268 // save output of convolution before activation function evaluation
269 Copy(inputActivationFunc, output);
270 ActivationFunctionForward(output, activFunc, ActivationDescriptor_t() );
271
272}
273
274//____________________________________________________________________________
275template<typename AFloat>
276void TCuda<AFloat>::ConvLayerBackward(TCudaTensor<AFloat> & activationGradientsBackward,
277 TCudaMatrix<AFloat> & weightGradients,
278 TCudaMatrix<AFloat> & biasGradients,
279 TCudaTensor<AFloat> & inputActivationFunc,
280 TCudaTensor<AFloat> & activationGradients,
281 const TCudaMatrix<AFloat> & weights,
282 const TCudaTensor<AFloat> & activationBackward,
283 const Tensor_t & outputTensor,
284 EActivationFunction activFunc,
285 const ConvDescriptors_t & /*descriptors*/,
286 ConvWorkspace_t & /*workspace*/,
287 size_t batchSize,
288 size_t inputHeight,
289 size_t inputWidth,
290 size_t depth,
291 size_t height,
292 size_t width,
293 size_t filterDepth,
294 size_t filterHeight,
295 size_t filterWidth,
296 size_t nLocalViews)
297{
298
299 // Compute activation backward
300 //Tensor_t df = activationGradients; // this is a shallow copy
301 Tensor_t df(activationGradients.GetShape() );
302 ActivationFunctionBackward(df, outputTensor, activationGradients, inputActivationFunc,
303 activFunc, ActivationDescriptor_t() );
304
305
306 //Hadamard(df, activationGradients);
307
308
309 // Calculate the activation gradients of the previous layer
310 CalculateConvActivationGradients(activationGradientsBackward, df, weights, batchSize, inputHeight, inputWidth, depth,
311 height, width, filterDepth, filterHeight, filterWidth);
312
313
314 // Calculate the weight gradients
315 CalculateConvWeightGradients(weightGradients, df, activationBackward, batchSize, inputHeight, inputWidth, depth,
316 height, width, filterDepth, filterHeight, filterWidth, nLocalViews);
317
318 // Calculate the bias gradients
319 CalculateConvBiasGradients(biasGradients, df, batchSize, depth, nLocalViews);
320}
321
322//____________________________________________________________________________
323template<typename AFloat>
324void TCuda<AFloat>::CalculateConvActivationGradients(
325 TCudaTensor<AFloat> & activationGradientsBackward,
326 const TCudaTensor<AFloat> & df,
327 const TCudaMatrix<AFloat> & weights,
328 size_t batchSize,
329 size_t inputHeight,
330 size_t inputWidth,
331 size_t depth,
332 size_t height,
333 size_t width,
334 size_t filterDepth,
335 size_t filterHeight,
336 size_t filterWidth)
337{
338 if (activationGradientsBackward.GetSize() == 0) return;
339
340 TCudaMatrix<AFloat> rotWeights(filterDepth, depth * filterHeight * filterWidth);
341 RotateWeights(rotWeights, weights, filterDepth, filterHeight, filterWidth, weights.GetNrows());
342
343 // Calculate the zero paddings.
344 size_t tempZeroPaddingHeight = (size_t)(floor((inputHeight - height + filterHeight - 1) / 2));
345 size_t tempZeroPaddingWidth = (size_t)(floor((inputWidth - width + filterWidth - 1) / 2));
346
347 // Calculate the number of local views and the number of pixels in each view.
348 size_t tempNLocalViews = inputHeight * inputWidth;
349 size_t tempNLocalViewPixels = depth * filterHeight * filterWidth;
350
351 // Problem here. We need to generalize!
352 size_t tempStrideRows = 1;
353 size_t tempStrideCols = 1;
354
355 R__ASSERT( df.GetFirstSize() == batchSize);
356 // Convolution.
357 TCudaMatrix<AFloat> dfPrime(tempNLocalViews, tempNLocalViewPixels);
358 for(size_t event = 0; event < batchSize; event++) {
359 Im2col(dfPrime, df.At(event).GetMatrix(), height, width, filterHeight, filterWidth, tempStrideRows, tempStrideCols,
360 tempZeroPaddingHeight, tempZeroPaddingWidth);
361
362 TCudaMatrix<AFloat> agb_m = activationGradientsBackward.At(event).GetMatrix();
363 MultiplyTranspose(agb_m, rotWeights, dfPrime);
364 }
365}
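// Editor's note: a worked check with editor-chosen numbers for the padding computed above.
// For a stride-1 "same" convolution with inputHeight = height = 32 and filterHeight = 5,
// tempZeroPaddingHeight = (32 - 32 + 5 - 1) / 2 = 2, so the Im2col pass over df produces
// 32 x 32 = inputHeight * inputWidth local views, matching the shape expected by the
// previous layer.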
366
367//____________________________________________________________________________
368template<typename AFloat>
369void TCuda<AFloat>::CalculateConvWeightGradients(TCudaMatrix<AFloat> & weightGradients,
370 const TCudaTensor<AFloat> & df,
371 const TCudaTensor<AFloat> & activationsBackward,
372 size_t batchSize,
373 size_t inputHeight,
374 size_t inputWidth,
375 size_t depth,
376 size_t height,
377 size_t width,
378 size_t filterDepth,
379 size_t filterHeight,
380 size_t filterWidth,
381 size_t nLocalViews)
382{
383 // reinitialize the weight gradients to 0
384 weightGradients.Zero();
385
386 const size_t filterSize = filterHeight * filterWidth;
387 const size_t nLocalViewPixels = filterDepth * filterSize;
388 R__ASSERT( weightGradients.GetNcols() == nLocalViewPixels);
389 R__ASSERT( weightGradients.GetNrows() == depth);
390 R__ASSERT( df.GetFirstSize() == batchSize);
391
392
393
394 const size_t tempStrideRows = 1;
395 const size_t tempStrideCols = 1;
396
397 // Calculate the zero paddings from the input height and width (assume stride = 1)
398 const size_t tempZeroPaddingHeight = (height - inputHeight + filterHeight - 1) / 2;
399 const size_t tempZeroPaddingWidth = (width - inputWidth + filterWidth - 1) / 2;
400
401 // Convolution.
402 TCudaMatrix<AFloat> activationsPrime(nLocalViews, nLocalViewPixels);
403 TCudaMatrix<AFloat> resPrime(depth, nLocalViewPixels);
404 for(size_t event = 0; event < batchSize; event++) {
405 Im2col(activationsPrime, activationsBackward.At(event).GetMatrix(), inputHeight, inputWidth, filterHeight, filterWidth,
406 tempStrideRows, tempStrideCols, tempZeroPaddingHeight, tempZeroPaddingWidth);
407
408 Multiply(resPrime, df.At(event).GetMatrix(), activationsPrime);
409
410 TCuda<AFloat>::ScaleAdd(weightGradients, resPrime, 1.0);
411 }
412}
413
414//____________________________________________________________________________
415template<typename AFloat>
416void TCuda<AFloat>::CalculateConvBiasGradients(TCudaMatrix<AFloat> & biasGradients,
417 const TCudaTensor<AFloat> & df,
418 size_t batchSize,
419 size_t /* depth */,
420 size_t /* nLocalViews */)
421{
422 biasGradients.Zero();
423 TCudaMatrix<AFloat> temp(biasGradients.GetNrows(), biasGradients.GetNcols());
424 for (size_t event = 0; event < batchSize; event++) {
425 TCuda<AFloat>::SumRows(temp, df.At(event).GetMatrix());
426 TCuda<AFloat>::ScaleAdd(biasGradients, temp);
427 }
428}
429
430//____________________________________________________________________________
431template<typename AFloat>
432void TCuda<AFloat>::AddConvBiases(TCudaMatrix<AFloat> &output,
433 const TCudaMatrix<AFloat> &biases)
434{
435 dim3 blockDims = TDevice::BlockDims2D();
436 dim3 gridDims = TDevice::GridDims2D(output);
437 cudaStream_t s = output.GetComputeStream();
438 ::TMVA::DNN::Cuda::AddBiases<<<gridDims, blockDims, 0, s>>>(
439 output.GetDataPointer(),
440 biases.GetDataPointer(),
441 output.GetNrows(),
442 output.GetNcols());
443}
444
445
446//____________________________________________________________________________
447//////////////////////////////////////////////////////////////////////////////////////////////
448/// \brief Downsampling function used as the forward propagation step of a
449/// Max-Pooling layer.
450///
451/// \param[out] A The output matrix. Each row corresponds to a slice and each element
452/// is the max within a receptive field.
453/// \param[out] B The winning indices matrix. Each element is the index of the max element.
454/// \param[in] C The input matrix. Each row is a slice.
455/// \param[in] imgHeight The height of the input.
456/// \param[in] imgWidth The width of the input.
457/// \param[in] fltHeight Height of the kernel.
458/// \param[in] fltWidth Width of the kernel.
459/// \param[in] strideRows Stride size in the vertical dimension.
460/// \param[in] strideCols Stride size in the horizontal dimension.
461///
462/// Each output element is the maximum of the receptive field. We also save the winning
463/// indices to facilitate back-propagation - we need to know which input element influenced
464/// the output and only apply the derivative correction to this particular element.
465/// The slicing process is the same as in a convolutional layer; however, the padding is set to 0.
466///////////////////////////////////////////////////////////////////////////////////////////////
467template<typename AFloat>
468void TCuda<AFloat>::Downsample(TCudaTensor<AFloat> &A,
469 TCudaTensor<AFloat> &B,
470 const TCudaTensor<AFloat> &C,
471 const PoolingDescriptors_t & /*descriptors*/,
472 PoolingWorkspace_t & /*workspace*/,
473 size_t imgHeight,
474 size_t imgWidth,
475 size_t fltHeight,
476 size_t fltWidth,
477 size_t strideRows,
478 size_t strideCols)
479{
480 size_t depth = C.GetCSize();
481 size_t bsize = C.GetFirstSize();
482
483 dim3 blockDims = TDevice::BlockDims2D();
484 dim3 gridDims = TDevice::GridDims2D( A.GetHSize(), A.GetWSize() );
485 cudaStream_t s = A.GetComputeStream();
486
487 for(size_t event = 0; event < bsize; event++) {
488 // need to implement tensor kernel
489 // ::TMVA::DNN::Cuda::Downsample<<<gridDims, blockDims, 0, s>>>(mA.GetDataPointer(), mB.GetDataPointer(),
490 // mC.GetDataPointer(), depth, imgHeight, imgWidth,
491 // fltHeight, fltWidth, strideRows, strideCols);
492 ::TMVA::DNN::Cuda::Downsample<<<gridDims, blockDims, 0, s>>>(A.GetDataPointerAt(event), B.GetDataPointerAt(event),
493 C.GetDataPointerAt(event), depth, imgHeight, imgWidth,
494 fltHeight, fltWidth, strideRows, strideCols);
495 }
496}
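// Editor's note: a small numeric illustration (editor's example) of the downsampling above.
// Take one 2x4 slice of C, a 2x2 pooling window and stride 2 in both directions:
//
//    C_slice = [ 1 3 2 0 ]      A_slice = [ 4 7 ]
//              [ 4 2 7 5 ]
//
// Each element of A is the maximum of its receptive field (4 from {1,3,4,2}, 7 from
// {2,0,7,5}), and B records where each winning element sits in the slice so that
// MaxPoolLayerBackward below can route gradients only to those inputs.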
497//____________________________________________________________________________
498template<typename AFloat>
499void TCuda<AFloat>::MaxPoolLayerBackward(TCudaTensor<AFloat> & activationGradientsBackward,
500 const TCudaTensor<AFloat> & activationGradients,
501 const TCudaTensor<AFloat> & indexMatrix,
502 const Tensor_t & /*inputActivation*/,
503 const Tensor_t & /*outputTensor*/,
504 const PoolingDescriptors_t & /*descriptors*/,
505 PoolingWorkspace_t & /*workspace*/,
506 size_t imgHeight,
507 size_t imgWidth,
508 size_t fltHeight,
509 size_t fltWidth,
510 size_t strideRows,
511 size_t strideCols,
512 size_t /* nLocalViews */)
513{
514 size_t depth = activationGradientsBackward.GetCSize();
515 size_t bsize = activationGradients.GetFirstSize();
516
517 dim3 blockDims = TDevice::BlockDims2D();
518 // activationGradientsBackward.GetHSize() should be equal to depth
519 dim3 gridDims = TDevice::GridDims2D(activationGradientsBackward.GetHSize(),
520 activationGradientsBackward.GetWSize());
521 cudaStream_t s = activationGradientsBackward.GetComputeStream();
522
523 for(size_t event = 0; event < bsize; event++) {
524
525 ::TMVA::DNN::Cuda::MaxPoolBackward<<<gridDims, blockDims, 0, s>>>(activationGradientsBackward.GetDataPointerAt(event),
526 activationGradients.GetDataPointerAt(event),
527 indexMatrix.GetDataPointerAt(event),
528 depth, imgHeight, imgWidth, fltHeight, fltWidth,
529 strideRows, strideCols);
530 }
531}
532
533//____________________________________________________________________________
534template<typename AFloat>
535void TCuda<AFloat>::Reshape(TCudaMatrix<AFloat> &A, const TCudaMatrix<AFloat> &B)
536{
537 dim3 blockDims = TDevice::BlockDims2D();
538 dim3 gridDims = TDevice::GridDims2D(A);
539 cudaStream_t s = A.GetComputeStream();
540
541 ::TMVA::DNN::Cuda::Reshape<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(),
542 A.GetNrows(), A.GetNcols(), B.GetNrows(), B.GetNcols());
543}
544
545
546//______________________________________________________________________________
547template <typename AReal>
548void TCuda<AReal>::Rearrange(Tensor_t &out, const Tensor_t &in)
549{
550 // B x T x D out --- T x B x D in
551
552 // need to implement (use the CPU impl). Needed by RNN
553 out = in;
554
555 // size_t B = out.size();
556 // size_t T = out[0].GetNrows();
557 // size_t D = out[0].GetNcols();
558 // if ((T != in.size()) || (B != in[0].GetNrows())
559 // || (D != in[0].GetNcols())) {
560 // std::cout << "Incompatible Dimensions\n"
561 // << in.size() << "x" << in[0].GetNrows() << "x" << in[0].GetNcols()
562 // << " --> " << B << "x" << T << "x" << D << "\n";
563 // return;
564 // }
565 // for (size_t i = 0; i < B; ++i) {
566 // for (size_t j = 0; j < T; ++j) {
567 // for (size_t k = 0; k < D; ++k) {
568 // out[i](j, k) = in[j](i, k);
569 // }
570 // }
571 // }
572 return;
573}
574
575//____________________________________________________________________________
576////////////////////////////////////////////////////////////////////////////////
577/// \brief Flatten a vector of matrices into a single matrix.
578///
579/// \param[out] A Output matrix.
580/// \param[in] B Input vector. Each element is a matrix to be concatenated.
581/// \param[in] size Number of matrices in the input vector.
582/// \param[in] nRows Number of rows in each matrix of the input vector.
583/// \param[in] nCols Number of columns on each matrix of the input vector.
584///
585/// Each row in the output matrix is the concatenation of the same row in
586/// each of the input matrices. Passing an std::vector to a CUDA kernel is
587/// a non-trivial task that requires manually allocating and copying to device
588/// memory - details in comments within the function's body. Launching one
589/// thread per output element.
590//////////////////////////////////////////////////////////////////////////////////
591template<typename AFloat>
592void TCuda<AFloat>::Flatten(TCudaTensor<AFloat> &A,
593 const TCudaTensor<AFloat> &B)
594{
595 // flatten B: ( B x C x HW ) in ( 1, B , CHW)
596 size_t nDepth = B.GetFirstSize(); // B size
597 size_t nRows = B.GetCSize(); // C size
598 size_t nCols = B.GetWSize(); // HW size
599 if (B.GetNDim()==4) nCols *= B.GetHSize();
600 assert(B.GetNDim() <= 4);
601
602 dim3 blockDims = TDevice::BlockDims2D();
603 dim3 gridDims = TDevice::GridDims2D(A.GetHSize(), A.GetWSize());
604 cudaStream_t s = A.GetComputeStream();
605
606 // Get raw pointers from a vector of matrices - this is more challenging than it sounds.
607 //
608 // Attention: While `TCudaMatrix::GetDataPointer()` returns a pointer to device memory,
609 // std::vector (and its .data() raw pointer) resides on host memory. Therefore
610 // we need to manually copy these pointers to the device prior to invoking the kernel.
611
612 // const AFloat ** dB; // device pointer to device pointers.
613 // const AFloat ** hB = new const AFloat * [size]; // host pointer to device pointers.
614
615 // cudaMalloc(&dB, sizeof(AFloat *) * size);
616 // for(size_t i = 0; i < size; ++i) {
617 // hB[i] = B[i].GetDataPointer();
618 // }
619
620 // cudaMemcpy(dB, hB, sizeof(AFloat *) * size, cudaMemcpyHostToDevice);
621 //std::cout << "flatten from : " << nDepth << " , " << nRows << " , " << nCols << std::endl;
622
623
624 // for (size_t i = 0; i < size; i++) {
625 // for (size_t j = 0; j < nRows; j++) {
626 // for (size_t k = 0; k < nCols; k++) {
627 // A( 0, i, j * nCols + k) = B(i, j, k);
628 // }
629 // }
630 // }
631
632 //PrintTensor(A, "manual reshape");
633
634 // to be fixed !!!
635 // Launch the kernel using our device pointers.
636
637
638 // for columnwise tensor (B x HW X C) -> flatten in (CHW x B )
639 if (B.GetLayout() == GetTensorLayout() )
640 ::TMVA::DNN::Cuda::Flatten<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(), nDepth, nRows, nCols);
641 else
642 // in case of a row-wise tensor (cuDNN) the input is B x C x H x W --> CHW x B
643 // no need to transpose C with respect to HW
644 ::TMVA::DNN::Cuda::FlattenRM<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(), nDepth, nRows, nCols);
645
646 //PrintTensor(A, "kernel reshape");
647
648
649 // delete [] hB;
650 // cudaFree(dB);
651}
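// Editor's note: a shape-level illustration (editor's example, following the comment at
// the top of Flatten). An input tensor B of shape (batch = 10, C = 16, HW = 1024) becomes
// A of shape (1, 10, 16 * 1024): each event's C x HW matrix is stretched into a single
// row of length CHW. Deflatten below applies the inverse transformation.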
652
653//____________________________________________________________________________
654////////////////////////////////////////////////////////////////////////////////
655/// \brief Deflatten a matrix into a vector of matrices.
656///
657/// \param[out] A Output matrices. Each element will be a part of the input.
658/// \param[in] B Input flat matrix.
659/// \param[in] size Number of matrices in the output vector.
660/// \param[in] nRows Number of rows in each matrix of the output vector.
661/// \param[in] nCols Number of columns on each matrix of the output vector.
662///
663/// Each row in the input matrix is the concatenation of the same row in
664/// each of the output matrices. Passing an std::vector to a CUDA kernel is
665/// a non-trivial task that requires manually allocating and copying to device
666/// memory - details in comments within the function's body. Launching one
667/// thread per input element.
668//////////////////////////////////////////////////////////////////////////////////
669template<typename AFloat>
670void TCuda<AFloat>::Deflatten(TCudaTensor<AFloat> &A,
671 const TCudaTensor<AFloat> &B)
672{
673 size_t nDepth = A.GetFirstSize(); // B size
674 size_t nRows = A.GetCSize(); // C size
675 size_t nCols = A.GetWSize(); // HW size
676 if (A.GetNDim()==4) nCols *= A.GetHSize();
677 assert(A.GetNDim() <= 4);
678
679
680 dim3 blockDims = TDevice::BlockDims2D();
681 dim3 gridDims = TDevice::GridDims2D(B.GetHSize(), B.GetWSize());
682 cudaStream_t s = B.GetComputeStream();
683
684 //std::cout << "Deflatten to " << size << " , " << nRows << " " << nCols << std::endl;
685
686 // Get raw pointers from a vector of matrices - this is more challenging than it sounds.
687 //
688 // Attention: While `TCudaMatrix::GetDataPointer()` returns a pointer to device memory,
689 // std::vector (and its .data() raw pointer) resides on host memory. Therefore
690 // we need to manually copy these pointers to the device prior to invoking the kernel.
691
692 // AFloat ** dA; // device pointer to device pointers.
693 // AFloat ** hA = new AFloat * [size]; // host pointer to device pointers.
694
695 // cudaMalloc(&dA, sizeof(AFloat *) * size);
696
697 // for(size_t i = 0; i < size; ++i) {
698 // hA[i] = A[i].GetDataPointer();
699 // }
700
701 // cudaMemcpy(dA, hA, sizeof(AFloat *) * size, cudaMemcpyHostToDevice);
702
703 // Launch the kernel using our device pointers.
704 // for columnwise output tensor (B x HW X C) -> de-flatten transposing C and HW
705 if (A.GetLayout() == GetTensorLayout() )
706 ::TMVA::DNN::Cuda::Deflatten<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(), nDepth, nRows, nCols);
707 else
708 // case of deflattening into a row-wise tensor
709 ::TMVA::DNN::Cuda::DeflattenRM<<<gridDims, blockDims>>>(A.GetDataPointer(), B.GetDataPointer(), nDepth, nRows, nCols);
710
711 // assert ( B.GetFirstSize() == 1);
712 // assert ( B.GetHSize() == size);
713 // assert ( B.GetWSize() == nRows*nCols);
714 // for (size_t i = 0; i < (size_t)size; i++) {
715 // for (size_t j = 0; j < (size_t)nRows; j++) {
716 // for (size_t k = 0; k < (size_t)nCols; k++) {
717 // A(i, j, k) = B(0, i, j * nCols + k);
718 // }
719 // }
720 // }
721
722
723 // cudaFree(dA);
724 // delete [] hA;
725}
726
727//____________________________________________________________________________
728template <typename AFloat>
729void TCuda<AFloat>::BatchNormLayerForwardTraining(int /* axis */, const Tensor_t &x, Tensor_t &y,
730 Matrix_t &, Matrix_t &,
731 Matrix_t &, Matrix_t &,
732 Matrix_t &, Matrix_t &, Matrix_t &, Scalar_t, Scalar_t, Scalar_t,
733 const TensorDescriptor_t &)
734
735{
736 // Bnorm not yet implemented for Cuda
737 // just copy output = input
738 Copy(y, x);
739}
740
741//____________________________________________________________________________
742template <typename AFloat>
743void TCuda<AFloat>::BatchNormLayerForwardInference(int /* axis */, const Tensor_t &x, Matrix_t &, Matrix_t &,
744 Tensor_t &y, const Matrix_t &,
745 const Matrix_t &, Scalar_t ,
746 const TensorDescriptor_t &)
747
748{
749 Copy(y, x);
750}
751
752//____________________________________________________________________________
753template <typename AFloat>
754void TCuda<AFloat>::BatchNormLayerBackward(int /* axis */, const Tensor_t &/* x */, const Tensor_t &dy, Tensor_t &dx,
755 Matrix_t &/* gamma */, // Matrix_t &beta, (not needed)
756 Matrix_t &/* dgamma */, Matrix_t &/* dbeta */, const Matrix_t &/* mean */,
757 const Matrix_t &/* variance */, const Matrix_t &/* iVariance */, Scalar_t /* epsilon */,
758 const TensorDescriptor_t &/* bnParDescriptor */)
759{
760 Copy(dx, dy);
761}
762
763} // namespace DNN
764} // namespace TMVA