// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 13/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////
// Implementation of the TCudaTensor class. //
//////////////////////////////////////////////
#include "TMVA/DNN/Architectures/Cuda/CudaTensor.h"
#include <algorithm>
#include <cassert>
#include <iostream>

namespace TMVA {
namespace DNN {


// Static members.
//____________________________________________________________________________
#ifdef R__HAS_CUDNN
template<typename AFloat>
std::vector<cudnnHandle_t> TCudaTensor<AFloat>::fCudnnHandle(1);
template<typename AFloat>
cudnnDataType_t TCudaTensor<AFloat>::fDataType = CUDNN_DATA_FLOAT;
#endif

template<typename AFloat>
std::vector<int> TCudaTensor<AFloat>::fInstances(1, 0);
/// Compute the strides of a tensor from its shape.
/// This information is needed for the multi-dimensional indexing. See here:
/// https://en.wikipedia.org/wiki/Row-_and_column-major_order
/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html
template<typename AFloat>
std::vector<std::size_t> TCudaTensor<AFloat>::ComputeStridesFromShape(const std::vector<std::size_t> &shape,
                                                                      bool rowmajorLayout)
{
   const auto size = shape.size();
   std::vector<std::size_t> strides(size);
   if (rowmajorLayout) {
      for (std::size_t i = 0; i < size; i++) {
         if (i == 0) {
            strides[size - 1 - i] = 1;
         } else {
            strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1];
         }
      }
   } else {
      for (std::size_t i = 0; i < size; i++) {
         if (i == 0) {
            strides[i] = 1;
         } else {
            strides[i] = strides[i - 1] * shape[i - 1];
         }
      }
   }
   return strides;
}
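// Worked example (for illustration): for shape {2, 3, 4} a row-major layout
// gives strides {12, 4, 1}, so element (i, j, k) sits at offset i*12 + j*4 + k;
// a column-major layout gives strides {1, 2, 6} and offset i + j*2 + k*6.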

// Constructors.
//____________________________________________________________________________
template<typename AFloat>
TCudaTensor<AFloat>::TCudaTensor()
   : fShape(), fStrides(), fNDim(0), fSize(0), fElementBuffer(), fStreamIndx(0), fTensorDescriptor(nullptr)
{
   //InitializeCuda();
}

//____________________________________________________________________________
template<typename AFloat>
TCudaTensor<AFloat>::TCudaTensor(const std::vector<size_t> & shape,
                                 MemoryLayout layout,
                                 int device, int streamIndx)
   : fShape(shape), fStrides(shape.size()), fNDim(shape.size()), fDevice(device), fStreamIndx(streamIndx),
     fTensorDescriptor(nullptr), fMemoryLayout(layout)
{
   fStrides = ComputeStridesFromShape(fShape, layout == MemoryLayout::RowMajor);

   fSize = (layout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front()
                                              : fStrides.back() * fShape.back();

   // create a new buffer in this case
   fElementBuffer = TCudaDeviceBuffer<AFloat>(fSize);
   // need to initialize Cuda when creating a new Cuda buffer (e.g. create the tensor descriptor)
   InitializeCuda();
}
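// Usage sketch (illustrative; assumes the device and streamIndx parameters are
// defaulted in CudaTensor.h): allocate a row-major batch of 32 3x28x28 images
// and note the geometry derived by the constructor above.
//    TCudaTensor<float> t({32, 3, 28, 28}, MemoryLayout::RowMajor);
//    // fSize = 32*3*28*28 = 75264, fStrides = {2352, 784, 28, 1}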

//____________________________________________________________________________
template<typename AFloat>
TCudaTensor<AFloat>::TCudaTensor(const AFloat * host_data, const std::vector<size_t> & shape,
                                 MemoryLayout layout,
                                 int device, int streamIndx)
   : TCudaTensor(shape, layout, device, streamIndx)
{
   // No intermediate host buffer is needed here: the data is copied directly
   // from host_data into the device buffer allocated by the delegated constructor.
   cudaMemcpy(fElementBuffer, host_data, fSize * sizeof(AFloat),
              cudaMemcpyHostToDevice);

   // no need to initialize Cuda: done in the delegated constructor called above
   //InitializeCuda();
}

//____________________________________________________________________________
template<typename AFloat>
TCudaTensor<AFloat>::TCudaTensor(TCudaDeviceBuffer<AFloat> buffer,
                                 const std::vector<size_t> & shape,
                                 MemoryLayout layout,
                                 int device, int streamIndx)
   : fNDim(shape.size()), fElementBuffer(buffer), fShape(shape), fStrides(shape.size()), fDevice(device),
     fStreamIndx(streamIndx), fTensorDescriptor(nullptr), fMemoryLayout(layout)
{
   // Constructor from an existing buffer. The buffer must hold at least fSize elements.
   fStrides = ComputeStridesFromShape(fShape, layout == MemoryLayout::RowMajor);

   fSize = (layout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front()
                                              : fStrides.back() * fShape.back();
   R__ASSERT(fSize <= buffer.GetSize());

   // need to initialize Cuda in case the device buffer was created separately
   InitializeCuda();
}
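// Sketch (illustrative; assumes TCudaDeviceBuffer's size-only constructor and
// the defaulted device/stream arguments): several tensors can view the same
// device memory without copying, e.g. a 6x4 buffer seen as a {2, 3, 4} tensor.
//    TCudaDeviceBuffer<float> buf(24);
//    TCudaTensor<float> a(buf, {6, 4},    MemoryLayout::RowMajor);
//    TCudaTensor<float> b(buf, {2, 3, 4}, MemoryLayout::RowMajor);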

//____________________________________________________________________________
//FIXME: Go to shared_ptr implementation of instance tracking
// template <typename AFloat>
// TCudaTensor<AFloat>::TCudaTensor(const TCudaTensor<AFloat>& oldTensor) :
//    TCudaTensor(oldTensor.fShape, oldTensor.fMemoryLayout, oldTensor.fDevice, oldTensor.fStreamIndx)
// {
//    // No deep copy
//    fStrides       = oldTensor.fStrides;
//    fElementBuffer = oldTensor.fElementBuffer;

//    std::cout << "calling copy constructor of TCuda tensor" << std::endl;

//    InitializeCuda();
// }

//____________________________________________________________________________
template <typename AFloat>
TCudaTensor<AFloat>::TCudaTensor(const TCudaMatrix<AFloat> & matrix, size_t dim)
   : TCudaTensor(matrix.GetDeviceBuffer(), {matrix.GetNrows(), matrix.GetNcols()}, MemoryLayout::ColumnMajor)
{
   // No deep copy
   if (dim > 2) {
      // change the shape from (nrows, ncols) to (nrows, ncols, 1, ..., 1);
      // this works only for column-major layout, which is what TCudaMatrix uses
      fShape.insert(fShape.end(), dim - 2, 1);
      fStrides.insert(fStrides.end(), dim - 2, fSize);
      fNDim = dim;
      // need to reset the tensor descriptor since we are changing the shape
      SetTensorDescriptor();
   }
}


template<typename AFloat>
TCudaTensor<AFloat>::operator TMatrixT<AFloat>() const
{
   // this should work only for rank-2 or rank-4 tensors
   if (GetLayout() == MemoryLayout::ColumnMajor &&
       (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1))) {
      // return TCudaMatrix<AFloat>(fElementBuffer, GetHSize(), GetWSize());
      TCudaMatrix<AFloat> temp = GetMatrix();
      return temp;
   }
   // we can convert directly to TMatrix
   // assert(fNDim <= 4);
   // size_t nRows = fShape[0]*fShape[1];
   // size_t nCols = fShape[2];
   // if (fNDim == 4) nCols *= fShape[3];

   if (GetLayout() == MemoryLayout::RowMajor) {

      // This assumes that a tensor (D1, D2, D3, D4) is flattened to a matrix (D1, D2*D3*D4)
      TMatrixT<AFloat> hostMatrix(GetNrows(), GetNcols());
      cudaMemcpy(hostMatrix.GetMatrixArray(), fElementBuffer, fSize * sizeof(AFloat),
                 cudaMemcpyDeviceToHost);
      return hostMatrix;

   }
   // else, in the case of a column-major tensor, we need to transpose (this is what is done in TCudaMatrix).
   // Here we assume that a tensor (D1, D2, D3) is flattened to a matrix (D3, D1*D2)
   TMatrixT<AFloat> hostMatrix(GetNcols(), GetNrows());
   cudaMemcpy(hostMatrix.GetMatrixArray(), fElementBuffer, fSize * sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   return hostMatrix.T(); // return the transposed matrix

}
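// Example (illustrative, following the flattening convention stated in the
// comments above): a row-major {8, 3, 4, 4} tensor converts to a TMatrixT with
// 8 rows and 3*4*4 = 48 columns; a column-major {2, 3, 4} tensor converts to a
// (4, 6) host matrix via the transposed copy.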
#ifdef R__HAS_CUDNN
//____________________________________________________________________________
template <typename AFloat>
TCudaTensor<AFloat>::~TCudaTensor()
{
   if (fTensorDescriptor && fTensorDescriptor.use_count() == 1) {
      // //std::cout << "Destroy tensor descriptor for shape ";
      // for (int ii = 0; ii < fNDim; ++ii)
      //    std::cout << fShape[ii] << ",";
      // std::cout << std::endl;
      CUDNNCHECK(cudnnDestroyTensorDescriptor(fTensorDescriptor->fCudnnDesc));

      fInstances[fStreamIndx]--;

      // When all tensors in a streamIndx are destroyed, release the cudnn resources
      if (fInstances[fStreamIndx] <= 0) {
         std::cout << "All Cuda tensors are released - destroy cudnn handle " << fInstances[fStreamIndx] << std::endl;
         CUDNNCHECK(cudnnDestroy(fCudnnHandle[fStreamIndx]));
      }

   }
   //std::cout << "Tensor descriptor destroyed - instances are " << fInstances[fStreamIndx] << std::endl;

}
//____________________________________________________________________________
template <typename AFloat>
void TCudaTensor<AFloat>::InitializeCuda()
{
   // a descriptor is needed for cuDNN tensors that are row-major
   if (!fTensorDescriptor && fSize > 0 && fNDim >= 2) {

      // if ((fInstances[fStreamIndx] < 4 && fInstances[fStreamIndx] > -4) || fInstances[fStreamIndx]%1000 == 0) {
      //    std::cout << " stream index " << fStreamIndx << " instances " << fInstances[fStreamIndx] << std::endl;
      //    PrintShape();
      // }

      // Also check whether a new streamIndx has been opened
      if (fInstances.size() - 1 < fStreamIndx) {
         // if we need to resize once, we will probably need to resize again later
         fInstances.resize(2 * fStreamIndx + 1, 0);
         fCudnnHandle.resize(2 * fStreamIndx + 1, nullptr);
      }
      if (fInstances[fStreamIndx] == 0) {
         std::cout << "TCudaTensor::create cudnn handle !" << std::endl;
         CUDNNCHECK(cudnnCreate(&fCudnnHandle[fStreamIndx]));
         // CUDNNCHECK(cudnnSetStream(fCudnnHandle[fStreamIndx], fElementBuffer.GetComputeStream()));

         // cublasCreate(&fCublasHandle);
         // CUDACHECK(cudaMalloc(& fDeviceReturn, sizeof(AFloat)));
         // CUDACHECK(cudaMalloc(& fCurandStates, TDevice::NThreads(*this)));
      }
      // if (TDevice::NThreads(*this) > (int) fNCurandStates) {
      //    fNCurandStates = TDevice::NThreads(*this);
      //    if (fCurandStates) {
      //       cudaFree(fCurandStates);
      //    }
      //    cudaMalloc(&fCurandStates, TDevice::NThreads(*this) * sizeof(curandState_t));
      //    InitializeCurandStates();
      // }

      // set the data type here to avoid specializing the entire class
      if (std::is_same<AFloat, double>::value) {
         fDataType = CUDNN_DATA_DOUBLE;
      } else if (std::is_same<AFloat, float>::value) {
         fDataType = CUDNN_DATA_FLOAT;
      }

      // create the tensor descriptor
      fTensorDescriptor = std::make_shared<TensorDescriptor>();
      // std::cout << "create tensor descriptor ! " << std::endl;
      CUDNNCHECK(cudnnCreateTensorDescriptor(&(fTensorDescriptor->fCudnnDesc)));

      // we increment the instance count when we create the descriptor
      fInstances[fStreamIndx]++;
   }

   SetTensorDescriptor();

}
template<typename AFloat>
void TCudaTensor<AFloat>::SetTensorDescriptor()
{
   if (!fTensorDescriptor) return;
   if (fSize == 0) return;

   // The cuDNN Nd-tensor format requires at least 4 tensor dimensions.
   // A 4-d tensor is reportedly more performant for lower dimensions and
   // supports all of the following operations (is this really true?).
   if (fNDim == 4 || (fNDim > 1 && fMemoryLayout == MemoryLayout::ColumnMajor) || fNDim == 2) {
      // pad the cudnn tensor with extra dimensions of size 1 (these are used in the convolutions)
      Shape_t shape = fShape;

      if (fNDim < 4 && fNDim > 1) {
         // pad the shape to 4 dimensions with 1s
         if (fMemoryLayout == MemoryLayout::RowMajor)
            shape.insert(shape.end(), 4 - fNDim, 1);
         else
            shape.insert(shape.begin(), 4 - fNDim, 1);
      }
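      // Example (illustrative): a row-major {N, C} tensor is padded at the end
      // to {N, C, 1, 1}, while a column-major {M, N} tensor is padded at the
      // front to {1, 1, M, N}, matching the reversed index order used in the
      // column-major branch below.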

      if (fMemoryLayout == MemoryLayout::RowMajor) {
         auto status = cudnnSetTensor4dDescriptor(fTensorDescriptor->fCudnnDesc,
                                                  CUDNN_TENSOR_NCHW, // layout of the tensor in memory
                                                  fDataType,
                                                  (int)shape[0],  // batch size
                                                  (int)shape[1],  // no. channels
                                                  (int)shape[2],  // image height
                                                  (int)shape[3]); // image width
         assert(status == CUDNN_STATUS_SUCCESS);
         CUDNNCHECK(status);
      } else {
         CUDNNCHECK(cudnnSetTensor4dDescriptor(fTensorDescriptor->fCudnnDesc,
                                               CUDNN_TENSOR_NCHW, // layout of the tensor in memory
                                               fDataType,
                                               (int)shape[3],   // batch size
                                               (int)shape[2],   // no. channels
                                               (int)shape[1],   // image height
                                               (int)shape[0])); // image width
      }

      // Some cudnn operations may not work with this tensor description;
      // tensors with dimensions < 1 are not supported
   } else if (fNDim == 3 || fNDim > 4) {
      // these are used in the RNN layers

      // this seems to work for 3-d tensors with row-major layout (the case of RNN tensors);
      // the RNN wants 3-d tensors, but it does not work for 2-d tensors
      std::vector<int> shape(fShape.begin(), fShape.end());
      std::vector<int> strides(fStrides.begin(), fStrides.end());
      auto status = cudnnSetTensorNdDescriptor(fTensorDescriptor->fCudnnDesc, fDataType, (int)fNDim, shape.data(),
                                               strides.data());
      assert(status == CUDNN_STATUS_SUCCESS);
      CUDNNCHECK(status);
   }

#ifndef NDEBUG
   // consistency check (debug builds only) between the cudnn descriptor size and fSize
   size_t tensorSize;
   CUDNNCHECK(cudnnGetTensorSizeInBytes(fTensorDescriptor->fCudnnDesc, &tensorSize));
   assert(fSize == tensorSize / sizeof(AFloat));

   // int n, c, h, w = 0;
   // int s1, s2, s3, s4 = 0;
   // cudnnDataType_t dataType;
   // cudnnGetTensor4dDescriptor(fTensorDescriptor, &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
   // std::vector<size_t> shape_input = {n, c, h, w};
   // assert(shape_input == GetShape());

#endif


}
#else // case where ROOT does not have cuDNN (provide dummy implementations)
//____________________________________________________________________________
template <typename AFloat>
TCudaTensor<AFloat>::~TCudaTensor()
{}
//____________________________________________________________________________
template <typename AFloat>
void TCudaTensor<AFloat>::InitializeCuda()
{}
//____________________________________________________________________________
template<typename AFloat>
void TCudaTensor<AFloat>::SetTensorDescriptor()
{}

#endif

//____________________________________________________________________________
template<typename AFloat>
void TCudaTensor<AFloat>::InitializeCurandStates()
{
   // dim3 blockDims = TDevice::BlockDims2D();
   // dim3 gridDims  = TDevice::GridDims2D(*this);
   // CurandInitializationKernel<<<gridDims, blockDims>>>(time(nullptr), fCurandStates);
}

template<typename AFloat>
void TCudaTensor<AFloat>::Print(const char * name, bool truncate) const
{
   //TCudaBuffer<AFloat> hostBuffer(fSize);
   //fElementBuffer.CopyTo(hostBuffer);
#if 0
   AFloat hostBuffer[fSize];

   cudaMemcpy(hostBuffer, fElementBuffer, fSize * sizeof(AFloat),
              cudaMemcpyDeviceToHost);

   for (size_t i = 0; i < fSize; i++) std::cout << hostBuffer[i] << " ";
#endif
   PrintShape(name);
   size_t n = fSize;
   if (n > 10 && truncate) n = 10;
   std::cout << "Data : { ";
   // read the elements one by one through a device reference (slow, but fine for debugging)
   for (size_t i = 0; i < n; ++i) {
      AFloat * elementPointer = fElementBuffer + i;
      std::cout << AFloat(TCudaDeviceReference<AFloat>(elementPointer));
      if (i < n - 1) std::cout << " , ";
   }
   if (n < fSize) std::cout << ", ... ";
   std::cout << "}" << std::endl;
}
template<typename AFloat>
void TCudaTensor<AFloat>::PrintShape(const char * name) const
{
   std::string memlayout = (GetLayout() == MemoryLayout::RowMajor) ? "RowMajor" : "ColMajor";
   if (fNDim == 0) { // guard against an empty (default-constructed) tensor
      std::cout << name << " shape : { } " << " Layout : " << memlayout << std::endl;
      return;
   }
   std::cout << name << " shape : { ";
   for (size_t i = 0; i < fNDim - 1; ++i)
      std::cout << fShape[i] << " , ";
   std::cout << fShape.back() << " } " << " Layout : " << memlayout << std::endl;
}
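// Example output (illustrative), for a row-major {32, 3, 28, 28} tensor:
//    t.PrintShape("tensor");
//    // prints: tensor shape : { 32 , 3 , 28 , 28 }  Layout : RowMajor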
#if 0
// Conversion to RTensor
//____________________________________________________________________________
template<typename AFloat>
TCudaTensor<AFloat>::operator Experimental::RTensor<AFloat>() const
{
   std::vector<size_t> shape(fShape.begin(), fShape.end());

   Experimental::RTensor<AFloat> hostTensor(shape);

   // copy the device data into a temporary host buffer, then into the RTensor
   AFloat * buffer = new AFloat[fSize];
   cudaMemcpy(buffer, fElementBuffer, fSize * sizeof(AFloat),
              cudaMemcpyDeviceToHost);

   for (size_t j = 0; j < fSize; j++) {
      hostTensor.GetData()[j] = buffer[j];
   }

   delete[] buffer;
   return hostTensor;
}
#endif
// Explicit Instantiations.

template class TCudaTensor<float>;
template class TCudaTensor<double>;

} // namespace DNN
} // namespace TMVA