Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
CudaTensor.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
21
22
23#include <vector>
24#include <cstring>
25#include <cassert>
26#include <iostream>
27
28#include "CudaMatrix.h"
29#include "TMatrixT.h"
30#include "CudaBuffers.h"
31
32//#include "TMVA/RTensor.hxx"
33
34#ifdef R__HAS_CUDNN
35#include "cudnn.h"
36#define CUDNNCHECK(ans) {cudnnError((ans), __FILE__, __LINE__); }
37#endif
38
39namespace TMVA {
40
41
42
43#ifndef TMVA_RTENSOR
44
45namespace Experimental {
46/// Memory layout type (copy from RTensor.hxx)
47enum class MemoryLayout : uint8_t {
48 RowMajor = 0x01,
49 ColumnMajor = 0x02
50};
51}
52#endif
53
54namespace DNN {
55
57
58#ifdef R__HAS_CUDNN
59/**
60 * Function to handle the status output of cuDNN function calls. See also
61 * CUDACHECK in CudaMatrix.h.
62 */
63inline void cudnnError(cudnnStatus_t status, const char *file, int line, bool abort=true)
64{
65 if (status != CUDNN_STATUS_SUCCESS) {
66 fprintf(stderr, "CUDNN Error: %s %s %d\n", cudnnGetErrorString(status), file, line);
67 if (abort)
68 exit(status);
69 }
70}
71#endif
72//____________________________________________________________________________
73//
74// Cuda Tensor
75//____________________________________________________________________________
76
77/** TCudaTensor Class
78 *
79 * The TCudaTensor class extends the TCudaMatrix class for dimensions > 2.
80 *
81 */
82template<typename AFloat>
84{
85public:
86
87 using Shape_t = std::vector<size_t>;
89 using Scalar_t = AFloat;
90
91
92private:
93
94#ifdef R__HAS_CUDNN
95 struct TensorDescriptor {
96 cudnnTensorDescriptor_t fCudnnDesc;
97 };
98
99 static std::vector<cudnnHandle_t> fCudnnHandle; ///< Holds the cuddn library context (one for every CUDA stream)
100
101 static cudnnDataType_t fDataType; ///< Cudnn datatype used for the tensor
102#else
104 };
105#endif
106
107 /** For each GPU device keep the CUDA streams in which tensors are used.
108 * Instances belonging to the same stream on the same deviceshare a
109 * cudnn library handel to keep cudnn contexts seperated */
110 //static std::vector<std::vector<int> > fInstances;
111 static std::vector<int> fInstances;
112
113 /** The shape vector (size of dimensions) needs to be ordered as no. channels,
114 * image dimensions.
115 */
116 Shape_t fShape; ///< spatial subdimensions
117 Shape_t fStrides; ///< Strides between tensor dimensions (always assume dense, non overlapping tensor)
118 size_t fNDim; ///< Dimension of the tensor (first dimension is the batch size, second is the no. channels)
119 size_t fSize; ///< No. of elements
120 int fDevice; ///< Device associated with current tensor instance
121 int fStreamIndx; ///< Cuda stream associated with current instance
122
123 std::shared_ptr<TensorDescriptor> fTensorDescriptor;
125
127
128
129
130public:
131
132
133 //static AFloat * GetOnes() {return fOnes;}
134
136
137 TCudaTensor(const AFloat * data,
138 const std::vector<size_t> & shape,
139 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
140 int deviceIndx = 0, int streamIndx = 0);
142 const std::vector<size_t> & shape,
143 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
144 int deviceIndx = 0, int streamIndx = 0);
145 TCudaTensor(const std::vector<size_t> & shape,
146 MemoryLayout memlayout = MemoryLayout::ColumnMajor,
147 int deviceIndx = 0, int streamIndx = 0);
148
149 TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
150 TCudaTensor( (memlayout == MemoryLayout::ColumnMajor) ? Shape_t({ csize, hwsize, bsize}) : Shape_t({ bsize, csize, hwsize }) , memlayout,
151 deviceIndx, streamIndx)
152 {}
153
154 TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
155
156 TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
157 {
158 if (memlayout == MemoryLayout::ColumnMajor)
159 *this = TCudaTensor(fElementBuffer, { csize, hsize, wsize, bsize}, memlayout, deviceIndx, streamIndx);
160 }
161
162 TCudaTensor(size_t n, size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor, int deviceIndx = 0, int streamIndx = 0) :
163 // TCudaTensor( {n,m}, memlayout, deviceIndx, streamIndx) :
164 TCudaTensor( {n, m}, memlayout, deviceIndx, streamIndx)
165 {}
166
167 TCudaTensor(const TCudaMatrix<AFloat> & m, size_t dim = 2);
168
169 TCudaTensor(const TMatrixT<AFloat> & m, size_t dim = 2) :
170 TCudaTensor( TCudaMatrix<AFloat>(m), dim)
171 {}
172
173 TCudaTensor(TCudaDeviceBuffer<AFloat> buffer, size_t n, size_t m) :
174 TCudaTensor( buffer, {n,m}, MemoryLayout::ColumnMajor ,0,0) {}
175
176 TCudaTensor(const TCudaTensor &) = default;
178 TCudaTensor & operator=(const TCudaTensor &) = default;
181
182 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
183 operator TMatrixT<AFloat>() const;
184
185
187
188 const Shape_t & GetShape() const {return fShape;}
189 const Shape_t & GetStrides() const {return fStrides;}
190 size_t GetDimAt(size_t i) const {return fShape[i];}
191 size_t GetNDim() const {return fNDim;}
192 size_t GetSize() const {return fSize;}
193
194 const AFloat * GetDataPointer() const {return fElementBuffer;}
195 AFloat * GetDataPointer() {return fElementBuffer;}
196 const AFloat * GetData() const {return fElementBuffer;}
197 AFloat * GetData() {return fElementBuffer;}
198
199 const AFloat * GetDataPointerAt(size_t i ) const {
200 return (const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
201 AFloat * GetDataPointerAt(size_t i ) {return fElementBuffer.GetSubBuffer(i * GetFirstStride(), GetFirstStride() ); }
202
203
206
207#ifdef R__HAS_CUDNN
208 const cudnnHandle_t & GetCudnnHandle() const {return fCudnnHandle[fStreamIndx];}
209 const cudnnTensorDescriptor_t & GetTensorDescriptor() const {return fTensorDescriptor->fCudnnDesc;}
210 static cudnnDataType_t GetDataType() { return fDataType; }
211#endif
212
213 cudaStream_t GetComputeStream() const {
214 return fElementBuffer.GetComputeStream();
215 }
216 void SetComputeStream(cudaStream_t stream) {
217 fElementBuffer.SetComputeStream(stream);
218 }
219
221
222 if (fSize != other.GetSize()) return false;
223
224
225 std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
226 std::unique_ptr<AFloat[]> hostBufferOther(new AFloat[fSize]);
227 cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
228 cudaMemcpyDeviceToHost);
229 cudaMemcpy(hostBufferOther.get(), other.GetDeviceBuffer(), fSize * sizeof(AFloat),
230 cudaMemcpyDeviceToHost);
231
232 for (size_t i = 0; i < fSize; i++) {
233 if (hostBufferThis[i] != hostBufferOther[i]) return false;
234 }
235 return true;
236 }
237
238 bool isEqual (const AFloat * hostBufferOther, size_t otherSize) {
239 if (fSize != otherSize) return false;
240
241
242 std::unique_ptr<AFloat[]> hostBufferThis(new AFloat[fSize]);
243 cudaMemcpy(hostBufferThis.get(), fElementBuffer, fSize * sizeof(AFloat),
244 cudaMemcpyDeviceToHost);
245
246 for (size_t i = 0; i < fSize; i++) {
247 if (hostBufferThis[i] != hostBufferOther[i]) return false;
248 }
249
250 return true;
251 }
252
253 void Print(const char * name = "Tensor", bool truncate = false) const;
254
255 void PrintShape(const char * name="Tensor") const;
256
257 void Zero() {
258 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetSize());
259 }
260
261 void SetConstVal(const AFloat constVal) {
262 TCudaHostBuffer<AFloat> hostBuffer(fSize);
263 hostBuffer.SetConstVal(constVal);
264 fElementBuffer.CopyFrom(hostBuffer);
265 }
266
267 // have this tensor representatrions
268 // 2-dimensional tensors : NW where N is batch size W is the feature size . Memory layout should be columnwise in this case
269 // 3 -dimensional tensor : represnetation is NHWC , tensor should be columnwise storage
270 // 4 -dimensional tensor : representation is NCHW ande tensor should be row wose
271 // a rowmajor tensor with dimension less than trhee should not exist but in case consider as a N, (CHW) for 2d, N, C, (HW) for 3d
272 // a columnmajor tensor for dimension >=4 should not exist but in case consider as a N,H,W,C (i.e. with shape C,W,H,N)
273
274 size_t GetFirstSize() const {
275 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fShape.front(); } // CM order
276 size_t GetFirstStride() const {
277 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fStrides.front(); } // CM order
278
279 size_t GetCSize() const {
280 if (fNDim == 2) return 1;
281 return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.front() : fShape[1] ; //assume NHWC
282 }
283 size_t GetHSize() const {
284 if (fNDim == 2) return fShape[0];
285 if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[0] : fShape[1] ;// same as C
286 if (fNDim >= 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[2] : fShape[2] ;
287 return 0;
288 }
289 size_t GetWSize() const {
290 if (fNDim == 2) return fShape[1];
291 if (fNDim == 3) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[1] : fShape[2] ;
292 if (fNDim == 4) return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape[3] : fShape[3] ;
293 return 0;
294 }
295
296 // for backward compatibility (assume column-major
297 // for backward compatibility : for CM tensor (n1,n2,n3,n4) -> ( n1*n2*n3, n4)
298 // for RM tensor (n1,n2,n3,n4) -> ( n2*n3*n4, n1 ) ???
299 size_t GetNrows() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fStrides.back() : fShape.front();}
300 size_t GetNcols() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? fShape.back() : fStrides.front(); }
301
302
303 // Matrix conversion for tensors of shape 2
305 // remember TCudaMatrix is always column-major
306 if ( GetLayout() == MemoryLayout::ColumnMajor &&
307 (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1) ) )
309
310
311 //case of N,M,1,1,..
312 bool caseNM11 = true;
313 for (size_t i = 2; i < fNDim; ++i) caseNM11 &= fShape[i] == 1;
314 if (caseNM11) {
315 return (GetLayout() == MemoryLayout::ColumnMajor ) ?
318 }
319 bool case11NM = true;
320 for (size_t i = 0; i < fNDim-2; ++i) case11NM &= fShape[i] == 1;
321 if (case11NM) {
322 return (GetLayout() == MemoryLayout::ColumnMajor ) ?
325 }
326
327 assert(false);
328 return TCudaMatrix<AFloat>();
329 }
330
331 // for backeard compatibility with old tensor
333 //assert(GetLayout() == MemoryLayout::ColumnMajor );
334 return At(i).GetMatrix();
335 }
336
337
338
339 static inline std::vector<std::size_t> ComputeStridesFromShape(const std::vector<std::size_t> &shape,
340 bool rowmajorLayout);
341
342 void ReshapeInPlace(const Shape_t & newShape) {
343 fShape = newShape;
344 fStrides = ComputeStridesFromShape(fShape, fMemoryLayout == MemoryLayout::RowMajor);
345 fNDim = fShape.size();
346 // in principle reshape should not change tensor size
347 size_t newSize = (fMemoryLayout == MemoryLayout::RowMajor) ? fStrides.front() * fShape.front() : fStrides.back() * fShape.back();
348 R__ASSERT(newSize <= fSize);
349 fSize = newSize;
350 // reset the descritor for Cudnn
352 }
353
354 TCudaTensor<AFloat> Reshape(const Shape_t & newShape) const {
355 TCudaTensor<AFloat> tmp(this->GetDeviceBuffer(), newShape, this->GetLayout(), fDevice, fStreamIndx);
356 return tmp;
357 }
358
360
361 // return slice of tensor
362 // return slices in the first dimension (if row wise) or last dimension if colun wise
363 // so single event slides
364 TCudaTensor<AFloat> At(size_t i) const {
365 Shape_t sliced_shape = (GetLayout() == MemoryLayout::RowMajor)
366 ? Shape_t(fShape.begin() + 1, fShape.end()) :
367 Shape_t(fShape.begin(), fShape.end() - 1);
368
369
370 size_t buffsize = (GetLayout() == MemoryLayout::RowMajor) ?
371 fStrides.front() : fStrides.back();
372
373 size_t offset = i * buffsize;
374
375 return TCudaTensor<AFloat>((const_cast<TCudaDeviceBuffer<AFloat>&>(fElementBuffer)).GetSubBuffer(offset, buffsize), sliced_shape, GetLayout());
376 }
377
378
379 // element access ( for debugging)
380 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const
381 {
382 // like this works also for multi-dim tensors
383 // and consider the tensor as a multidim one
384 size_t nrows = GetNrows();
385 size_t ncols = GetNcols();
386
387 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
388 i * ncols + j : j * nrows + i;
389
390 AFloat * elementPointer = fElementBuffer + offset;
391 return TCudaDeviceReference<AFloat>(elementPointer);
392 }
393 // element access ( for debugging)
394 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k) const
395 {
396 // k is B, i is C, j is HW :
397 assert( fNDim >= 3); // || ( k==0 && fNDim == 2 ) );
398 //note for larger dimension k is all other dims collapsed !!!
399
400 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
401 i * fStrides[0] + j * fStrides[1] + k :
402 i * fStrides[2] + k * fStrides[1] + j;
403
404 AFloat * elementPointer = fElementBuffer + offset;
405
406 return TCudaDeviceReference<AFloat>(elementPointer);
407 }
408
409 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j, size_t k, size_t l) const
410 {
411 // for rowsise
412 //assert(GetLayout() == MemoryLayout::RowMajor);
413 assert( fNDim == 4); // || ( k==0 && fNDim == 2 ) );
414
415 size_t offset = (GetLayout() == MemoryLayout::RowMajor) ?
416 i * fStrides[0] + j * fStrides[1] + k * fStrides[2] + l:
417 l * fStrides[3] + k * fStrides[2] + j * fStrides[1] + i;
418
419 AFloat * elementPointer = fElementBuffer + offset;
420
421 return TCudaDeviceReference<AFloat>(elementPointer);
422 }
423
424private:
425
426 /** Initializes all shared devices resource and makes sure that a sufficient
427 * number of curand states are allocated on the device and initialized as
428 * well as that the one-vector for the summation over columns has the right
429 * size. */
432
433};
434
435
436
437
438} // namespace DNN
439} // namespace TMVA
440
441#endif
uint8_t
#define R__ASSERT(e)
Definition TError.h:118
char name[80]
Definition TGX11.cxx:110
TCudaDeviceReference.
Definition CudaMatrix.h:65
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
TCudaMatrix Class.
Definition CudaMatrix.h:109
TCudaTensor Class.
Definition CudaTensor.h:84
TCudaTensor< AFloat > At(size_t i) const
Definition CudaTensor.h:364
const AFloat * GetDataPointerAt(size_t i) const
Definition CudaTensor.h:199
const Shape_t & GetShape() const
Definition CudaTensor.h:188
TCudaTensor(const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
size_t GetWSize() const
Definition CudaTensor.h:289
AFloat * GetDataPointer()
Definition CudaTensor.h:195
std::vector< size_t > Shape_t
Definition CudaTensor.h:87
TCudaTensor(const TMatrixT< AFloat > &m, size_t dim=2)
Definition CudaTensor.h:169
const AFloat * GetData() const
Definition CudaTensor.h:196
size_t GetDimAt(size_t i) const
Definition CudaTensor.h:190
static std::vector< int > fInstances
For each GPU device keep the CUDA streams in which tensors are used.
Definition CudaTensor.h:111
Shape_t fStrides
Strides between tensor dimensions (always assume dense, non overlapping tensor)
Definition CudaTensor.h:117
int fDevice
Device associated with current tensor instance.
Definition CudaTensor.h:120
bool isEqual(TCudaTensor< AFloat > &other)
Definition CudaTensor.h:220
TCudaTensor & operator=(TCudaTensor &&)=default
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Definition CudaTensor.h:380
TCudaMatrix< AFloat > operator[](size_t i) const
Definition CudaTensor.h:332
TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition CudaTensor.h:154
size_t GetNrows() const
Definition CudaTensor.h:299
size_t fNDim
Dimension of the tensor (first dimension is the batch size, second is the no. channels)
Definition CudaTensor.h:118
TCudaTensor & operator=(const TCudaTensor &)=default
cudaStream_t GetComputeStream() const
Definition CudaTensor.h:213
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
MemoryLayout GetLayout() const
Definition CudaTensor.h:186
TCudaTensor(const TCudaMatrix< AFloat > &m, size_t dim=2)
TCudaDeviceBuffer< AFloat > & GetDeviceBuffer()
Definition CudaTensor.h:205
size_t GetNcols() const
Definition CudaTensor.h:300
TCudaTensor(TCudaTensor &&)=default
TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition CudaTensor.h:149
TCudaTensor(const AFloat *data, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
TCudaTensor(const TCudaTensor &)=default
Shape_t fShape
The shape vector (size of dimensions) needs to be ordered as no.
Definition CudaTensor.h:116
bool isEqual(const AFloat *hostBufferOther, size_t otherSize)
Definition CudaTensor.h:238
AFloat * GetDataPointerAt(size_t i)
Definition CudaTensor.h:201
size_t GetCSize() const
Definition CudaTensor.h:279
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, size_t n, size_t m)
Definition CudaTensor.h:173
void PrintShape(const char *name="Tensor") const
TCudaTensor< AFloat > Reshape(const Shape_t &newShape) const
Definition CudaTensor.h:354
size_t fSize
No. of elements.
Definition CudaTensor.h:119
TCudaMatrix< AFloat > GetMatrix() const
Definition CudaTensor.h:304
TCudaTensor(size_t n, size_t m, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
Definition CudaTensor.h:162
size_t GetNDim() const
Definition CudaTensor.h:191
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
const AFloat * GetDataPointer() const
Definition CudaTensor.h:194
const Shape_t & GetStrides() const
Definition CudaTensor.h:189
void ReshapeInPlace(const Shape_t &newShape)
Definition CudaTensor.h:342
size_t GetHSize() const
Definition CudaTensor.h:283
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition CudaTensor.h:124
size_t GetFirstStride() const
Definition CudaTensor.h:276
const TCudaDeviceBuffer< AFloat > & GetDeviceBuffer() const
Definition CudaTensor.h:204
MemoryLayout fMemoryLayout
Definition CudaTensor.h:126
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k, size_t l) const
Definition CudaTensor.h:409
void SetComputeStream(cudaStream_t stream)
Definition CudaTensor.h:216
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k) const
Definition CudaTensor.h:394
size_t GetFirstSize() const
Definition CudaTensor.h:274
void SetConstVal(const AFloat constVal)
Definition CudaTensor.h:261
size_t GetSize() const
Definition CudaTensor.h:192
int fStreamIndx
Cuda stream associated with current instance.
Definition CudaTensor.h:121
void Print(const char *name="Tensor", bool truncate=false) const
static std::vector< std::size_t > ComputeStridesFromShape(const std::vector< std::size_t > &shape, bool rowmajorLayout)
std::shared_ptr< TensorDescriptor > fTensorDescriptor
Definition CudaTensor.h:123
TMatrixT.
Definition TMatrixT.h:39
TLine * line
const Int_t n
Definition legend1.C:16
MemoryLayout
Memory layout type (copy from RTensor.hxx)
Definition CudaTensor.h:47
create variable transformations
Definition file.py:1
auto * m
Definition textangle.C:8
auto * l
Definition textangle.C:4