19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDATENSOR
36#define CUDNNCHECK(ans) {cudnnError((ans), __FILE__, __LINE__); }
45namespace Experimental {
63inline void cudnnError(cudnnStatus_t status,
const char *
file,
int line,
bool abort=
true)
65 if (status != CUDNN_STATUS_SUCCESS) {
66 fprintf(stderr,
"CUDNN Error: %s %s %d\n", cudnnGetErrorString(status),
file,
line);
82template<
typename AFloat>
96 cudnnTensorDescriptor_t fCudnnDesc;
99 static std::vector<cudnnHandle_t> fCudnnHandle;
101 static cudnnDataType_t fDataType;
138 const std::vector<size_t> & shape,
140 int deviceIndx = 0,
int streamIndx = 0);
142 const std::vector<size_t> & shape,
144 int deviceIndx = 0,
int streamIndx = 0);
147 int deviceIndx = 0,
int streamIndx = 0);
149 TCudaTensor(
size_t bsize,
size_t csize,
size_t hwsize,
MemoryLayout memlayout = MemoryLayout::ColumnMajor,
int deviceIndx = 0,
int streamIndx = 0) :
151 deviceIndx, streamIndx)
154 TCudaTensor(
size_t bsize,
size_t csize,
size_t hsize,
size_t wsize,
MemoryLayout memlayout = MemoryLayout::ColumnMajor,
int deviceIndx = 0,
int streamIndx = 0) :
156 TCudaTensor( {bsize, csize, hsize, wsize}, memlayout, deviceIndx, streamIndx)
158 if (memlayout == MemoryLayout::ColumnMajor)
174 TCudaTensor( buffer, {
n,
m}, MemoryLayout::ColumnMajor ,0,0) {}
208 const cudnnHandle_t & GetCudnnHandle()
const {
return fCudnnHandle[
fStreamIndx];}
209 const cudnnTensorDescriptor_t & GetTensorDescriptor()
const {
return fTensorDescriptor->fCudnnDesc;}
210 static cudnnDataType_t
GetDataType() {
return fDataType; }
225 std::unique_ptr<AFloat[]> hostBufferThis(
new AFloat[
fSize]);
226 std::unique_ptr<AFloat[]> hostBufferOther(
new AFloat[
fSize]);
228 cudaMemcpyDeviceToHost);
230 cudaMemcpyDeviceToHost);
232 for (
size_t i = 0; i <
fSize; i++) {
233 if (hostBufferThis[i] != hostBufferOther[i])
return false;
238 bool isEqual (
const AFloat * hostBufferOther,
size_t otherSize) {
239 if (
fSize != otherSize)
return false;
242 std::unique_ptr<AFloat[]> hostBufferThis(
new AFloat[
fSize]);
244 cudaMemcpyDeviceToHost);
246 for (
size_t i = 0; i <
fSize; i++) {
247 if (hostBufferThis[i] != hostBufferOther[i])
return false;
253 void Print(
const char *
name =
"Tensor",
bool truncate =
false)
const;
280 if (
fNDim == 2)
return 1;
306 if (
GetLayout() == MemoryLayout::ColumnMajor &&
312 bool caseNM11 =
true;
313 for (
size_t i = 2; i <
fNDim; ++i) caseNM11 &=
fShape[i] == 1;
315 return (
GetLayout() == MemoryLayout::ColumnMajor ) ?
319 bool case11NM =
true;
320 for (
size_t i = 0; i <
fNDim-2; ++i) case11NM &=
fShape[i] == 1;
322 return (
GetLayout() == MemoryLayout::ColumnMajor ) ?
334 return At(i).GetMatrix();
340 bool rowmajorLayout);
370 size_t buffsize = (
GetLayout() == MemoryLayout::RowMajor) ?
373 size_t offset = i * buffsize;
387 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?
388 i * ncols + j : j * nrows + i;
400 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?
415 size_t offset = (
GetLayout() == MemoryLayout::RowMajor) ?
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
TCudaTensor< AFloat > At(size_t i) const
const AFloat * GetDataPointerAt(size_t i) const
void SetTensorDescriptor()
const Shape_t & GetShape() const
TCudaTensor(const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
AFloat * GetDataPointer()
std::vector< size_t > Shape_t
TCudaTensor(const TMatrixT< AFloat > &m, size_t dim=2)
const AFloat * GetData() const
size_t GetDimAt(size_t i) const
static std::vector< int > fInstances
For each GPU device keep the CUDA streams in which tensors are used.
Shape_t fStrides
Strides between tensor dimensions (always assume dense, non overlapping tensor)
int fDevice
Device associated with current tensor instance.
bool isEqual(TCudaTensor< AFloat > &other)
TCudaTensor & operator=(TCudaTensor &&)=default
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
TCudaMatrix< AFloat > operator[](size_t i) const
TCudaTensor(size_t bsize, size_t csize, size_t hsize, size_t wsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
size_t fNDim
Dimension of the tensor (first dimension is the batch size, second is the no. channels)
TCudaTensor & operator=(const TCudaTensor &)=default
cudaStream_t GetComputeStream() const
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
MemoryLayout GetLayout() const
TCudaTensor(const TCudaMatrix< AFloat > &m, size_t dim=2)
TCudaDeviceBuffer< AFloat > & GetDeviceBuffer()
TCudaTensor(TCudaTensor &&)=default
TCudaTensor(size_t bsize, size_t csize, size_t hwsize, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
TCudaTensor(const AFloat *data, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
TCudaTensor(const TCudaTensor &)=default
void InitializeCurandStates()
Shape_t fShape
The shape vector (size of dimensions) needs to be ordered as no.
bool isEqual(const AFloat *hostBufferOther, size_t otherSize)
AFloat * GetDataPointerAt(size_t i)
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, size_t n, size_t m)
void PrintShape(const char *name="Tensor") const
TCudaTensor< AFloat > Reshape(const Shape_t &newShape) const
size_t fSize
No. of elements.
TCudaMatrix< AFloat > GetMatrix() const
TCudaTensor(size_t n, size_t m, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
TCudaTensor(TCudaDeviceBuffer< AFloat > buffer, const std::vector< size_t > &shape, MemoryLayout memlayout=MemoryLayout::ColumnMajor, int deviceIndx=0, int streamIndx=0)
const AFloat * GetDataPointer() const
const Shape_t & GetStrides() const
void ReshapeInPlace(const Shape_t &newShape)
TCudaDeviceBuffer< AFloat > fElementBuffer
size_t GetFirstStride() const
const TCudaDeviceBuffer< AFloat > & GetDeviceBuffer() const
MemoryLayout fMemoryLayout
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k, size_t l) const
void SetComputeStream(cudaStream_t stream)
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j, size_t k) const
size_t GetFirstSize() const
void SetConstVal(const AFloat constVal)
int fStreamIndx
Cuda stream associated with current instance.
void Print(const char *name="Tensor", bool truncate=false) const
static std::vector< std::size_t > ComputeStridesFromShape(const std::vector< std::size_t > &shape, bool rowmajorLayout)
std::shared_ptr< TensorDescriptor > fTensorDescriptor
BinData::ErrorType GetDataType(const TGraph *gr, DataOptions &fitOpt)
MemoryLayout
Memory layout type (copy from RTensor.hxx)
create variable transformations