// Definition of the static cuDNN handle table shared by all TCudaTensor<AFloat>
// objects. It is constructed with a single slot; elsewhere in this file it is
// indexed by fStreamIndx, resized when a larger stream index is seen, and its
// entries are filled by cudnnCreate and released by cudnnDestroy.
30template<
typename AFloat>
31std::vector<cudnnHandle_t> TCudaTensor<AFloat>::fCudnnHandle(1);
// Definition of the static cuDNN element type used when setting tensor
// descriptors. It starts as CUDNN_DATA_FLOAT; elsewhere in this file it is
// reassigned to CUDNN_DATA_DOUBLE when AFloat is double and to
// CUDNN_DATA_FLOAT when AFloat is float.
32template<
typename AFloat>
33cudnnDataType_t TCudaTensor<AFloat>::fDataType = CUDNN_DATA_FLOAT;
36template<
typename AFloat>
42template<
typename AFloat>
46 const auto size = shape.size();
47 std::vector<std::size_t> strides(
size);
49 for (std::size_t i = 0; i <
size; i++) {
51 strides[
size - 1 - i] = 1;
53 strides[
size - 1 - i] = strides[
size - 1 - i + 1] * shape[
size - 1 - i + 1];
57 for (std::size_t i = 0; i <
size; i++) {
61 strides[i] = strides[i - 1] * shape[i - 1];
70template<
typename AFloat>
72 :
fShape(), fStrides(), fNDim(0),
fSize(0), fElementBuffer(), fStreamIndx(0), fTensorDescriptor(nullptr)
79template<
typename AFloat>
82 int device,
int streamIndx)
83 :
fShape(shape), fStrides(shape.
size()), fNDim(shape.
size()), fDevice(device), fStreamIndx(streamIndx),
84 fTensorDescriptor(nullptr), fMemoryLayout(layout)
98template<
typename AFloat>
101 int device,
int streamIndx)
114 cudaMemcpyHostToDevice);
121template<
typename AFloat>
123 const std::vector<size_t> & shape,
125 int device,
int streamIndx)
126 : fNDim(shape.
size()), fElementBuffer(buffer),
fShape(shape), fStrides( shape.
size()), fDevice(device),
127 fStreamIndx(streamIndx), fTensorDescriptor(nullptr), fMemoryLayout(layout)
156template <
typename AFloat>
165 fStrides.insert(fStrides.end(),dim-2,
fSize);
168 SetTensorDescriptor();
174template<
typename AFloat>
178 if (GetLayout() == MemoryLayout::ColumnMajor &&
179 (fNDim == 2 || (fNDim == 3 && GetFirstSize() == 1)) ) {
190 if (GetLayout() == MemoryLayout::RowMajor) {
195 cudaMemcpyDeviceToHost);
203 cudaMemcpyDeviceToHost);
204 return hostMatrix.
T();
209template <
typename AFloat>
212 if (fTensorDescriptor && fTensorDescriptor.use_count() == 1 ) {
217 CUDNNCHECK(cudnnDestroyTensorDescriptor(fTensorDescriptor->fCudnnDesc));
219 fInstances[fStreamIndx]--;
222 if (fInstances[fStreamIndx] <= 0) {
223 std::cout <<
"All Cuda tensors are -released - destroy cudnn handle " << fInstances[fStreamIndx] << std::endl;
224 CUDNNCHECK(cudnnDestroy(fCudnnHandle[fStreamIndx]));
232template <
typename AFloat>
236 if (!fTensorDescriptor &&
fSize > 0 && fNDim >= 2) {
246 if (fInstances.size() - 1 < fStreamIndx) {
248 fInstances.resize(2 * fStreamIndx + 1, 0);
249 fCudnnHandle.resize(2 * fStreamIndx + 1,
nullptr);
251 if (fInstances[fStreamIndx] == 0) {
252 std::cout <<
"TCudaTensor::create cudnn handle ! " << std::endl;
253 CUDNNCHECK(cudnnCreate(&fCudnnHandle[fStreamIndx]));
270 if (std::is_same<AFloat, double>::value) {
271 fDataType = CUDNN_DATA_DOUBLE;
272 }
else if (std::is_same<AFloat, float>::value) {
273 fDataType = CUDNN_DATA_FLOAT;
277 fTensorDescriptor = std::make_shared<TensorDescriptor>();
279 CUDNNCHECK(cudnnCreateTensorDescriptor(&(fTensorDescriptor->fCudnnDesc)));
282 fInstances[fStreamIndx]++;
285 SetTensorDescriptor();
288template<
typename AFloat>
290 if (!fTensorDescriptor)
return;
291 if (
fSize == 0)
return;
296 if (fNDim == 4 || fNDim > 1 && fMemoryLayout == MemoryLayout::ColumnMajor || fNDim == 2) {
300 if (fNDim < 4 && fNDim > 1) {
302 if (fMemoryLayout == MemoryLayout::RowMajor)
303 shape.insert(shape.end(), 4 - fNDim, 1);
305 shape.insert(shape.begin(), 4 - fNDim, 1);
308 if (fMemoryLayout == MemoryLayout::RowMajor) {
309 auto status = cudnnSetTensor4dDescriptor(fTensorDescriptor->fCudnnDesc,
316 assert(status == CUDNN_STATUS_SUCCESS);
319 CUDNNCHECK(cudnnSetTensor4dDescriptor(fTensorDescriptor->fCudnnDesc,
330 }
else if (fNDim >2 || fNDim > 4) {
336 std::vector<int> strides(fStrides.begin(), fStrides.end());
337 auto status = cudnnSetTensorNdDescriptor(fTensorDescriptor->fCudnnDesc, fDataType, (
int)fNDim, shape.data(),
339 assert(status == CUDNN_STATUS_SUCCESS);
345 CUDNNCHECK(cudnnGetTensorSizeInBytes(fTensorDescriptor->fCudnnDesc, &tensorSize));
346 assert(
fSize == tensorSize/
sizeof(AFloat));
361template <
typename AFloat>
365template <
typename AFloat>
369template<
typename AFloat>
376template<
typename AFloat>
384template<
typename AFloat>
390 AFloat hostBuffer[
fSize];
392 cudaMemcpy(hostBuffer, fElementBuffer,
fSize *
sizeof(AFloat),
393 cudaMemcpyDeviceToHost);
395 for (
size_t i = 0; i <
fSize; i++) std::cout << hostBuffer[i] <<
" ";
399 if (
n > 10 && truncate)
n = 10;
400 std::cout <<
"Data : { ";
401 for (
size_t i = 0; i <
n; ++i ) {
402 AFloat * elementPointer = fElementBuffer + i;
403 std::cout << AFloat( TCudaDeviceReference<AFloat>(elementPointer) );
404 if (i <
n-1) std::cout <<
" , ";
406 if (
n <
fSize) std::cout <<
"............ } ";
407 std::cout <<
" } " << std::endl;
409template<
typename AFloat>
412 std::string memlayout = (GetLayout() == MemoryLayout::RowMajor) ?
"RowMajor" :
"ColMajor";
413 std::cout <<
name <<
" shape : { ";
414 for (
size_t i = 0; i < fNDim-1; ++i )
415 std::cout <<
fShape[i] <<
" , ";
416 std::cout <<
fShape.back() <<
" } " <<
" Layout : " << memlayout << std::endl;
421template<
typename AFloat>
424 std::vector<size_t> shape(fNDims, fNDims + fDim)
428 AFloat * buffer =
new AFloat[
fSize];
429 cudaMemcpy(buffer, fElementBuffer,
fSize *
sizeof(AFloat),
430 cudaMemcpyDeviceToHost);
433 for (
int j = 0; j <
fSize; j++) {
434 hostTensor.GetData()[j] =
static_cast<AFloat
>(buffer[j]);
// Explicit instantiations: emit the TCudaTensor template members defined in
// this translation unit for the float and double element types.
444template class TCudaTensor<float>;
445template class TCudaTensor<double>;
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
void SetTensorDescriptor()
Shape_t fStrides
Strides between tensor dimensions (always assume dense, non overlapping tensor)
void InitializeCuda()
Initializes all shared device resources and makes sure that a sufficient number of curand states are ...
static std::vector< int > fInstances
For each GPU device keep the CUDA streams in which tensors are used.
void InitializeCurandStates()
Shape_t fShape
The shape vector (size of dimensions) needs to be ordered as no.
void PrintShape(const char *name="Tensor") const
size_t fSize
No. of elements.
static std::vector< std::size_t > ComputeStridesFromShape(const std::vector< std::size_t > &shape, bool rowmajorLayout)
This information is needed for the multi-dimensional indexing.
TCudaDeviceBuffer< AFloat > fElementBuffer
void Print(const char *name="Tensor", bool truncate=false) const
RTensor is a container with contiguous memory and shape information.
TMatrixT< Element > & T()
const Element * GetMatrixArray() const override
MemoryLayout
Memory layout type (copy from RTensor.hxx)
create variable transformations