// Sum: reduces tensor A to a single element using the cuDNN add reduction
// (CUDNN_REDUCE_TENSOR_ADD) and copies that element back into a one-element
// host buffer.
// NOTE(review): this listing is an excerpt with lines omitted (the embedded
// line numbers skip), so the function braces, several call arguments and the
// return statement are not visible here.
// alpha, beta: their use is not visible in this excerpt; presumably the
// scaling factors handed to cudnnReduceTensor -- TODO confirm.
124template<
typename AFloat>
125AFloat TCudnn<AFloat>::Sum(
const TCudaTensor<AFloat> & A,
const AFloat alpha,
const AFloat beta)
127 cudnnHandle_t cudnnHandle = A.GetCudnnHandle();
// Pick the cuDNN data type that matches AFloat. cudnnDataType is left
// uninitialized when AFloat is neither double nor float.
128 cudnnDataType_t cudnnDataType;
129 if (std::is_same<AFloat, double>::value) { cudnnDataType = CUDNN_DATA_DOUBLE;}
130 else if (std::is_same<AFloat, float>::value) { cudnnDataType = CUDNN_DATA_FLOAT;}
// Output of the reduction: a tensor C of shape {1,1,1,1} constructed from a
// one-element host buffer.
133 TCudaHostBuffer<AFloat> hostBuffer (1);
134 const std::vector<size_t> shapeVec {1,1,1,1};
136 TCudaTensor<AFloat>
C (hostBuffer, shapeVec);
// Reduction descriptor: add operation, flattened indices, 32-bit index type.
139 cudnnReduceTensorDescriptor_t reduceTensorDescr;
140 CUDNNCHECK(cudnnCreateReduceTensorDescriptor(&reduceTensorDescr));
141 CUDNNCHECK(cudnnSetReduceTensorDescriptor(reduceTensorDescr,
142 CUDNN_REDUCE_TENSOR_ADD,
145 CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
147 CUDNN_32BIT_INDICES));
// Ask cuDNN how large the indices buffer must be for reducing A into C, then
// allocate it on the device.
150 size_t indiceSizeInBytes;
151 void* indices =
nullptr;
152 CUDNNCHECK(cudnnGetReductionIndicesSize(cudnnHandle,
154 A.GetTensorDescriptor(),
155 C.GetTensorDescriptor(),
156 &indiceSizeInBytes));
// NOTE(review): the cudaMalloc return code is not checked, unlike the
// CUDNNCHECK-wrapped calls around it. No matching cudaFree is visible in this
// excerpt -- confirm against the full file.
157 cudaMalloc(&indices, indiceSizeInBytes);
// Same two steps for the reduction workspace: query the size, then allocate.
160 size_t workspaceSizeInBytes;
161 void* workspace =
nullptr;
162 CUDNNCHECK(cudnnGetReductionWorkspaceSize(cudnnHandle,
164 A.GetTensorDescriptor(),
165 C.GetTensorDescriptor(),
166 &workspaceSizeInBytes));
// NOTE(review): unchecked cudaMalloc, and no cudaFree visible here either.
167 cudaMalloc(&workspace, workspaceSizeInBytes);
// Run the reduction; the last visible arguments are the descriptor and data
// pointer of C, which receives the result.
171 CUDNNCHECK(cudnnReduceTensor(cudnnHandle,
176 workspaceSizeInBytes,
178 A.GetTensorDescriptor(),
181 C.GetTensorDescriptor(),
182 C.GetDataPointer()));
// Copy the device buffer of C back into hostBuffer.
185 TCudaDeviceBuffer<AFloat>& resultDeviceBuffer =
C.GetDeviceBuffer();
186 resultDeviceBuffer.CopyTo(hostBuffer);
// Release the reduction descriptor created above.
190 CUDNNCHECK(cudnnDestroyReduceTensorDescriptor(reduceTensorDescr));
// ScaleAdd: combines tensor A into tensor B through cudnnAddTensor; the data
// pointer of B is the last argument of that call, so B is the tensor that is
// updated.
// NOTE(review): the parameter list is truncated in this excerpt (it ends on a
// trailing comma) and the scaling arguments of cudnnAddTensor are not visible;
// presumably a scale factor for A is passed -- TODO confirm.
241template<
typename AFloat>
242void TCudnn<AFloat>::ScaleAdd(TCudaTensor<AFloat> & B,
243 const TCudaTensor<AFloat> & A,
// A and B must have the same number of dimensions.
248 assert(B.GetShape().size() == A.GetShape().size());
// Per-dimension check: a dimension where the extents differ and the extent of
// A is not 1 is singled out. What happens in that case is not visible in this
// excerpt; presumably it is rejected because A cannot be broadcast there.
249 for (
size_t i = 0; i < B.GetShape().
size(); ++i) {
250 if (B.GetShape()[i] != A.GetShape()[i] ) {
251 if ( A.GetShape()[i]!=1) {
// The cuDNN handle is taken from A; descriptors of A and B describe the input
// and the updated tensor respectively.
259 CUDNNCHECK(cudnnAddTensor(A.GetCudnnHandle(),
261 A.GetTensorDescriptor(),
264 B.GetTensorDescriptor(),
265 B.GetDataPointer()));
// ConstAdd: judging by the name, presumably adds the constant beta to the
// elements of A -- TODO confirm; the rest of the body is not visible in this
// excerpt.
270template<
typename AFloat>
271void TCudnn<AFloat>::ConstAdd(TCudaTensor<AFloat> &A,
const AFloat beta)
// Constructs a second tensor C from A using the TCudaTensor copy constructor.
// Whether C shares or duplicates the device data of A depends on that
// constructor, which is not shown here.
274 TCudaTensor<AFloat>
C (A);
// ConstMult: calls cudnnScaleTensor with the cuDNN handle and tensor
// descriptor of A.
// NOTE(review): the remaining arguments are cut off in this excerpt;
// presumably the data pointer of A and the factor beta -- TODO confirm.
281template<
typename AFloat>
282void TCudnn<AFloat>::ConstMult(TCudaTensor<AFloat> &A,
const AFloat beta)
284 CUDNNCHECK(cudnnScaleTensor(A.GetCudnnHandle(),
285 A.GetTensorDescriptor(),
// SqrtElementWise: runs cudnnOpTensor with the CUDNN_OP_TENSOR_SQRT operation,
// using the descriptor of A for all three tensor slots and the data pointer of
// A as the final argument, so the result is written back into A.
// alpha, beta, gamma: their use is not visible in this excerpt; presumably the
// scaling factors of cudnnOpTensor -- TODO confirm.
306template<
typename AFloat>
307void TCudnn<AFloat>::SqrtElementWise(TCudaTensor<AFloat> &A,
const AFloat alpha,
const AFloat beta,
const AFloat gamma)
// Pick the cuDNN data type that matches AFloat. cudnnDataType is left
// uninitialized when AFloat is neither double nor float.
309 cudnnDataType_t cudnnDataType;
310 if (std::is_same<AFloat, double>::value) { cudnnDataType = CUDNN_DATA_DOUBLE;}
311 else if (std::is_same<AFloat, float>::value) { cudnnDataType = CUDNN_DATA_FLOAT;}
// Op descriptor: square-root operation with NaN propagation enabled.
314 cudnnOpTensorDescriptor_t opTensorDescr;
315 CUDNNCHECK(cudnnCreateOpTensorDescriptor(&opTensorDescr));
317 CUDNNCHECK(cudnnSetOpTensorDescriptor(opTensorDescr,
318 CUDNN_OP_TENSOR_SQRT,
320 CUDNN_PROPAGATE_NAN));
// Apply the op. Only the descriptor arguments and the final data pointer are
// visible here; the other arguments are omitted from this excerpt.
323 CUDNNCHECK(cudnnOpTensor(A.GetCudnnHandle(),
326 A.GetTensorDescriptor(),
329 A.GetTensorDescriptor(),
332 A.GetTensorDescriptor(),
333 A.GetDataPointer()));
// Release the op descriptor created above.
335 CUDNNCHECK(cudnnDestroyOpTensorDescriptor(opTensorDescr));
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
void PrintTensor(RTensor< T > &t)
create variable transformations
constexpr Double_t C()
Velocity of light in m/s.