19 #ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX 20 #define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX 23 #include "cuda_runtime.h" 24 #include "cublas_v2.h" 25 #include "curand_kernel.h" 30 #define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); } 38 inline void cudaError(cudaError_t code,
const char *
file,
int line,
bool abort=
true);
53 template<
typename AFloat>
97 template<
typename AFloat>
134 inline cudaStream_t GetComputeStream()
const;
135 inline void SetComputeStream(cudaStream_t stream);
139 inline static void ResetDeviceReturn(AFloat value = 0.0);
142 inline static AFloat GetDeviceReturn();
149 inline void Synchronize(
const TCudaMatrix &)
const;
170 AFloat * p = GetDataPointer();
171 for (
size_t i = 0; i < GetNoElements(); ++i)
182 void InitializeCuda();
183 void InitializeCurandStates();
190 inline void cudaError(cudaError_t code,
const char *file,
int line,
bool abort)
192 if (code != cudaSuccess)
194 fprintf(stderr,
"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
195 if (abort) exit(code);
200 template<
typename AFloat>
208 template<
typename AFloat>
213 cudaMemcpyDeviceToHost);
218 template<
typename AFloat>
222 cudaMemcpyDeviceToDevice);
226 template<
typename AFloat>
229 AFloat buffer = value;
231 cudaMemcpyHostToDevice);
235 template<
typename AFloat>
240 cudaMemcpyDeviceToHost);
243 cudaMemcpyHostToDevice);
247 template<
typename AFloat>
252 cudaMemcpyDeviceToHost);
255 cudaMemcpyHostToDevice);
259 template<
typename AFloat>
262 return fElementBuffer.GetComputeStream();
266 template<
typename AFloat>
269 return fElementBuffer.SetComputeStream(stream);
273 template<
typename AFloat>
277 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
279 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
280 cudaEventDestroy(event);
284 template<
typename AFloat>
287 AFloat buffer = value;
288 cudaMemcpy(fDeviceReturn, & buffer,
sizeof(AFloat), cudaMemcpyHostToDevice);
292 template<
typename AFloat>
296 cudaMemcpy(& buffer, fDeviceReturn,
sizeof(AFloat), cudaMemcpyDeviceToHost);
301 template<
typename AFloat>
304 AFloat * elementPointer = fElementBuffer;
305 elementPointer += j * fNRows + i;
static curandState_t * GetCurandStatesPointer()
static cublasHandle_t fCublasHandle
void operator-=(AFloat value)
void SetComputeStream(cudaStream_t stream)
TRObject operator()(const T1 &t1) const
static curandState_t * fCurandStates
size_t GetNoElements() const
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
static AFloat * fOnes
Vector used for summations of columns.
AFloat * GetDataPointer()
TCudaDeviceBuffer< AFloat > fElementBuffer
static size_t fNCurandStates
static size_t fInstances
Current number of matrix instances.
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream...
void operator=(const TCudaDeviceReference &other)
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
static AFloat * fDeviceReturn
Buffer for kernel return values.
TCudaDeviceReference(AFloat *devicePointer)
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
const cublasHandle_t & GetCublasHandle() const
void Print(Option_t *name="") const
Print the matrix as a table of elements.
Abstract ClassifierFactory template that handles arbitrary types.
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
static AFloat * GetOnes()
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
const AFloat * GetDataPointer() const
cudaStream_t GetComputeStream() const
static size_t fNOnes
Current length of the one vector.
void operator+=(AFloat value)