19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
24#include "RConfigure.h"
27#include "cuda_runtime.h"
29#include "curand_kernel.h"
34#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
57template<
typename AFloat>
101template<
typename AFloat>
203 if (abort)
exit(code);
208template<
typename AFloat>
216template<
typename AFloat>
220 cudaMemcpy(& buffer, fDevicePointer,
sizeof(AFloat),
226template<
typename AFloat>
234template<
typename AFloat>
237 AFloat buffer =
value;
238 cudaMemcpy(fDevicePointer, & buffer,
sizeof(AFloat),
243template<
typename AFloat>
247 cudaMemcpy(& buffer, fDevicePointer,
sizeof(AFloat),
250 cudaMemcpy(fDevicePointer, & buffer,
sizeof(AFloat),
255template<
typename AFloat>
259 cudaMemcpy(& buffer, fDevicePointer,
sizeof(AFloat),
262 cudaMemcpy(fDevicePointer, & buffer,
sizeof(AFloat),
267template<
typename AFloat>
270 return fElementBuffer.GetComputeStream();
274template<
typename AFloat>
277 return fElementBuffer.SetComputeStream(stream);
281template<
typename AFloat>
292template<
typename AFloat>
295 AFloat buffer =
value;
300template<
typename AFloat>
309template<
typename AFloat>
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
void operator-=(AFloat value)
TCudaDeviceReference(AFloat *devicePointer)
void operator=(const TCudaDeviceReference &other)
void operator+=(AFloat value)
TCudaDeviceBuffer< AFloat > fElementBuffer
TCudaMatrix & operator=(const TCudaMatrix &)=default
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
void SetComputeStream(cudaStream_t stream)
cudaStream_t GetComputeStream() const
size_t GetNoElements() const
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
static Bool_t gInitializeCurand
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
static curandState_t * fCurandStates
const cublasHandle_t & GetCublasHandle() const
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
const AFloat * GetDataPointer() const
static size_t fNCurandStates
TCudaMatrix(const TCudaMatrix &)=default
TCudaDeviceBuffer< AFloat > GetDeviceBuffer() const
static AFloat * fDeviceReturn
Buffer for kernel return values.
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
static AFloat * GetOnes()
static AFloat * fOnes
Vector used for summations of columns.
static cublasHandle_t fCublasHandle
static size_t fInstances
Current number of matrix instances.
TCudaMatrix & operator=(TCudaMatrix &&)=default
void InitializeCurandStates()
AFloat * GetDataPointer()
static size_t fNOnes
Current length of the one vector.
TCudaMatrix(TCudaMatrix &&)=default
static curandState_t * GetCurandStatesPointer()
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
create variable transformations