template<typename AFloat>
class TMVA::DNN::TCudaMatrix< AFloat >

TCudaMatrix Class.

The TCudaMatrix class represents matrices on a CUDA device. The elements of the matrix are stored in a TCudaDeviceBuffer object which takes care of the allocation and freeing of the device memory. TCudaMatrices are lightweight objects: on assignment and copy construction only a shallow copy is performed and no new element buffer is allocated. To perform a deep copy use the static Copy method of the TCuda architecture class.

The TCudaDeviceBuffer has an associated cuda stream, on which the data is transferred to the device. This stream can be accessed through the GetComputeStream member function and used to synchronize computations.

The TCudaMatrix class also holds static references to CUDA resources. Those are the cublas handle, a buffer of curand states for the generation of random numbers as well as a vector containing ones, which is used for summing column matrices using matrix-vector multiplication. The class also has a static buffer for returning results from the device.

Definition at line 108 of file CudaMatrix.h.

Public Member Functions
	TCudaMatrix ()

	TCudaMatrix (const TCudaMatrix &)=default

	TCudaMatrix (const TMatrixT< AFloat > &)

	TCudaMatrix (size_t i, size_t j)

	TCudaMatrix (TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)

	TCudaMatrix (TCudaMatrix &&)=default

	~TCudaMatrix ()=default

cudaStream_t	GetComputeStream () const

const cublasHandle_t &	GetCublasHandle () const

AFloat *	GetDataPointer ()

const AFloat *	GetDataPointer () const

TCudaDeviceBuffer< AFloat >	GetDeviceBuffer () const

size_t	GetNcols () const

size_t	GetNoElements () const

size_t	GetNrows () const

	operator TMatrixT () const
	Convert CUDA matrix to ROOT TMatrix.

TCudaDeviceReference< AFloat >	operator() (size_t i, size_t j) const
	Access to elements of device matrices provided through TCudaDeviceReference class.

TCudaMatrix &	operator= (const TCudaMatrix &)=default

TCudaMatrix &	operator= (TCudaMatrix &&)=default

void	Print () const

void	SetComputeStream (cudaStream_t stream)

void	Synchronize (const TCudaMatrix &) const
	Blocking synchronization with the associated compute stream, if it's not the default stream.

void	Zero ()

Static Public Member Functions
static curandState_t *	GetCurandStatesPointer ()

static AFloat	GetDeviceReturn ()
	Transfer the value in the device return buffer to the host.

static AFloat *	GetDeviceReturnPointer ()
	Return device pointer to the device return buffer.

static size_t	GetNDim ()

static AFloat *	GetOnes ()

static void	ResetDeviceReturn (AFloat value=0.0)
	Set the return buffer on the device to the specified value.

Static Public Attributes
static Bool_t	gInitializeCurand

Private Member Functions
void	InitializeCuda ()
	Initializes all shared device resources and makes sure that a sufficient number of curand states are allocated and initialized on the device, and that the one-vector used for the summation over columns has the right size.

void	InitializeCurandStates ()

Private Attributes
TCudaDeviceBuffer< AFloat >	fElementBuffer

size_t	fNCols

size_t	fNRows

Static Private Attributes
static cublasHandle_t	fCublasHandle

static curandState_t *	fCurandStates

static AFloat *	fDeviceReturn
	Buffer for kernel return values.

static size_t	fInstances
	Current number of matrix instances.

static size_t	fNCurandStates

static size_t	fNOnes
	Current length of the one vector.

static AFloat *	fOnes
	Vector used for summations of columns.

#include <TMVA/DNN/Architectures/Cuda/CudaMatrix.h>

Constructor & Destructor Documentation

◆ TCudaMatrix() [1/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix ( )

◆ TCudaMatrix() [2/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix	(	size_t	i,
		size_t	j
	)

◆ TCudaMatrix() [3/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix ( const TMatrixT< AFloat > & )

◆ TCudaMatrix() [4/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix	(	TCudaDeviceBuffer< AFloat >	buffer,
		size_t	m,
		size_t	n
	)

◆ TCudaMatrix() [5/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix ( const TCudaMatrix< AFloat > & )

default

◆ TCudaMatrix() [6/6]

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::TCudaMatrix ( TCudaMatrix< AFloat > && )

default

◆ ~TCudaMatrix()

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::~TCudaMatrix ( )

default

Member Function Documentation

◆ GetComputeStream()

template<typename AFloat >

cudaStream_t TMVA::DNN::TCudaMatrix< AFloat >::GetComputeStream

inline

Definition at line 274 of file CudaMatrix.h.

◆ GetCublasHandle()

template<typename AFloat >

const cublasHandle_t & TMVA::DNN::TCudaMatrix< AFloat >::GetCublasHandle ( ) const

inline

Definition at line 171 of file CudaMatrix.h.

◆ GetCurandStatesPointer()

template<typename AFloat >

static curandState_t * TMVA::DNN::TCudaMatrix< AFloat >::GetCurandStatesPointer ( )

inline static

Definition at line 158 of file CudaMatrix.h.

◆ GetDataPointer() [1/2]

template<typename AFloat >

AFloat * TMVA::DNN::TCudaMatrix< AFloat >::GetDataPointer ( )

inline

Definition at line 170 of file CudaMatrix.h.

◆ GetDataPointer() [2/2]

template<typename AFloat >

const AFloat * TMVA::DNN::TCudaMatrix< AFloat >::GetDataPointer ( ) const

inline

Definition at line 169 of file CudaMatrix.h.

◆ GetDeviceBuffer()

template<typename AFloat >

TCudaDeviceBuffer< AFloat > TMVA::DNN::TCudaMatrix< AFloat >::GetDeviceBuffer ( ) const

inline

Definition at line 173 of file CudaMatrix.h.

◆ GetDeviceReturn()

template<typename AFloat >

AFloat TMVA::DNN::TCudaMatrix< AFloat >::GetDeviceReturn

inline static

Transfer the value in the device return buffer to the host.

This transfer is synchronous.

Definition at line 307 of file CudaMatrix.h.

◆ GetDeviceReturnPointer()

template<typename AFloat >

static AFloat * TMVA::DNN::TCudaMatrix< AFloat >::GetDeviceReturnPointer ( )

inline static

Return device pointer to the device return buffer.

Definition at line 157 of file CudaMatrix.h.

◆ GetNcols()

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::GetNcols ( ) const

inline

Definition at line 166 of file CudaMatrix.h.

◆ GetNDim()

template<typename AFloat >

static size_t TMVA::DNN::TCudaMatrix< AFloat >::GetNDim ( )

inline static

Definition at line 164 of file CudaMatrix.h.

◆ GetNoElements()

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::GetNoElements ( ) const

inline

Definition at line 167 of file CudaMatrix.h.

◆ GetNrows()

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::GetNrows ( ) const

inline

Definition at line 165 of file CudaMatrix.h.

◆ GetOnes()

template<typename AFloat >

static AFloat * TMVA::DNN::TCudaMatrix< AFloat >::GetOnes ( )

inline static

Definition at line 131 of file CudaMatrix.h.

◆ InitializeCuda()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::InitializeCuda ( )

private

Initializes all shared device resources and makes sure that a sufficient number of curand states are allocated and initialized on the device, and that the one-vector used for the summation over columns has the right size.

◆ InitializeCurandStates()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::InitializeCurandStates ( )

private

◆ operator TMatrixT()

template<typename AFloat >

TMVA::DNN::TCudaMatrix< AFloat >::operator TMatrixT ( ) const

Convert CUDA matrix to ROOT TMatrix.

Performs synchronous data transfer.

◆ operator()()

template<typename AFloat >

TCudaDeviceReference< AFloat > TMVA::DNN::TCudaMatrix< AFloat >::operator()	(	size_t	i,
		size_t	j
	)		const

Access to elements of device matrices provided through TCudaDeviceReference class.

Note that access is synchronous and enforces device synchronization on all streams. Only used for testing.

Definition at line 316 of file CudaMatrix.h.

◆ operator=() [1/2]

template<typename AFloat >

TCudaMatrix & TMVA::DNN::TCudaMatrix< AFloat >::operator= ( const TCudaMatrix< AFloat > & )

default

◆ operator=() [2/2]

template<typename AFloat >

TCudaMatrix & TMVA::DNN::TCudaMatrix< AFloat >::operator= ( TCudaMatrix< AFloat > && )

default

◆ Print()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::Print ( ) const

inline

Definition at line 180 of file CudaMatrix.h.

◆ ResetDeviceReturn()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::ResetDeviceReturn ( AFloat value = 0.0 )

inline static

Set the return buffer on the device to the specified value.

This is required for example for reductions in order to initialize the accumulator.

Definition at line 299 of file CudaMatrix.h.

◆ SetComputeStream()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::SetComputeStream ( cudaStream_t stream )

inline

Definition at line 281 of file CudaMatrix.h.

◆ Synchronize()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::Synchronize ( const TCudaMatrix< AFloat > & A ) const

inline

Blocking synchronization with the associated compute stream, if it's not the default stream.

Definition at line 288 of file CudaMatrix.h.

◆ Zero()

template<typename AFloat >

void TMVA::DNN::TCudaMatrix< AFloat >::Zero ( )

inline

Definition at line 185 of file CudaMatrix.h.

Member Data Documentation

◆ fCublasHandle

template<typename AFloat >

cublasHandle_t TMVA::DNN::TCudaMatrix< AFloat >::fCublasHandle

static private

Definition at line 115 of file CudaMatrix.h.

◆ fCurandStates

template<typename AFloat >

curandState_t* TMVA::DNN::TCudaMatrix< AFloat >::fCurandStates

static private

Definition at line 119 of file CudaMatrix.h.

◆ fDeviceReturn

template<typename AFloat >

AFloat* TMVA::DNN::TCudaMatrix< AFloat >::fDeviceReturn

static private

Buffer for kernel return values.

Definition at line 116 of file CudaMatrix.h.

◆ fElementBuffer

template<typename AFloat >

TCudaDeviceBuffer<AFloat> TMVA::DNN::TCudaMatrix< AFloat >::fElementBuffer

private

Definition at line 125 of file CudaMatrix.h.

◆ fInstances

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::fInstances

static private

Current number of matrix instances.

Definition at line 114 of file CudaMatrix.h.

◆ fNCols

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::fNCols

private

Definition at line 124 of file CudaMatrix.h.

◆ fNCurandStates

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::fNCurandStates

static private

Definition at line 120 of file CudaMatrix.h.

◆ fNOnes

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::fNOnes

static private

Current length of the one vector.

Definition at line 118 of file CudaMatrix.h.

◆ fNRows

template<typename AFloat >

size_t TMVA::DNN::TCudaMatrix< AFloat >::fNRows

private

Definition at line 123 of file CudaMatrix.h.

◆ fOnes

template<typename AFloat >

AFloat* TMVA::DNN::TCudaMatrix< AFloat >::fOnes

static private

Vector used for summations of columns.

Definition at line 117 of file CudaMatrix.h.

◆ gInitializeCurand

template<typename AFloat >

Bool_t TMVA::DNN::TCudaMatrix< AFloat >::gInitializeCurand

static

Definition at line 129 of file CudaMatrix.h.

tmva/tmva/inc/TMVA/DNN/Architectures/Cuda/CudaMatrix.h

Public Member Functions

Static Public Member Functions

Static Public Attributes

Private Member Functions

Private Attributes

Static Private Attributes

Constructor & Destructor Documentation

◆ TCudaMatrix() [1/6]

◆ TCudaMatrix() [2/6]

◆ TCudaMatrix() [3/6]

◆ TCudaMatrix() [4/6]

◆ TCudaMatrix() [5/6]

◆ TCudaMatrix() [6/6]

◆ ~TCudaMatrix()

Member Function Documentation

◆ GetComputeStream()

◆ GetCublasHandle()

◆ GetCurandStatesPointer()

◆ GetDataPointer() [1/2]

◆ GetDataPointer() [2/2]

◆ GetDeviceBuffer()

◆ GetDeviceReturn()

◆ GetDeviceReturnPointer()

◆ GetNcols()

◆ GetNDim()

◆ GetNoElements()

◆ GetNrows()

◆ GetOnes()

◆ InitializeCuda()

◆ InitializeCurandStates()

◆ operator TMatrixT()

◆ operator()()

◆ operator=() [1/2]

◆ operator=() [2/2]

◆ Print()

◆ ResetDeviceReturn()

◆ SetComputeStream()

◆ Synchronize()

◆ Zero()

Member Data Documentation

◆ fCublasHandle

◆ fCurandStates

◆ fDeviceReturn

◆ fElementBuffer

◆ fInstances

◆ fNCols

◆ fNCurandStates

◆ fNOnes

◆ fNRows

◆ fOnes

◆ gInitializeCurand