doc/v616/CudaMatrix_8h_source.html

// @(#)root/tmva/tmva/dnn:$Id$

// Author: Simon Pfreundschuh 13/07/16


/*************************************************************************

 * Copyright (C) 2016, Simon Pfreundschuh                                *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


///////////////////////////////////////////////////////////////////////

// Contains the TCudaMatrix class for the representation of matrices //

// on CUDA devices as well as the TCudaDeviceReference class which   //

// is a helper class to emulate lvalue references to floating point  //

// values on the device.                                             //

///////////////////////////////////////////////////////////////////////


#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX

#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX


#include "cuda.h"

#include "cuda_runtime.h"

#include "cublas_v2.h"

#include "curand_kernel.h"


#include "TMatrixT.h"

#include "CudaBuffers.h"


/** Evaluate the CUDA call \p ans and hand its return code, together with the
 *  file name and line number of the call site, to cudaError(). The name
 *  cudaError is looked up unqualified where the macro is expanded.
 *  Note: the macro expands to a brace-enclosed block, so `CUDACHECK(x);`
 *  leaves an additional empty statement behind the block. */
#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }


namespace TMVA {

namespace DNN {


/** Function to check cuda return code. Taken from
 * http://stackoverflow.com/questions/14038589/
 *
 * Does nothing when \p code equals cudaSuccess. Otherwise prints the error
 * string for \p code together with \p file and \p line to stderr.
 *
 * \param code  Return code of a CUDA runtime call.
 * \param file  File name reported in the error message.
 * \param line  Line number reported in the error message.
 * \param abort If true (the default), the process is terminated via
 *              exit(code) after the message has been printed.
 */

inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);


//____________________________________________________________________________

//

// Cuda Device Reference

//____________________________________________________________________________


/** TCudaDeviceReference
 *
 * Helper class emulating lvalue references for AFloat values that are
 * physically on the device. Allows for example to assign to matrix elements.
 * Every access is carried out with a cudaMemcpy of a single AFloat.
 * Note that device access through CudaDeviceReferences enforces synchronization
 * with all streams and thus qualifies as performance killer. Only used for
 * testing.
 *
 * All assignment-like operators return void, so they cannot be chained.
 */

template<typename AFloat>

class TCudaDeviceReference

{

private:


    AFloat * fDevicePointer; ///< Address of the referenced value, as passed to the constructor.


public:


    /** Create a reference to the value stored at \p devicePointer. */
    TCudaDeviceReference(AFloat * devicePointer);


    /** Copy the referenced value from the device to the host and return it. */
    operator AFloat();


    /** Copy the value referenced by \p other into the value referenced by
     *  this object (device-to-device copy of one AFloat). */
    void operator=(const TCudaDeviceReference &other);

    /** Write the host value \p value to the referenced device location. */
    void operator=(AFloat value);

    /** Read the referenced value to the host, add \p value and write the
     *  result back to the device. */
    void operator+=(AFloat value);

    /** Read the referenced value to the host, subtract \p value and write the
     *  result back to the device. */
    void operator-=(AFloat value);

};


//____________________________________________________________________________

//

// Cuda Matrix

//____________________________________________________________________________


/** TCudaMatrix Class
 *
 * The TCudaMatrix class represents matrices on a CUDA device. The elements
 * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
 * the allocation and freeing of the device memory. TCudaMatrices are lightweight
 * objects, that means on assignment and copy creation only a shallow copy is
 * performed and no new element buffer allocated. To perform a deep copy use
 * the static Copy method of the TCuda architecture class.
 *
 * Elements are addressed column-major: element (i, j) is located at offset
 * j * GetNrows() + i from the start of the element buffer.
 *
 * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
 * transferred to the device. This stream can be accessed through the
 * GetComputeStream member function and used to synchronize computations.
 *
 * The TCudaMatrix class also holds static references to CUDA resources.
 * Those are the cublas handle, a buffer of curand states for the generation
 * of random numbers as well as a vector containing ones, which is used for
 * summing column matrices using matrix-vector multiplication. The class also
 * has a static buffer for returning results from the device.
 *
 */
template<typename AFloat>
class TCudaMatrix
{
private:

   static size_t          fInstances;     ///< Current number of matrix instances.
   static cublasHandle_t  fCublasHandle;  ///< cuBLAS handle shared by all matrices.
   static AFloat        * fDeviceReturn;  ///< Buffer for kernel return values.
   static AFloat        * fOnes;          ///< Vector used for summations of columns.
   static size_t          fNOnes;         ///< Current length of the one vector.
   static curandState_t * fCurandStates;  ///< Buffer of curand states for random number generation.
   static size_t          fNCurandStates; ///< Presumably the number of states in fCurandStates (set outside this header).

   size_t                    fNRows;         ///< Number of rows; also the stride between columns.
   size_t                    fNCols;         ///< Number of columns.
   TCudaDeviceBuffer<AFloat> fElementBuffer; ///< Device buffer holding the matrix elements.

public:

   /** Return the pointer to the shared vector of ones. */
   static AFloat * GetOnes() {return fOnes;}

   // The following constructors are defined outside this header.
   TCudaMatrix();
   /** NOTE(review): presumably \p i is the number of rows and \p j the number
    *  of columns; confirm against the definition. */
   TCudaMatrix(size_t i, size_t j);
   /** Construct from a ROOT TMatrixT. */
   TCudaMatrix(const TMatrixT<AFloat> &);
   /** Construct from an existing device buffer; NOTE(review): \p m and \p n
    *  are presumably the row and column counts; confirm against the
    *  definition. */
   TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);

   // Copy and move operations are the compiler-generated member-wise ones.
   TCudaMatrix(const TCudaMatrix  &) = default;
   TCudaMatrix(      TCudaMatrix &&) = default;
   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
   ~TCudaMatrix() = default;

   /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
   operator TMatrixT<AFloat>() const;

   /** Return the compute stream of the element buffer. */
   inline cudaStream_t GetComputeStream() const;
   /** Set the compute stream of the element buffer. */
   inline void         SetComputeStream(cudaStream_t stream);

   /** Set the return buffer on the device to the specified value. This is
    * required for example for reductions in order to initialize the
    * accumulator. */
   inline static void ResetDeviceReturn(AFloat value = 0.0);

   /** Transfer the value in the device return buffer to the host. This
    *  transfer is synchronous. */
   inline static AFloat GetDeviceReturn();

   /** Return device pointer to the device return buffer */
   inline static AFloat *        GetDeviceReturnPointer() {return fDeviceReturn;}
   /** Return device pointer to the buffer of curand states */
   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}

   /** Make the compute stream of this matrix wait for an event recorded on
    *  the compute stream of the given matrix (cudaEventRecord followed by
    *  cudaStreamWaitEvent).
    *  NOTE(review): this was previously documented as a blocking
    *  synchronization; the implementation only enqueues a stream wait.
    *  Confirm the intended semantics. */
   inline void Synchronize(const TCudaMatrix &) const;

   size_t GetNrows() const {return fNRows;}
   size_t GetNcols() const {return fNCols;}
   size_t GetNoElements() const {return fNRows * fNCols;}

   /** Return the element buffer as a raw pointer. */
   const AFloat * GetDataPointer() const {return fElementBuffer;}
   AFloat *       GetDataPointer()       {return fElementBuffer;}
   const cublasHandle_t & GetCublasHandle() const    {return fCublasHandle;}

   /** Access to elements of device matrices provided through TCudaDeviceReference
    *  class. Note that access is synchronous and enforces device synchronization
    *  on all streams. Only used for testing. The indices are not range-checked.
    *  The operator is const although the returned reference allows writing to
    *  the element. */
   TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;

   /** Copy the matrix into a TMatrixT on the host and print that copy. */
   void Print() const {
      TMatrixT<AFloat> mat(*this);
      mat.Print();
   }

   /** Set every byte of the element buffer to zero using cudaMemset. A
    *  failing call is reported through CUDACHECK instead of being ignored. */
   void Zero() {
      CUDACHECK(cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements()));
   }

private:

   /** Initializes all shared devices resource and makes sure that a sufficient
    *  number of curand states are allocated on the device and initialized as
    *  well as that the one-vector for the summation over columns has the right
    *  size. */
   void InitializeCuda();
   void InitializeCurandStates();

};


//

// Inline Functions.

//______________________________________________________________________________

/** Report a failed CUDA call: print the error string with the given file and
 *  line to stderr and, if \p abort is set, terminate with exit(code).
 *  Successful return codes are ignored. */
inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
{
   // Nothing to report for successful calls.
   if (code == cudaSuccess) {
      return;
   }

   fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);

   if (abort) {
      exit(code);
   }
}


//______________________________________________________________________________

template<typename AFloat>

TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)

    : fDevicePointer(devicePointer)

{

   // Nothing to do here.

}


//______________________________________________________________________________

/** Read the referenced value from the device.
 *
 *  Copies one AFloat from the device to the host with cudaMemcpy and returns
 *  it. The return code of the copy is checked with CUDACHECK, so a failed
 *  transfer is reported and aborts the process instead of returning an
 *  uninitialized host value. */
template<typename AFloat>
TCudaDeviceReference<AFloat>::operator AFloat()
{
    AFloat hostValue;
    CUDACHECK(cudaMemcpy(& hostValue, fDevicePointer, sizeof(AFloat),
                         cudaMemcpyDeviceToHost));
    return hostValue;
}


//______________________________________________________________________________

/** Assign the value referenced by \p other to the value referenced by this
 *  object.
 *
 *  Performs a device-to-device cudaMemcpy of one AFloat. The return code is
 *  checked with CUDACHECK so that a failed copy is reported (and aborts the
 *  process) instead of silently leaving the target unchanged. */
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
{
   CUDACHECK(cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
                        cudaMemcpyDeviceToDevice));
}


//______________________________________________________________________________

/** Write \p value to the referenced device location.
 *
 *  Performs a host-to-device cudaMemcpy of one AFloat. \p value is passed by
 *  value, so its address can be used directly as the copy source. The return
 *  code is checked with CUDACHECK so that a failed write is reported (and
 *  aborts the process) instead of being silently dropped. */
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(AFloat value)
{
   CUDACHECK(cudaMemcpy(fDevicePointer, & value, sizeof(AFloat),
                        cudaMemcpyHostToDevice));
}


//______________________________________________________________________________

/** Add \p value to the referenced device value.
 *
 *  The current value is copied to the host, incremented there and copied
 *  back to the device. Both transfers are checked with CUDACHECK; without the
 *  check a failed read would cause an indeterminate host value to be written
 *  back to the device. */
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
{
   AFloat hostValue;

   // Fetch the current element value.
   CUDACHECK(cudaMemcpy(& hostValue, fDevicePointer, sizeof(AFloat),
                        cudaMemcpyDeviceToHost));

   hostValue += value;

   // Store the updated value.
   CUDACHECK(cudaMemcpy(fDevicePointer, & hostValue, sizeof(AFloat),
                        cudaMemcpyHostToDevice));
}


//______________________________________________________________________________

/** Subtract \p value from the referenced device value.
 *
 *  The current value is copied to the host, decremented there and copied
 *  back to the device. Both transfers are checked with CUDACHECK; without the
 *  check a failed read would cause an indeterminate host value to be written
 *  back to the device. */
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
{
   AFloat hostValue;

   // Fetch the current element value.
   CUDACHECK(cudaMemcpy(& hostValue, fDevicePointer, sizeof(AFloat),
                        cudaMemcpyDeviceToHost));

   hostValue -= value;

   // Store the updated value.
   CUDACHECK(cudaMemcpy(fDevicePointer, & hostValue, sizeof(AFloat),
                        cudaMemcpyHostToDevice));
}


//______________________________________________________________________________

/** Return the compute stream associated with the element buffer of this
 *  matrix. */
template<typename AFloat>
inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
{
   cudaStream_t bufferStream = fElementBuffer.GetComputeStream();
   return bufferStream;
}


//______________________________________________________________________________

/** Forward \p stream to the element buffer, which stores it as its compute
 *  stream. */
template<typename AFloat>
inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
{
   fElementBuffer.SetComputeStream(stream);
}


//______________________________________________________________________________

/** Order the compute stream of this matrix after the compute stream of \p A.
 *
 *  An event (created without timing) is recorded on the compute stream of
 *  \p A, and the compute stream of this matrix is made to wait for it via
 *  cudaStreamWaitEvent. The event is destroyed again afterwards.
 *
 *  Every CUDA call is checked with CUDACHECK: previously a failed event
 *  creation went unnoticed and the invalid event handle was passed on to the
 *  subsequent calls. */
template<typename AFloat>
inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
{
   cudaEvent_t syncEvent;

   CUDACHECK(cudaEventCreateWithFlags(&syncEvent, cudaEventDisableTiming));
   CUDACHECK(cudaEventRecord(syncEvent, A.GetComputeStream()));
   CUDACHECK(cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), syncEvent, 0));
   CUDACHECK(cudaEventDestroy(syncEvent));
}


//______________________________________________________________________________

/** Set the device return buffer to \p value.
 *
 *  Performs a host-to-device cudaMemcpy of one AFloat into fDeviceReturn.
 *  \p value is passed by value, so its address is used directly as the copy
 *  source. The return code is checked with CUDACHECK so that a failed
 *  initialization of the buffer is reported (and aborts the process) instead
 *  of leaving a stale value in it. */
template<typename AFloat>
inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
{
   CUDACHECK(cudaMemcpy(fDeviceReturn, & value, sizeof(AFloat),
                        cudaMemcpyHostToDevice));
}


//______________________________________________________________________________

/** Return the value currently stored in the device return buffer.
 *
 *  Performs a device-to-host cudaMemcpy of one AFloat from fDeviceReturn.
 *  The return code is checked with CUDACHECK so that a failed transfer is
 *  reported (and aborts the process) instead of returning an uninitialized
 *  host value. */
template<typename AFloat>
inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
{
   AFloat hostValue;
   CUDACHECK(cudaMemcpy(& hostValue, fDeviceReturn, sizeof(AFloat),
                        cudaMemcpyDeviceToHost));
   return hostValue;
}


//______________________________________________________________________________

/** Return a device reference to element (\p i, \p j).
 *
 *  The element address is the start of the element buffer plus the offset
 *  j * fNRows + i. The indices are not range-checked. */
template<typename AFloat>
TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
{
    AFloat * bufferStart = fElementBuffer;
    const size_t offset = j * fNRows + i;
    return TCudaDeviceReference<AFloat>(bufferStart + offset);
}


} // namespace DNN

} // namespace TMVA


#endif

CudaBuffers.h

TMatrixT.h

TMVA::DNN::TCudaDeviceBuffer
TCudaDeviceBuffer.
Definition: CudaBuffers.h:98

TMVA::DNN::TCudaDeviceReference
TCudaDeviceReference.
Definition: CudaMatrix.h:55

TMVA::DNN::TCudaDeviceReference::operator-=
void operator-=(AFloat value)
Definition: CudaMatrix.h:246

TMVA::DNN::TCudaDeviceReference::fDevicePointer
AFloat * fDevicePointer
Definition: CudaMatrix.h:58

TMVA::DNN::TCudaDeviceReference::TCudaDeviceReference
TCudaDeviceReference(AFloat *devicePointer)
Definition: CudaMatrix.h:199

TMVA::DNN::TCudaDeviceReference::operator=
void operator=(const TCudaDeviceReference &other)
Definition: CudaMatrix.h:217

TMVA::DNN::TCudaDeviceReference::operator+=
void operator+=(AFloat value)
Definition: CudaMatrix.h:234

TMVA::DNN::TCudaMatrix
TCudaMatrix Class.
Definition: CudaMatrix.h:99

TMVA::DNN::TCudaMatrix::fElementBuffer
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaMatrix.h:114

TMVA::DNN::TCudaMatrix::GetNcols
size_t GetNcols() const
Definition: CudaMatrix.h:152

TMVA::DNN::TCudaMatrix::operator=
TCudaMatrix & operator=(const TCudaMatrix &)=default

TMVA::DNN::TCudaMatrix::fCurandStates
static curandState_t * fCurandStates
Definition: CudaMatrix.h:109

TMVA::DNN::TCudaMatrix::Print
void Print() const
Definition: CudaMatrix.h:164

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(const TMatrixT< AFloat > &)

TMVA::DNN::TCudaMatrix::GetDeviceReturn
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition: CudaMatrix.h:291

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix()

TMVA::DNN::TCudaMatrix::SetComputeStream
void SetComputeStream(cudaStream_t stream)
Definition: CudaMatrix.h:265

TMVA::DNN::TCudaMatrix::fDeviceReturn
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition: CudaMatrix.h:106

TMVA::DNN::TCudaMatrix::GetComputeStream
cudaStream_t GetComputeStream() const
Definition: CudaMatrix.h:258

TMVA::DNN::TCudaMatrix::GetNoElements
size_t GetNoElements() const
Definition: CudaMatrix.h:153

TMVA::DNN::TCudaMatrix::InitializeCuda
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...

TMVA::DNN::TCudaMatrix::operator()
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition: CudaMatrix.h:300

TMVA::DNN::TCudaMatrix::GetDeviceReturnPointer
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition: CudaMatrix.h:144

TMVA::DNN::TCudaMatrix::Zero
void Zero()
Definition: CudaMatrix.h:169

TMVA::DNN::TCudaMatrix::GetCublasHandle
const cublasHandle_t & GetCublasHandle() const
Definition: CudaMatrix.h:157

TMVA::DNN::TCudaMatrix::fOnes
static AFloat * fOnes
Vector used for summations of columns.
Definition: CudaMatrix.h:107

TMVA::DNN::TCudaMatrix::ResetDeviceReturn
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition: CudaMatrix.h:283

TMVA::DNN::TCudaMatrix::fNRows
size_t fNRows
Definition: CudaMatrix.h:112

TMVA::DNN::TCudaMatrix::GetDataPointer
const AFloat * GetDataPointer() const
Definition: CudaMatrix.h:155

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)

TMVA::DNN::TCudaMatrix::fNCurandStates
static size_t fNCurandStates
Definition: CudaMatrix.h:110

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(const TCudaMatrix &)=default

TMVA::DNN::TCudaMatrix::Synchronize
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition: CudaMatrix.h:272

TMVA::DNN::TCudaMatrix::fNCols
size_t fNCols
Definition: CudaMatrix.h:113

TMVA::DNN::TCudaMatrix::GetOnes
static AFloat * GetOnes()
Definition: CudaMatrix.h:118

TMVA::DNN::TCudaMatrix::~TCudaMatrix
~TCudaMatrix()=default

TMVA::DNN::TCudaMatrix::fCublasHandle
static cublasHandle_t fCublasHandle
Definition: CudaMatrix.h:105

TMVA::DNN::TCudaMatrix::fInstances
static size_t fInstances
Current number of matrix instances.
Definition: CudaMatrix.h:104

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(size_t i, size_t j)

TMVA::DNN::TCudaMatrix::GetNrows
size_t GetNrows() const
Definition: CudaMatrix.h:151

TMVA::DNN::TCudaMatrix::operator=
TCudaMatrix & operator=(TCudaMatrix &&)=default

TMVA::DNN::TCudaMatrix::InitializeCurandStates
void InitializeCurandStates()

TMVA::DNN::TCudaMatrix::GetDataPointer
AFloat * GetDataPointer()
Definition: CudaMatrix.h:156

TMVA::DNN::TCudaMatrix::fNOnes
static size_t fNOnes
Current length of the one vector.
Definition: CudaMatrix.h:108

TMVA::DNN::TCudaMatrix::TCudaMatrix
TCudaMatrix(TCudaMatrix &&)=default

TMVA::DNN::TCudaMatrix::GetCurandStatesPointer
static curandState_t * GetCurandStatesPointer()
Definition: CudaMatrix.h:145

TMatrixTBase::Print
void Print(Option_t *name="") const
Print the matrix as a table of elements.
Definition: TMatrixTBase.cxx:832

TMatrixT
TMatrixT.
Definition: TMatrixT.h:39

line
TLine * line
Definition: entrylistblock_figure1.C:235

n
const Int_t n
Definition: legend1.C:16

ROOT::Math::Cephes::A
static double A[]
Definition: SpecFuncCephes.cxx:170

TMVA::DNN::cudaError
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition: CudaMatrix.h:188

TMVA
Abstract ClassifierFactory template that handles arbitrary types.
Definition: GeneticMinimizer.h:21

file
Definition: file.py:1

m
auto * m
Definition: textangle.C:8