ROOT 6.14/05 Reference Guide
CudaMatrix.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 13/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

///////////////////////////////////////////////////////////////////////
// Contains the TCudaMatrix class for the representation of matrices //
// on CUDA devices as well as the TCudaDeviceReference class which   //
// is a helper class to emulate lvalue references to floating point  //
// values on the device.                                              //
///////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX

#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "curand_kernel.h"

#include "TMatrixT.h"
#include "CudaBuffers.h"

#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }

namespace TMVA {
namespace DNN {

/** Function to check cuda return code. Taken from
 *  http://stackoverflow.com/questions/14038589/
 */
inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
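
// Editor's note: a minimal usage sketch (not part of the original header) showing how
// the CUDACHECK macro defined above wraps a CUDA runtime call; the pointer and size
// used below are hypothetical.
//
//    float *devPtr = nullptr;
//    CUDACHECK(cudaMalloc(&devPtr, 16 * sizeof(float)));  // prints file/line and aborts on failure
//    CUDACHECK(cudaFree(devPtr));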

//____________________________________________________________________________
//
// Cuda Device Reference
//____________________________________________________________________________

/** TCudaDeviceReference
 *
 * Helper class emulating lvalue references for AFloat values that are
 * physically on the device. Allows, for example, assignment to matrix elements.
 * Note that device access through a TCudaDeviceReference enforces synchronization
 * with all streams and is therefore a performance killer. It is only used for
 * testing. (A usage sketch follows the class declaration below.)
 */
template<typename AFloat>
class TCudaDeviceReference
{
private:

   AFloat * fDevicePointer;

public:

   TCudaDeviceReference(AFloat * devicePointer);

   operator AFloat();

   void operator=(const TCudaDeviceReference &other);
   void operator=(AFloat value);
   void operator+=(AFloat value);
   void operator-=(AFloat value);
};
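
// Editor's note: a usage sketch (not part of the original header) illustrating the
// lvalue-reference emulation described above; the matrix M is hypothetical.
//
//    TCudaMatrix<float> M(2, 3);   // 2 x 3 matrix on the device
//    M(0, 1) = 2.5f;               // operator()(i,j) yields a TCudaDeviceReference;
//                                  // operator=(AFloat) copies the value to the device
//    M(0, 1) += 1.0f;              // read-modify-write via two synchronous cudaMemcpy calls
//    float x = M(0, 1);            // operator AFloat() copies the element back to the host
//
// Every such access implies a synchronous host/device transfer, which is why the
// class is intended for testing only.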

//____________________________________________________________________________
//
// Cuda Matrix
//____________________________________________________________________________

/** TCudaMatrix Class
 *
 * The TCudaMatrix class represents matrices on a CUDA device. The elements
 * of the matrix are stored in a TCudaDeviceBuffer object, which takes care of
 * the allocation and freeing of the device memory. TCudaMatrix objects are
 * lightweight: on assignment and copy construction only a shallow copy is
 * performed and no new element buffer is allocated. To perform a deep copy use
 * the static Copy method of the TCuda architecture class.
 *
 * The TCudaDeviceBuffer has an associated CUDA stream, on which the data is
 * transferred to the device. This stream can be accessed through the
 * GetComputeStream member function and used to synchronize computations.
 *
 * The TCudaMatrix class also holds static references to CUDA resources:
 * the cuBLAS handle, a buffer of curand states for the generation of random
 * numbers, and a vector of ones, which is used for summing over matrix columns
 * via matrix-vector multiplication. The class also has a static buffer for
 * returning results from the device. (A usage sketch follows the class
 * declaration below.)
 */
template<typename AFloat>
class TCudaMatrix
{
public:

private:

   static size_t          fInstances;    ///< Current number of matrix instances.
   static cublasHandle_t  fCublasHandle;
   static AFloat        * fDeviceReturn; ///< Buffer for kernel return values.
   static AFloat        * fOnes;         ///< Vector used for summations of columns.
   static size_t          fNOnes;        ///< Current length of the one vector.
   static curandState_t * fCurandStates;
   static size_t          fNCurandStates;

   size_t fNRows;
   size_t fNCols;
   TCudaDeviceBuffer<AFloat> fElementBuffer;

public:

   static AFloat * GetOnes() {return fOnes;}

   TCudaMatrix();
   TCudaMatrix(size_t i, size_t j);
   TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);

   TCudaMatrix(const TCudaMatrix  &) = default;
   TCudaMatrix(      TCudaMatrix &&) = default;
   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
   ~TCudaMatrix() = default;

   /** Convert the CUDA matrix to a ROOT TMatrix. Performs a synchronous data transfer. */
   operator TMatrixT<Double_t>() const;

   inline cudaStream_t GetComputeStream() const;
   inline void SetComputeStream(cudaStream_t stream);
   /** Set the return buffer on the device to the specified value. This is
    *  required, for example, for reductions in order to initialize the
    *  accumulator. */
   inline static void ResetDeviceReturn(AFloat value = 0.0);
   /** Transfer the value in the device return buffer to the host. This
    *  transfer is synchronous. */
   inline static AFloat GetDeviceReturn();
   /** Return the device pointer to the device return buffer. */
   inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}

   /** Blocking synchronization with the associated compute stream, if it's
    *  not the default stream. */
   inline void Synchronize(const TCudaMatrix &) const;

   size_t GetNrows() const {return fNRows;}
   size_t GetNcols() const {return fNCols;}
   size_t GetNoElements() const {return fNRows * fNCols;}
   const AFloat * GetDataPointer() const {return fElementBuffer;}
   AFloat *       GetDataPointer()       {return fElementBuffer;}
   const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}

   /** Access to elements of device matrices provided through the TCudaDeviceReference
    *  class. Note that access is synchronous and enforces device synchronization
    *  on all streams. Only used for testing. */
   TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;

   void Print() const {
      TMatrixT<Double_t> mat(*this);
      mat.Print();
   }

   void Zero() {
      // to be checked: GetDataPointer() returns a device pointer, so this host-side
      // loop is only valid if the buffer is host-accessible; otherwise a cudaMemset
      // or a device kernel would be needed.
      AFloat * p = GetDataPointer();
      for (size_t i = 0; i < GetNoElements(); ++i)
         p[i] = 0;
   }


private:

   /** Initializes all shared device resources and makes sure that a sufficient
    *  number of curand states are allocated on the device and initialized, and
    *  that the one-vector used for the summation over columns has the right
    *  size. */
   void InitializeCuda();
   void InitializeCurandStates();

};
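
// Editor's note: a hedged usage sketch (not part of the original header) summarising
// the semantics described in the class documentation above; the names A, B, C, s and h
// are hypothetical.
//
//    TCudaMatrix<float> A(100, 50);          // allocates a device buffer via TCudaDeviceBuffer
//    TCudaMatrix<float> B = A;               // shallow copy: B shares A's element buffer
//    TCudaMatrix<float> C(100, 50);          // separate matrix with its own buffer and stream
//    C.Synchronize(A);                       // C's compute stream waits for work queued on A's stream
//    cudaStream_t s = A.GetComputeStream();  // stream on which A's data transfers are queued
//    TMatrixT<Double_t> h(A);                // synchronous device-to-host copy into a ROOT matrix
//    A.Print();                              // uses the same conversion internally
//
// A deep copy requires the static Copy method of the TCuda architecture class, as
// noted in the documentation above.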

//
// Inline Functions.
//______________________________________________________________________________
inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)
    : fDevicePointer(devicePointer)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::operator AFloat()
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
{
   cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer += value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer -= value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
{
   return fElementBuffer.GetComputeStream();
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
{
   return fElementBuffer.SetComputeStream(stream);
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
{
   cudaEvent_t event;
   cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
   cudaEventRecord(event, A.GetComputeStream());
   cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
   cudaEventDestroy(event);
}
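
// Editor's note on the pattern above (not part of the original header): an event is
// recorded on A's compute stream and this matrix's stream is instructed to wait on it,
// so the synchronization is expressed as stream ordering on the device rather than by
// blocking the host thread; destroying the event right away is safe because CUDA
// releases it only once the recorded work has completed.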

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
{
   AFloat * elementPointer = fElementBuffer;
   elementPointer += j * fNRows + i;
   return TCudaDeviceReference<AFloat>(elementPointer);
}
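
// Editor's note (not part of the original header): the index arithmetic above,
// offset = j * fNRows + i, implies column-major storage of the element buffer.
// For a 2 x 3 matrix, element (i = 1, j = 2) therefore sits at offset 2 * 2 + 1 = 5,
// the last element of the buffer.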

} // namespace DNN
} // namespace TMVA

#endif