ROOT 6.14/05 Reference Guide
CudaMatrix.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 13/07/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

///////////////////////////////////////////////////////////////////////
// Contains the TCudaMatrix class for the representation of matrices //
// on CUDA devices as well as the TCudaDeviceReference class which   //
// is a helper class to emulate lvalue references to floating point  //
// values on the device.                                              //
///////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX

#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "curand_kernel.h"

#include "TMatrixT.h"
#include "CudaBuffers.h"

#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }

namespace TMVA {
namespace DNN {

/** Function to check cuda return code. Taken from
 *  http://stackoverflow.com/questions/14038589/
 */
inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
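
// Editor's note: a minimal usage sketch (not part of the original header) showing how
// the CUDACHECK macro defined above wraps a CUDA runtime call; the pointer and size
// used below are hypothetical.
//
//    float *devPtr = nullptr;
//    CUDACHECK(cudaMalloc(&devPtr, 16 * sizeof(float)));  // prints file/line and aborts on failure
//    CUDACHECK(cudaFree(devPtr));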

//____________________________________________________________________________
//
// Cuda Device Reference
//____________________________________________________________________________

/** TCudaDeviceReference
 *
 * Helper class emulating lvalue references for AFloat values that are
 * physically on the device. Allows, for example, assignment to matrix elements.
 * Note that device access through a TCudaDeviceReference enforces synchronization
 * with all streams and is therefore a performance killer. It is only used for
 * testing. (A usage sketch follows the class declaration below.)
 */
template<typename AFloat>
class TCudaDeviceReference
{
private:

   AFloat * fDevicePointer;

public:

   TCudaDeviceReference(AFloat * devicePointer);

   operator AFloat();

   void operator=(const TCudaDeviceReference &other);
   void operator=(AFloat value);
   void operator+=(AFloat value);
   void operator-=(AFloat value);
};
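
// Editor's note: a usage sketch (not part of the original header) illustrating the
// lvalue-reference emulation described above; the matrix M is hypothetical.
//
//    TCudaMatrix<float> M(2, 3);   // 2 x 3 matrix on the device
//    M(0, 1) = 2.5f;               // operator()(i,j) yields a TCudaDeviceReference;
//                                  // operator=(AFloat) copies the value to the device
//    M(0, 1) += 1.0f;              // read-modify-write via two synchronous cudaMemcpy calls
//    float x = M(0, 1);            // operator AFloat() copies the element back to the host
//
// Every such access implies a synchronous host/device transfer, which is why the
// class is intended for testing only.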

//____________________________________________________________________________
//
// Cuda Matrix
//____________________________________________________________________________

/** TCudaMatrix Class
 *
 * The TCudaMatrix class represents matrices on a CUDA device. The elements
 * of the matrix are stored in a TCudaDeviceBuffer object, which takes care of
 * the allocation and freeing of the device memory. TCudaMatrix objects are
 * lightweight: on assignment and copy construction only a shallow copy is
 * performed and no new element buffer is allocated. To perform a deep copy use
 * the static Copy method of the TCuda architecture class.
 *
 * The TCudaDeviceBuffer has an associated CUDA stream, on which the data is
 * transferred to the device. This stream can be accessed through the
 * GetComputeStream member function and used to synchronize computations.
 *
 * The TCudaMatrix class also holds static references to CUDA resources:
 * the cuBLAS handle, a buffer of curand states for the generation of random
 * numbers, and a vector of ones, which is used for summing over matrix columns
 * via matrix-vector multiplication. The class also has a static buffer for
 * returning results from the device. (A usage sketch follows the class
 * declaration below.)
 */
template<typename AFloat>
class TCudaMatrix
{
public:

private:

   static size_t          fInstances;    ///< Current number of matrix instances.
   static cublasHandle_t  fCublasHandle;
   static AFloat        * fDeviceReturn; ///< Buffer for kernel return values.
   static AFloat        * fOnes;         ///< Vector used for summations of columns.
   static size_t          fNOnes;        ///< Current length of the one vector.
   static curandState_t * fCurandStates;
   static size_t          fNCurandStates;

   size_t fNRows;
   size_t fNCols;
   TCudaDeviceBuffer<AFloat> fElementBuffer;

public:

   static AFloat * GetOnes() {return fOnes;}

   TCudaMatrix();
   TCudaMatrix(size_t i, size_t j);
   TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);

   TCudaMatrix(const TCudaMatrix  &) = default;
   TCudaMatrix(      TCudaMatrix &&) = default;
   TCudaMatrix & operator=(const TCudaMatrix  &) = default;
   TCudaMatrix & operator=(      TCudaMatrix &&) = default;
   ~TCudaMatrix() = default;

   /** Convert the CUDA matrix to a ROOT TMatrix. Performs a synchronous data transfer. */
   operator TMatrixT<Double_t>() const;

   inline cudaStream_t GetComputeStream() const;
   inline void SetComputeStream(cudaStream_t stream);
   /** Set the return buffer on the device to the specified value. This is
    *  required, for example, for reductions in order to initialize the
    *  accumulator. */
   inline static void ResetDeviceReturn(AFloat value = 0.0);
   /** Transfer the value in the device return buffer to the host. This
    *  transfer is synchronous. */
   inline static AFloat GetDeviceReturn();
   /** Return the device pointer to the device return buffer. */
   inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
   inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}

   /** Blocking synchronization with the associated compute stream, if it's
    *  not the default stream. */
   inline void Synchronize(const TCudaMatrix &) const;

   size_t GetNrows() const {return fNRows;}
   size_t GetNcols() const {return fNCols;}
   size_t GetNoElements() const {return fNRows * fNCols;}
   const AFloat * GetDataPointer() const {return fElementBuffer;}
   AFloat *       GetDataPointer()       {return fElementBuffer;}
   const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}

   /** Access to elements of device matrices provided through the TCudaDeviceReference
    *  class. Note that access is synchronous and enforces device synchronization
    *  on all streams. Only used for testing. */
   TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;

   void Print() const {
      TMatrixT<Double_t> mat(*this);
      mat.Print();
   }

   void Zero() {
      // to be checked: GetDataPointer() returns a device pointer, so this host-side
      // loop is only valid if the buffer is host-accessible; otherwise a cudaMemset
      // or a device kernel would be needed.
      AFloat * p = GetDataPointer();
      for (size_t i = 0; i < GetNoElements(); ++i)
         p[i] = 0;
   }


private:

   /** Initializes all shared device resources and makes sure that a sufficient
    *  number of curand states are allocated on the device and initialized, and
    *  that the one-vector used for the summation over columns has the right
    *  size. */
   void InitializeCuda();
   void InitializeCurandStates();

};
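
// Editor's note: a hedged usage sketch (not part of the original header) summarising
// the semantics described in the class documentation above; the names A, B, C, s and h
// are hypothetical.
//
//    TCudaMatrix<float> A(100, 50);          // allocates a device buffer via TCudaDeviceBuffer
//    TCudaMatrix<float> B = A;               // shallow copy: B shares A's element buffer
//    TCudaMatrix<float> C(100, 50);          // separate matrix with its own buffer and stream
//    C.Synchronize(A);                       // C's compute stream waits for work queued on A's stream
//    cudaStream_t s = A.GetComputeStream();  // stream on which A's data transfers are queued
//    TMatrixT<Double_t> h(A);                // synchronous device-to-host copy into a ROOT matrix
//    A.Print();                              // uses the same conversion internally
//
// A deep copy requires the static Copy method of the TCuda architecture class, as
// noted in the documentation above.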

//
// Inline Functions.
//______________________________________________________________________________
inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::TCudaDeviceReference(AFloat * devicePointer)
    : fDevicePointer(devicePointer)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat>::operator AFloat()
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(const TCudaDeviceReference &other)
{
   cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator=(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator+=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer += value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceReference<AFloat>::operator-=(AFloat value)
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
              cudaMemcpyDeviceToHost);
   buffer -= value;
   cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
              cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
{
   return fElementBuffer.GetComputeStream();
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
{
   return fElementBuffer.SetComputeStream(stream);
}

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::Synchronize(const TCudaMatrix &A) const
{
   cudaEvent_t event;
   cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
   cudaEventRecord(event, A.GetComputeStream());
   cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
   cudaEventDestroy(event);
}
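
// Editor's note on the pattern above (not part of the original header): an event is
// recorded on A's compute stream and this matrix's stream is instructed to wait on it,
// so the synchronization is expressed as stream ordering on the device rather than by
// blocking the host thread; destroying the event right away is safe because CUDA
// releases it only once the recorded work has completed.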

//______________________________________________________________________________
template<typename AFloat>
inline void TCudaMatrix<AFloat>::ResetDeviceReturn(AFloat value)
{
   AFloat buffer = value;
   cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
}

//______________________________________________________________________________
template<typename AFloat>
inline AFloat TCudaMatrix<AFloat>::GetDeviceReturn()
{
   AFloat buffer;
   cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceReference<AFloat> TCudaMatrix<AFloat>::operator()(size_t i, size_t j) const
{
   AFloat * elementPointer = fElementBuffer;
   elementPointer += j * fNRows + i;
   return TCudaDeviceReference<AFloat>(elementPointer);
}
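
// Editor's note (not part of the original header): the index arithmetic above,
// offset = j * fNRows + i, implies column-major storage of the element buffer.
// For a 2 x 3 matrix, element (i = 1, j = 2) therefore sits at offset 2 * 2 + 1 = 5,
// the last element of the buffer.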

} // namespace DNN
} // namespace TMVA

#endif