Logo ROOT   6.16/01
Reference Guide
CudaMatrix.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////////
13// Contains the TCudaMatrix class for the representation of matrices //
14// on CUDA devices as well as the TCudaDeviceReference class which //
15// is a helper class to emulate lvalue references to floating point //
16// values on the device. //
17///////////////////////////////////////////////////////////////////////
18
19#ifndef TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
20#define TMVA_DNN_ARCHITECTURES_CUDA_CUDAMATRIX
21
22#include "cuda.h"
23#include "cuda_runtime.h"
24#include "cublas_v2.h"
25#include "curand_kernel.h"
26
27#include "TMatrixT.h"
28#include "CudaBuffers.h"
29
30#define CUDACHECK(ans) {cudaError((ans), __FILE__, __LINE__); }
31
32namespace TMVA {
33namespace DNN {
34
35/** Function to check cuda return code. Taken from
36 * http://stackoverflow.com/questions/14038589/
37 */
38inline void cudaError(cudaError_t code, const char *file, int line, bool abort=true);
39
40//____________________________________________________________________________
41//
42// Cuda Device Reference
43//____________________________________________________________________________
44
45/** TCudaDeviceReference
46 *
47 * Helper class emulating lvalue references for AFloat values that are
48 * physically on the device. Allows for example to assign to matrix elements.
49 * Note that device access through CudaDeviceReferences enforces synchronization
50 * with all streams and thus qualifies as performance killer. Only used for
51 * testing.
52 */
53template<typename AFloat>
55{
56private:
57
59
60public:
61
62 TCudaDeviceReference(AFloat * devicePointer);
63
64 operator AFloat();
65
66 void operator=(const TCudaDeviceReference &other);
67 void operator=(AFloat value);
68 void operator+=(AFloat value);
69 void operator-=(AFloat value);
70};
71
72//____________________________________________________________________________
73//
74// Cuda Matrix
75//____________________________________________________________________________
76
77/** TCudaMatrix Class
78 *
79 * The TCudaMatrix class represents matrices on a CUDA device. The elements
80 * of the matrix are stored in a TCudaDeviceBuffer object which takes care of
81 * the allocation and freeing of the device memory. TCudaMatrices are lightweight
82 * object, that means on assignment and copy creation only a shallow copy is
83 * performed and no new element buffer allocated. To perform a deep copy use
84 * the static Copy method of the TCuda architecture class.
85 *
86 * The TCudaDeviceBuffer has an associated cuda stream, on which the data is
87 * transferred to the device. This stream can be accessed through the
88 * GetComputeStream member function and used to synchronize computations.
89 *
90 * The TCudaMatrix class also holds static references to CUDA resources.
91 * Those are the cublas handle, a buffer of curand states for the generation
92 * of random numbers as well as a vector containing ones, which is used for
93 * summing column matrices using matrix-vector multiplication. The class also
94 * has a static buffer for returning results from the device.
95 *
96 */
97template<typename AFloat>
99{
100public:
101
102private:
103
104 static size_t fInstances; ///< Current number of matrix instances.
105 static cublasHandle_t fCublasHandle;
106 static AFloat * fDeviceReturn; ///< Buffer for kernel return values.
107 static AFloat * fOnes; ///< Vector used for summations of columns.
108 static size_t fNOnes; ///< Current length of the one vector.
109 static curandState_t * fCurandStates;
110 static size_t fNCurandStates;
111
112 size_t fNRows;
113 size_t fNCols;
115
116public:
117
118 static AFloat * GetOnes() {return fOnes;}
119
121 TCudaMatrix(size_t i, size_t j);
123 TCudaMatrix(TCudaDeviceBuffer<AFloat> buffer, size_t m, size_t n);
124
125 TCudaMatrix(const TCudaMatrix &) = default;
126 TCudaMatrix( TCudaMatrix &&) = default;
127 TCudaMatrix & operator=(const TCudaMatrix &) = default;
129 ~TCudaMatrix() = default;
130
131 /** Convert cuda matrix to Root TMatrix. Performs synchronous data transfer. */
132 operator TMatrixT<AFloat>() const;
133
134 inline cudaStream_t GetComputeStream() const;
135 inline void SetComputeStream(cudaStream_t stream);
136 /** Set the return buffer on the device to the specified value. This is
137 * required for example for reductions in order to initialize the
138 * accumulator. */
139 inline static void ResetDeviceReturn(AFloat value = 0.0);
140 /** Transfer the value in the device return buffer to the host. This
141 * tranfer is synchronous */
142 inline static AFloat GetDeviceReturn();
143 /** Return device pointer to the device return buffer */
144 inline static AFloat * GetDeviceReturnPointer() {return fDeviceReturn;}
145 inline static curandState_t * GetCurandStatesPointer() {return fCurandStates;}
146
147 /** Blocking synchronization with the associated compute stream, if it's
148 * not the default stream. */
149 inline void Synchronize(const TCudaMatrix &) const;
150
151 size_t GetNrows() const {return fNRows;}
152 size_t GetNcols() const {return fNCols;}
153 size_t GetNoElements() const {return fNRows * fNCols;}
154
155 const AFloat * GetDataPointer() const {return fElementBuffer;}
156 AFloat * GetDataPointer() {return fElementBuffer;}
157 const cublasHandle_t & GetCublasHandle() const {return fCublasHandle;}
158
159 /** Access to elements of device matrices provided through TCudaDeviceReference
160 * class. Note that access is synchronous end enforces device synchronization
161 * on all streams. Only used for testing. */
162 TCudaDeviceReference<AFloat> operator()(size_t i, size_t j) const;
163
164 void Print() const {
165 TMatrixT<AFloat> mat(*this);
166 mat.Print();
167 }
168
169 void Zero() {
170 cudaMemset(GetDataPointer(), 0, sizeof(AFloat) * GetNoElements());
171 }
172
173
174private:
175
176 /** Initializes all shared devices resource and makes sure that a sufficient
177 * number of curand states are allocated on the device and initialized as
178 * well as that the one-vector for the summation over columns has the right
179 * size. */
182
183};
184
185//
186// Inline Functions.
187//______________________________________________________________________________
188inline void cudaError(cudaError_t code, const char *file, int line, bool abort)
189{
190 if (code != cudaSuccess)
191 {
192 fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
193 if (abort) exit(code);
194 }
195}
196
197//______________________________________________________________________________
198template<typename AFloat>
200 : fDevicePointer(devicePointer)
201{
202 // Nothing to do here.
203}
204
205//______________________________________________________________________________
206template<typename AFloat>
208{
209 AFloat buffer;
210 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
211 cudaMemcpyDeviceToHost);
212 return buffer;
213}
214
215//______________________________________________________________________________
216template<typename AFloat>
218{
219 cudaMemcpy(fDevicePointer, other.fDevicePointer, sizeof(AFloat),
220 cudaMemcpyDeviceToDevice);
221}
222
223//______________________________________________________________________________
224template<typename AFloat>
226{
227 AFloat buffer = value;
228 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
229 cudaMemcpyHostToDevice);
230}
231
232//______________________________________________________________________________
233template<typename AFloat>
235{
236 AFloat buffer;
237 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
238 cudaMemcpyDeviceToHost);
239 buffer += value;
240 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
241 cudaMemcpyHostToDevice);
242}
243
244//______________________________________________________________________________
245template<typename AFloat>
247{
248 AFloat buffer;
249 cudaMemcpy(& buffer, fDevicePointer, sizeof(AFloat),
250 cudaMemcpyDeviceToHost);
251 buffer -= value;
252 cudaMemcpy(fDevicePointer, & buffer, sizeof(AFloat),
253 cudaMemcpyHostToDevice);
254}
255
256//______________________________________________________________________________
257template<typename AFloat>
258inline cudaStream_t TCudaMatrix<AFloat>::GetComputeStream() const
259{
260 return fElementBuffer.GetComputeStream();
261}
262
263//______________________________________________________________________________
264template<typename AFloat>
265inline void TCudaMatrix<AFloat>::SetComputeStream(cudaStream_t stream)
266{
267 return fElementBuffer.SetComputeStream(stream);
268}
269
270//______________________________________________________________________________
271template<typename AFloat>
273{
274 cudaEvent_t event;
275 cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
276 cudaEventRecord(event, A.GetComputeStream());
277 cudaStreamWaitEvent(fElementBuffer.GetComputeStream(), event, 0);
278 cudaEventDestroy(event);
279}
280
281//______________________________________________________________________________
282template<typename AFloat>
284{
285 AFloat buffer = value;
286 cudaMemcpy(fDeviceReturn, & buffer, sizeof(AFloat), cudaMemcpyHostToDevice);
287}
288
289//______________________________________________________________________________
290template<typename AFloat>
292{
293 AFloat buffer;
294 cudaMemcpy(& buffer, fDeviceReturn, sizeof(AFloat), cudaMemcpyDeviceToHost);
295 return buffer;
296}
297
298//______________________________________________________________________________
299template<typename AFloat>
301{
302 AFloat * elementPointer = fElementBuffer;
303 elementPointer += j * fNRows + i;
304 return TCudaDeviceReference<AFloat>(elementPointer);
305}
306
307} // namespace DNN
308} // namespace TMVA
309
310#endif
TCudaDeviceBuffer.
Definition: CudaBuffers.h:98
TCudaDeviceReference.
Definition: CudaMatrix.h:55
void operator-=(AFloat value)
Definition: CudaMatrix.h:246
TCudaDeviceReference(AFloat *devicePointer)
Definition: CudaMatrix.h:199
void operator=(const TCudaDeviceReference &other)
Definition: CudaMatrix.h:217
void operator+=(AFloat value)
Definition: CudaMatrix.h:234
TCudaMatrix Class.
Definition: CudaMatrix.h:99
TCudaDeviceBuffer< AFloat > fElementBuffer
Definition: CudaMatrix.h:114
size_t GetNcols() const
Definition: CudaMatrix.h:152
TCudaMatrix & operator=(const TCudaMatrix &)=default
static curandState_t * fCurandStates
Definition: CudaMatrix.h:109
void Print() const
Definition: CudaMatrix.h:164
TCudaMatrix(const TMatrixT< AFloat > &)
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition: CudaMatrix.h:291
void SetComputeStream(cudaStream_t stream)
Definition: CudaMatrix.h:265
static AFloat * fDeviceReturn
Buffer for kernel return values.
Definition: CudaMatrix.h:106
cudaStream_t GetComputeStream() const
Definition: CudaMatrix.h:258
size_t GetNoElements() const
Definition: CudaMatrix.h:153
void InitializeCuda()
Initializes all shared devices resource and makes sure that a sufficient number of curand states are ...
TCudaDeviceReference< AFloat > operator()(size_t i, size_t j) const
Access to elements of device matrices provided through TCudaDeviceReference class.
Definition: CudaMatrix.h:300
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition: CudaMatrix.h:144
const cublasHandle_t & GetCublasHandle() const
Definition: CudaMatrix.h:157
static AFloat * fOnes
Vector used for summations of columns.
Definition: CudaMatrix.h:107
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition: CudaMatrix.h:283
const AFloat * GetDataPointer() const
Definition: CudaMatrix.h:155
TCudaMatrix(TCudaDeviceBuffer< AFloat > buffer, size_t m, size_t n)
static size_t fNCurandStates
Definition: CudaMatrix.h:110
TCudaMatrix(const TCudaMatrix &)=default
void Synchronize(const TCudaMatrix &) const
Blocking synchronization with the associated compute stream, if it's not the default stream.
Definition: CudaMatrix.h:272
static AFloat * GetOnes()
Definition: CudaMatrix.h:118
static cublasHandle_t fCublasHandle
Definition: CudaMatrix.h:105
static size_t fInstances
Current number of matrix instances.
Definition: CudaMatrix.h:104
TCudaMatrix(size_t i, size_t j)
size_t GetNrows() const
Definition: CudaMatrix.h:151
TCudaMatrix & operator=(TCudaMatrix &&)=default
AFloat * GetDataPointer()
Definition: CudaMatrix.h:156
static size_t fNOnes
Current length of the one vector.
Definition: CudaMatrix.h:108
TCudaMatrix(TCudaMatrix &&)=default
static curandState_t * GetCurandStatesPointer()
Definition: CudaMatrix.h:145
void Print(Option_t *name="") const
Print the matrix as a table of elements.
TMatrixT.
Definition: TMatrixT.h:39
TLine * line
const Int_t n
Definition: legend1.C:16
static double A[]
void cudaError(cudaError_t code, const char *file, int line, bool abort=true)
Function to check cuda return code.
Definition: CudaMatrix.h:188
Abstract ClassifierFactory template that handles arbitrary types.
Definition: file.py:1
auto * m
Definition: textangle.C:8