Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Arithmetic.cu
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12///////////////////////////////////////////////////////////////////
13// Contains additional arithmetic functions required by the CUDA //
14// neural network implementation. //
15///////////////////////////////////////////////////////////////////
16
19#include "Kernels.cuh"
20
21namespace TMVA
22{
23namespace DNN
24{
25
26//____________________________________________________________________________
27template<>
29 const TCudaMatrix<float> &A,
30 const TCudaMatrix<float> &B)
31{
32 int m, n, k;
33 m = A.GetNrows();
34 k = A.GetNcols();
35 n = B.GetNcols();
36 float alpha = 1.0, beta = 0.0;
37
38 cudaStream_t s = A.GetComputeStream();
39 cublasSetStream(A.GetCublasHandle(), s);
40
41 // Compute C = beta * C + alpha * (A * B)
42 cublasSgemm(A.GetCublasHandle(),
43 CUBLAS_OP_N, CUBLAS_OP_N,
44 m, n, k, & alpha,
45 A.GetDataPointer(), m, // *A, lda
46 B.GetDataPointer(), k, // *B, ldb
47 & beta, // beta
48 C.GetDataPointer(), m); // *C, ldc
49
50 C.SetComputeStream(s);
51}
52
53//____________________________________________________________________________
54template<>
56 const TCudaMatrix<double> &A,
57 const TCudaMatrix<double> &B)
58{
59 int m, n, k;
60 m = A.GetNrows();
61 k = A.GetNcols();
62 n = B.GetNcols();
63 double alpha = 1.0, beta = 0.0;
64
65 cudaStream_t s = A.GetComputeStream();
66 cublasSetStream(A.GetCublasHandle(), s);
67
68 // Compute C = beta * C + alpha * (A * B)
69 cublasDgemm(A.GetCublasHandle(),
70 CUBLAS_OP_N, CUBLAS_OP_N,
71 m, n, k, & alpha,
72 A.GetDataPointer(), m, // *A, lda
73 B.GetDataPointer(), k, // *B, ldb
74 & beta, // beta
75 C.GetDataPointer(), m); // *C, ldc
76
77 C.SetComputeStream(s);
78}
79
80//____________________________________________________________________________
81template<>
83 const TCudaMatrix<float> & A,
84 const TCudaMatrix<float> & B,
85 float alpha, float beta)
86{
87 int m, n, k;
88 k = A.GetNrows();
89 m = A.GetNcols();
90 n = B.GetNcols();
91 //float alpha = 1.0, beta = 0.0;
92
93 cudaStream_t s = A.GetComputeStream();
94 cublasSetStream(A.GetCublasHandle(), s);
95
96 // Compute C = beta * C + alpha * (A^T * B)
97 cublasSgemm(A.GetCublasHandle(),
98 CUBLAS_OP_T, CUBLAS_OP_N,
99 m, n, k, & alpha,
100 A.GetDataPointer(), k, // *A, lda
101 B.GetDataPointer(), k, // *B, ldb
102 & beta, // beta
103 C.GetDataPointer(), m); // *C, ldc
104
105 C.SetComputeStream(s);
106}
107//____________________________________________________________________________
108template<>
110 const TCudaMatrix<double> & A,
111 const TCudaMatrix<double> & B,
112 double alpha, double beta)
113{
114 int m, n, k;
115 k = A.GetNrows();
116 m = A.GetNcols();
117 n = B.GetNcols();
118 //double alpha = 1.0, beta = 0.0;
119
120 cudaStream_t s = A.GetComputeStream();
121 cublasSetStream(A.GetCublasHandle(), s);
122
123 // Compute C = beta * C + alpha * (A^T * B)
124 cublasDgemm(A.GetCublasHandle(),
125 CUBLAS_OP_T, CUBLAS_OP_N,
126 m, n, k, & alpha,
127 A.GetDataPointer(), k, // *A, lda
128 B.GetDataPointer(), k, // *B, ldb
129 & beta, // beta
130 C.GetDataPointer(), m); // *C, ldc
131
132 C.SetComputeStream(s);
133}
134
135//____________________________________________________________________________
136template<typename AFloat>
138 const TCudaMatrix<AFloat> &A)
139{
140 dim3 blockDims = TDevice::BlockDims2D();
141 dim3 gridDims = TDevice::GridDims2D(B);
142 cudaStream_t s = A.GetComputeStream();
143 ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
144 A.GetDataPointer(),
145 A.GetNrows(),
146 A.GetNcols());
147 B.SetComputeStream(s);
148}
149//____________________________________________________________________________
150template<typename AFloat>
152 const TCudaTensor<AFloat> &A)
153{
154 dim3 blockDims = TDevice::BlockDims2D();
155 int ncols = A.GetFirstSize(); // ncols (size X)
156 int nrows = A.GetFirstStride(); // nrows (y size)
157 if (ncols == 1) {
158 ncols = A.GetWSize();
159 nrows = A.GetHSize();
160 }
161
162 dim3 gridDims = TDevice::GridDims2D(nrows, ncols);
163 cudaStream_t s = A.GetComputeStream();
164 ::TMVA::DNN::Cuda::Hadamard<<<gridDims, blockDims, 0, s>>>(B.GetDataPointer(),
165 A.GetDataPointer(),
166 nrows,ncols);
167 B.SetComputeStream(s);
168}
169
170//____________________________________________________________________________
171template<typename AFloat>
173{
174 dim3 blockDims = TDevice::BlockDims2D();
175 dim3 gridDims = TDevice::GridDims2D(A);
176 cudaStream_t s = A.GetComputeStream();
177
179 ::TMVA::DNN::Cuda::ReduceMatrix<<<gridDims, blockDims, 0, s>>>(
181 A.GetDataPointer(),
182 A.GetNrows(),
183 A.GetNcols());
185}
186
187//____________________________________________________________________________
188template<>
190 const TCudaMatrix<float> & A,
191 float alpha, float beta)
192{
193 int m, n;
194 m = A.GetNrows();
195 n = A.GetNcols();
196 //float alpha = 1.0, beta = 0.0;
197
198 cudaStream_t s = A.GetComputeStream();
199 cublasSetStream(A.GetCublasHandle(), s);
200
201 // Compute C = beta * C + alpha * (A * B)
202 cublasSgemv(A.GetCublasHandle(), CUBLAS_OP_T,
203 m, n, & alpha,
204 A.GetDataPointer(), m, // *A, lda
205 TCudaMatrix<float>::GetOnes(), 1, // *x, incx
206 & beta, B.GetDataPointer(), 1); // beta, *y, incy
207
208 B.SetComputeStream(s);
209}
210
211//____________________________________________________________________________
212template<>
214 const TCudaMatrix<double> & A,
215 double alpha, double beta)
216{
217 int m, n;
218 m = A.GetNrows();
219 n = A.GetNcols();
220 //double alpha = 1.0, beta = 0.0;
221
222 cudaStream_t s = A.GetComputeStream();
223 cublasSetStream(A.GetCublasHandle(), s);
224
225 // Compute C = beta * C + alpha * (A * B)
226 cublasDgemv(A.GetCublasHandle(), CUBLAS_OP_T,
227 m, n, & alpha,
228 A.GetDataPointer(), m, // *A, lda
229 TCudaMatrix<double>::GetOnes(), 1, // *x, incx
230 & beta, B.GetDataPointer(), 1); // beta, *y, incy
231
232 B.SetComputeStream(s);
233}
234
235template<>
237 const TCudaMatrix<float> & A)
238{
239 int m, n;
240 m = A.GetNrows();
241 n = A.GetNcols();
242 float alpha = 1.0, beta = 0.0;
243
244 cudaStream_t s = A.GetComputeStream();
245 cublasSetStream(A.GetCublasHandle(), s);
246
247 // Compute C = beta * C + alpha * (A * B)
248 cublasSgemv(A.GetCublasHandle(), CUBLAS_OP_N,
249 m, n, & alpha,
250 A.GetDataPointer(), m, // *A, lda
251 TCudaMatrix<float>::GetOnes(), 1, // *x, incx
252 & beta, B.GetDataPointer(), 1); // beta, *y, incy
253
254 B.SetComputeStream(s);
255}
256
257//____________________________________________________________________________
258template<>
260 const TCudaMatrix<double> & A)
261{
262 int m, n;
263 m = A.GetNrows();
264 n = A.GetNcols();
265 double alpha = 1.0, beta = 0.0;
266
267 cudaStream_t s = A.GetComputeStream();
268 cublasSetStream(A.GetCublasHandle(), s);
269
270 // Compute C = beta * C + alpha * (A * B)
271 cublasDgemv(A.GetCublasHandle(), CUBLAS_OP_N,
272 m, n, & alpha,
273 A.GetDataPointer(), m, // *A, lda
274 TCudaMatrix<double>::GetOnes(), 1, // *x, incx
275 & beta, B.GetDataPointer(), 1); // beta, *y, incy
276
277 B.SetComputeStream(s);
278}
279
280
281
282////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
283/// \brief Checks two matrices for element-wise equality.
284/// \tparam AFloat An architecture-specific floating point number type.
285/// \param A The first matrix.
286/// \param B The second matrix.
287/// \param epsilon Equality tolerance, needed to address floating point arithmetic.
288/// \return Whether the two matrices can be considered equal element-wise
289////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
290template<typename AFloat>
292{
293 if (A.GetNrows() != B.GetNrows() || A.GetNcols() != B.GetNcols()) {
294 Fatal("AlmostEquals", "The passed matrices have unequal shapes.");
295 }
296
297 dim3 blockDims = TDevice::BlockDims2D();
298 dim3 gridDims = TDevice::GridDims2D(A);
299 cudaStream_t s = A.GetComputeStream();
300
301 bool * dResult = 0;
302 cudaMalloc((void**) &dResult, sizeof(bool));
303 cudaMemset(dResult, 1, sizeof(bool));
304
305 ::TMVA::DNN::Cuda::AlmostEquals<<<gridDims, blockDims, 0, s>>>(dResult, A.GetDataPointer(), B.GetDataPointer(),
306 epsilon, A.GetNrows(), A.GetNcols());
307
308 bool result;
309 cudaMemcpy(&result, dResult, sizeof(bool), cudaMemcpyDeviceToHost);
310 cudaFree(dResult);
311
312 return result;
313}
314
315//____________________________________________________________________________
316template<>
318 const TCudaMatrix<float> & A,
319 float alpha)
320{
321 cudaStream_t s = 0;
322 cublasSetStream(A.GetCublasHandle(), s);
323 cublasSaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
324 A.GetDataPointer(), 1,
325 B.GetDataPointer(), 1);
326}
327
328//____________________________________________________________________________
329template<>
331 const TCudaMatrix<double> & A,
332 double alpha)
333{
334 cudaStream_t s = 0;
335 cublasSetStream(A.GetCublasHandle(), s);
336 cublasDaxpy(A.GetCublasHandle(), A.GetNoElements(), &alpha,
337 A.GetDataPointer(), 1,
338 B.GetDataPointer(), 1);
339}
340
341//____________________________________________________________________________
342template<typename AFloat>
344 const TCudaTensor<AFloat> & A,
345 AFloat alpha)
346{
347 // should re-implemented at tensor level
348 for (size_t i = 0; i < A.GetFirstSize(); ++i) {
349 TCudaMatrix<AFloat> B_m = B.At(i).GetMatrix();
350 TCudaMatrix<AFloat> A_m = A.At(i).GetMatrix();
351 ScaleAdd(B_m, A_m, alpha);
352 }
353}
354
355//____________________________________________________________________________
356template<typename AFloat>
358{
359 dim3 blockDims = TDevice::BlockDims2D();
360 dim3 gridDims = TDevice::GridDims2D(A);
361 cudaStream_t s = A.GetComputeStream();
362 ::TMVA::DNN::Cuda::ConstAdd<<<gridDims, blockDims, 0, s>>>(
363 A.GetDataPointer(),
364 beta,
365 (int) A.GetNrows(),
366 (int) A.GetNcols());
367}
368
369//____________________________________________________________________________
370template<typename AFloat>
372{
373 dim3 blockDims = TDevice::BlockDims2D();
374 dim3 gridDims = TDevice::GridDims2D(A);
375 cudaStream_t s = A.GetComputeStream();
376 ::TMVA::DNN::Cuda::ConstMult<<<gridDims, blockDims, 0, s>>>(
377 A.GetDataPointer(),
378 beta,
379 (int) A.GetNrows(),
380 (int) A.GetNcols());
381}
382
383//____________________________________________________________________________
384template<typename AFloat>
386{
387 dim3 blockDims = TDevice::BlockDims2D();
388 dim3 gridDims = TDevice::GridDims2D(A);
389 cudaStream_t s = A.GetComputeStream();
390 ::TMVA::DNN::Cuda::ReciprocalElementWise<<<gridDims, blockDims, 0, s>>>(
391 A.GetDataPointer(),
392 (int) A.GetNrows(),
393 (int) A.GetNcols());
394}
395
396//____________________________________________________________________________
397template<typename AFloat>
399{
400 dim3 blockDims = TDevice::BlockDims2D();
401 dim3 gridDims = TDevice::GridDims2D(A);
402 cudaStream_t s = A.GetComputeStream();
403 ::TMVA::DNN::Cuda::SquareElementWise<<<gridDims, blockDims, 0, s>>>(
404 A.GetDataPointer(),
405 (int) A.GetNrows(),
406 (int) A.GetNcols());
407}
408
409//____________________________________________________________________________
410template<typename AFloat>
412{
413 dim3 blockDims = TDevice::BlockDims2D();
414 dim3 gridDims = TDevice::GridDims2D(A);
415 cudaStream_t s = A.GetComputeStream();
416 ::TMVA::DNN::Cuda::SqrtElementWise<<<gridDims, blockDims, 0, s>>>(
417 A.GetDataPointer(),
418 (int) A.GetNrows(),
419 (int) A.GetNcols());
420}
421
422/// Adam updates
423//____________________________________________________________________________
424template<typename AFloat>
426{
427 dim3 blockDims = TDevice::BlockDims2D();
428 dim3 gridDims = TDevice::GridDims2D(A);
429 cudaStream_t s = A.GetComputeStream();
430 ::TMVA::DNN::Cuda::AdamUpdate<<<gridDims, blockDims, 0, s>>>(
431 A.GetDataPointer(),
432 M.GetDataPointer(),
433 V.GetDataPointer(),
434 (int) A.GetNrows(),
435 (int) A.GetNcols(),
436 alpha, eps);
437}
438
439//____________________________________________________________________________
440template<typename AFloat>
442{
443 dim3 blockDims = TDevice::BlockDims2D();
444 dim3 gridDims = TDevice::GridDims2D(A);
445 cudaStream_t s = A.GetComputeStream();
446 ::TMVA::DNN::Cuda::AdamUpdateFirstMom<<<gridDims, blockDims, 0, s>>>(
447 A.GetDataPointer(),
448 B.GetDataPointer(),
449 (int) A.GetNrows(),
450 (int) A.GetNcols(), beta);
451}
452
453//____________________________________________________________________________
454template<typename AFloat>
456{
457 dim3 blockDims = TDevice::BlockDims2D();
458 dim3 gridDims = TDevice::GridDims2D(A);
459 cudaStream_t s = A.GetComputeStream();
460 ::TMVA::DNN::Cuda::AdamUpdateSecondMom<<<gridDims, blockDims, 0, s>>>(
461 A.GetDataPointer(),
462 B.GetDataPointer(),
463 (int) A.GetNrows(),
464 (int) A.GetNcols(), beta);
465}
466
467
468
469
470
471
472
473
474} // DNN
475} // TMVA
void Fatal(const char *location, const char *msgfmt,...)
Use this function in case of a fatal error. It will abort the program.
Definition TError.cxx:244
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
TCudaMatrix Class.
Definition CudaMatrix.h:103
size_t GetNcols() const
Definition CudaMatrix.h:160
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
Definition CudaMatrix.h:301
void SetComputeStream(cudaStream_t stream)
Definition CudaMatrix.h:275
cudaStream_t GetComputeStream() const
Definition CudaMatrix.h:268
size_t GetNoElements() const
Definition CudaMatrix.h:161
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
Definition CudaMatrix.h:151
const cublasHandle_t & GetCublasHandle() const
Definition CudaMatrix.h:165
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
Definition CudaMatrix.h:293
const AFloat * GetDataPointer() const
Definition CudaMatrix.h:163
size_t GetNrows() const
Definition CudaMatrix.h:159
TCudaTensor Class.
Definition CudaTensor.h:84
TCudaTensor< AFloat > At(size_t i) const
Definition CudaTensor.h:364
size_t GetWSize() const
Definition CudaTensor.h:289
cudaStream_t GetComputeStream() const
Definition CudaTensor.h:213
const AFloat * GetDataPointer() const
Definition CudaTensor.h:194
size_t GetHSize() const
Definition CudaTensor.h:283
size_t GetFirstStride() const
Definition CudaTensor.h:276
void SetComputeStream(cudaStream_t stream)
Definition CudaTensor.h:216
size_t GetFirstSize() const
Definition CudaTensor.h:274
static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon=0.1)
Check two matrices for equality, taking floating point arithmetic errors into account.
static void SqrtElementWise(Matrix_t &A)
Square root each element of the matrix A and write the result into A.
static void SumRows(Matrix_t &B, const Matrix_t &A)
extra functions defined only for CPU architecture !!!
static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B)
Standard multiplication of two matrices A and B with the result being written into C.
static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
Adam updates.
static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha=1.0, Scalar_t beta=0.)
Sum columns of (m x n) matrix A and write the results into the first m elements in A.
static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void Hadamard(Tensor_t &A, const Tensor_t &B)
In-place Hadamard (element-wise) product of matrices A and B with the result being written into A.
static void ReciprocalElementWise(Matrix_t &A)
Reciprocal each element of the matrix A and write the result into A.
static void SquareElementWise(Matrix_t &A)
Square each element of the matrix A and write the result into A.
static Scalar_t Sum(const Matrix_t &A)
Compute the sum of all elements in A.
static void ConstMult(Matrix_t &A, Scalar_t beta)
Multiply the constant beta to all the elements of matrix A and write the result into A.
static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void ConstAdd(Matrix_t &A, Scalar_t beta)
Add the constant beta to all the elements of matrix A and write the result into A.
static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha=1.0, Scalar_t beta=0.)
Matrix multiplication of two matrices A and B^T (transposed) with the result being written into C.
static void ScaleAdd(Matrix_t &A, const Matrix_t &B, Scalar_t beta=1.0)
Adds a the elements in matrix B scaled by c to the elements in the matrix A.
static dim3 BlockDims2D()
Definition Device.h:55
static dim3 GridDims2D(int nrows, int ncols)
Definition Device.h:74
const Int_t n
Definition legend1.C:16
create variable transformations
TMarker m
Definition textangle.C:8