ROOT 6.14/05 Reference Guide
CudaBuffers.cxx
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 07/08/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

////////////////////////////////////////////////////////////////////////
// Implementation of device and host buffers for CUDA architectures.  //
////////////////////////////////////////////////////////////////////////

#include "TMVA/DataSetInfo.h"
#include "TMVA/DNN/DataLoader.h"

#include "TMVA/DNN/Architectures/Cuda.h"
#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"

#include "cuda_runtime.h"
#include <iostream>

namespace TMVA {
namespace DNN {

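// Both buffer classes below wrap their allocation in a std::shared_ptr<AFloat *>
// with a custom deleter (TDestructor), so copies and the sub-buffers returned by
// GetSubBuffer() share a single underlying CUDA allocation and differ only in
// fOffset and fSize.
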
//
// TCudaHostBuffer
//______________________________________________________________________________
template <typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
template <typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMallocHost(pointer, size * sizeof(AFloat));
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

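// The host buffer is allocated with cudaMallocHost rather than operator new:
// asynchronous transfers with cudaMemcpyAsync only overlap with computation when
// the host memory is page-locked (pinned).
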
//______________________________________________________________________________
template <typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat *() const
{
   return *fHostPointer + fOffset;
}

//______________________________________________________________________________
template <typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaHostBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}

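// A sub-buffer is a shallow view: it shares the host allocation with its parent
// and only adjusts the offset and size. A rough usage sketch (names hypothetical):
//
//    TCudaHostBuffer<float> batch(1024);
//    TCudaHostBuffer<float> inputPart  = batch.GetSubBuffer(0, 768);
//    TCudaHostBuffer<float> outputPart = batch.GetSubBuffer(768, 256);
//
// Both views stay valid as long as any copy of the parent buffer is alive, since
// the allocation is reference counted through fHostPointer.
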
//
// TCudaDeviceBuffer
//______________________________________________________________________________
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) : fOffset(0), fSize(size), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   cudaStreamCreate(&fComputeStream);
}

//______________________________________________________________________________
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   *pointer = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
template <typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}

//______________________________________________________________________________
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat *() const
{
   return *fDevicePointer + fOffset;
}

//______________________________________________________________________________
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaStreamSynchronize(fComputeStream);
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
}

//______________________________________________________________________________
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
   buffer.fComputeStream = fComputeStream;
}

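// The copies issued by CopyFrom and CopyTo are asynchronous with respect to the
// host: cudaMemcpyAsync only enqueues the transfer on fComputeStream, so the data
// need not have arrived when the call returns. A caller that wants to read the
// host buffer afterwards has to synchronize first, e.g. (illustrative sketch,
// names hypothetical):
//
//    deviceBuffer.CopyTo(hostBuffer);
//    cudaDeviceSynchronize();                           // wait for the pending copy
//    float first = static_cast<float *>(hostBuffer)[0];
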
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

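// The loaders fill the buffer in column-major order, i.e. variable j of sample i
// is stored at bufferIndex = j * batchSize + i. For batchSize = 2 and n = 3 input
// variables the buffer layout is
//
//    x(0,0) x(1,0) x(0,1) x(1,1) x(0,2) x(1,2)
//
// which matches the column-major layout used by the device-side matrix class.
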
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                       size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   // Copy target(s).

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         // Copy output matrices.
         size_t bufferIndex = j * batchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}

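// Target encoding used above: for events without regression targets the task is
// treated as classification. With a single output column the target is 1 for
// signal and 0 for background; with several output columns the class index is
// one-hot encoded. Otherwise the regression targets are copied as they are.
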
//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<float>(event->GetWeight());
   }
}

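// The specializations below are the TCuda<double> counterparts of the float
// versions above; apart from the value type and the casts the copy logic is
// identical.
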
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                            IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   // Copy target(s).

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         // Copy output matrices.
         size_t bufferIndex = j * batchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            // Binary.
            if (n == 1) {
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}

//______________________________________________________________________________
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<double>(event->GetWeight());
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < fBatchHeight; j++) {
         for (size_t k = 0; k < fBatchWidth; k++) {
            size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
            buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
         }
      }
      sampleIterator++;
   }
}

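// For tensor input each sample occupies a contiguous block of
// fBatchHeight * fBatchWidth elements, and within that block element (j, k) is
// stored column-major at k * fBatchHeight + j. For fBatchHeight = 2 and
// fBatchWidth = 3 the block of one sample therefore reads
//
//    t(0,0) t(1,0) t(0,1) t(1,1) t(0,2) t(1,2)
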
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   // One event, one example in the batch.
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // The batch depth is the batch size.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // Because of the column-major ordering.
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
      R__ASSERT(0);
   }
}
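// Two event layouts are supported above: a flat layout where fBatchDepth == 1 and
// fBatchHeight == fBatchSize (one row of fBatchWidth variables per event), and an
// image-like layout where fBatchDepth == fBatchSize and each event provides an
// fBatchHeight x fBatchWidth block, again stored column-major. Any other
// combination is reported as an error and aborts via R__ASSERT.
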
//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   // Copy target(s).

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         // Copy output matrices.
         size_t bufferIndex = j * fBatchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
         }
      }
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < fBatchHeight; j++) {
         for (size_t k = 0; k < fBatchWidth; k++) {
            size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
            buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
         }
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   // One event, one example in the batch.
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // The batch depth is the batch size.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // Because of the column-major ordering.
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
      R__ASSERT(0);
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   // Copy target(s).

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         // Copy output matrices.
         size_t bufferIndex = j * fBatchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
         }
      }
   }
}

//______________________________________________________________________________
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}

// Explicit Instantiations.

template class TCudaDeviceBuffer<float>;
template class TCudaDeviceBuffer<double>;

template class TCudaHostBuffer<float>;
template class TCudaHostBuffer<double>;

template class TDataLoader<MatrixInput_t, TCuda<float>>;
template class TDataLoader<TMVAInput_t, TCuda<float>>;
template class TDataLoader<MatrixInput_t, TCuda<double>>;
template class TDataLoader<TMVAInput_t, TCuda<double>>;

} // namespace DNN
} // namespace TMVA