Logo ROOT   6.18/05
Reference Guide
CudaBuffers.cxx
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 07/08/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12////////////////////////////////////////////////////////////////////////
13// Implementation of device and host buffers for CUDA architectures. //
14////////////////////////////////////////////////////////////////////////
15
16#include "TMVA/DataSetInfo.h"
17#include "TMVA/DNN/DataLoader.h"
18
22
23#include "cuda_runtime.h"
24#include <iostream>
25
26namespace TMVA {
27namespace DNN {
28
29//
30// TCudaHostBuffer
31//______________________________________________________________________________
32template <typename AFloat>
34{
35 cudaFreeHost(*devicePointer);
36 delete[] devicePointer;
37}
38
39//______________________________________________________________________________
40template <typename AFloat>
42{
43 AFloat **pointer = new AFloat *[1];
44 cudaMallocHost(pointer, size * sizeof(AFloat));
45 fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
46}
47
48//______________________________________________________________________________
49template <typename AFloat>
51{
52 return *fHostPointer + fOffset;
53}
54
55//______________________________________________________________________________
56template <typename AFloat>
58{
59 TCudaHostBuffer buffer = *this;
60 buffer.fOffset = offset;
61 buffer.fSize = size;
62 return buffer;
63}
64
65//
66// TCudaDevicePointer
67//______________________________________________________________________________
68template <typename AFloat>
70{
71 cudaFree(*devicePointer);
72 delete[] devicePointer;
73}
74
75//______________________________________________________________________________
76template <typename AFloat>
78{
79 AFloat **pointer = new AFloat *[1];
80 cudaMalloc(pointer, size * sizeof(AFloat));
81 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
82 cudaStreamCreate(&fComputeStream);
83}
84
85//______________________________________________________________________________
86template <typename AFloat>
87TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
88 : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
89{
90 AFloat **pointer = new AFloat *[1];
91 cudaMalloc(pointer, size * sizeof(AFloat));
92 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
93}
94
95//______________________________________________________________________________
96template <typename AFloat>
97TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
98 : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
99{
100 AFloat **pointer = new AFloat *[1];
101 *pointer = devicePointer;
102 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
103}
104
105//______________________________________________________________________________
106template <typename AFloat>
108{
109 TCudaDeviceBuffer buffer = *this;
110 buffer.fOffset = offset;
111 buffer.fSize = size;
112 return buffer;
113}
114
115//______________________________________________________________________________
116template <typename AFloat>
118{
119 return *fDevicePointer + fOffset;
120}
121
122//______________________________________________________________________________
123template <typename AFloat>
125{
126 cudaStreamSynchronize(fComputeStream);
127 cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
128}
129
130//______________________________________________________________________________
131template <typename AFloat>
133{
134 cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
135 buffer.fComputeStream = fComputeStream;
136}
137
138//______________________________________________________________________________
139template <>
141 size_t batchSize)
142{
143 const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
144 size_t n = inputMatrix.GetNcols();
145
146 for (size_t i = 0; i < batchSize; i++) {
147 size_t sampleIndex = *sampleIterator;
148 for (size_t j = 0; j < n; j++) {
149 size_t bufferIndex = j * batchSize + i;
150 buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
151 }
152 sampleIterator++;
153 }
154}
155
156//______________________________________________________________________________
157template <>
159 IndexIterator_t sampleIterator, size_t batchSize)
160{
161 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
162 size_t n = outputMatrix.GetNcols();
163
164 for (size_t i = 0; i < batchSize; i++) {
165 size_t sampleIndex = *sampleIterator;
166 for (size_t j = 0; j < n; j++) {
167 size_t bufferIndex = j * batchSize + i;
168 buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
169 }
170 sampleIterator++;
171 }
172}
173
174//______________________________________________________________________________
175template <>
177 IndexIterator_t sampleIterator, size_t batchSize)
178{
179 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
180 for (size_t i = 0; i < batchSize; i++) {
181 buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
182 sampleIterator++;
183 }
184}
185
186//______________________________________________________________________________
187template <>
189 size_t batchSize)
190{
191 Event *event = std::get<0>(fData)[0];
192 size_t n = event->GetNVariables();
193 for (size_t i = 0; i < batchSize; i++) {
194 size_t sampleIndex = * sampleIterator++;
195 event = std::get<0>(fData)[sampleIndex];
196 for (size_t j = 0; j < n; j++) {
197 size_t bufferIndex = j * batchSize + i;
198 buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
199 }
200 }
201}
202
203//______________________________________________________________________________
204template <>
206 size_t batchSize)
207{
208 const DataSetInfo &info = std::get<1>(fData);
209 size_t n = buffer.GetSize() / batchSize;
210
211 // Copy target(s).
212
213 for (size_t i = 0; i < batchSize; i++) {
214 size_t sampleIndex = *sampleIterator++;
215 Event *event = std::get<0>(fData)[sampleIndex];
216 for (size_t j = 0; j < n; j++) {
217 // Copy output matrices.
218 size_t bufferIndex = j * batchSize + i;
219 // Classification
220 if (event->GetNTargets() == 0) {
221 if (n == 1) {
222 // Binary.
223 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
224 } else {
225 // Multiclass.
226 buffer[bufferIndex] = 0.0;
227 if (j == event->GetClass()) {
228 buffer[bufferIndex] = 1.0;
229 }
230 }
231 } else {
232 buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
233 }
234 }
235 }
236}
237
238//______________________________________________________________________________
239template <>
241 size_t batchSize)
242{
243 for (size_t i = 0; i < batchSize; i++) {
244 size_t sampleIndex = *sampleIterator++;
245 Event *event = std::get<0>(fData)[sampleIndex];
246 buffer[i] = static_cast<float>(event->GetWeight());
247 }
248}
249
250//______________________________________________________________________________
251template <>
253 IndexIterator_t sampleIterator, size_t batchSize)
254{
255 const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
256 size_t n = inputMatrix.GetNcols();
257
258 for (size_t i = 0; i < batchSize; i++) {
259 size_t sampleIndex = *sampleIterator;
260 for (size_t j = 0; j < n; j++) {
261 size_t bufferIndex = j * batchSize + i;
262 buffer[bufferIndex] = inputMatrix(sampleIndex, j);
263 }
264 sampleIterator++;
265 }
266}
267
268//______________________________________________________________________________
269template <>
271 IndexIterator_t sampleIterator, size_t batchSize)
272{
273 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
274 size_t n = outputMatrix.GetNcols();
275
276 for (size_t i = 0; i < batchSize; i++) {
277 size_t sampleIndex = *sampleIterator;
278 for (size_t j = 0; j < n; j++) {
279 size_t bufferIndex = j * batchSize + i;
280 buffer[bufferIndex] = outputMatrix(sampleIndex, j);
281 }
282 sampleIterator++;
283 }
284}
285
286//______________________________________________________________________________
287template <>
289 IndexIterator_t sampleIterator, size_t batchSize)
290{
291 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
292 for (size_t i = 0; i < batchSize; i++) {
293 buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
294 sampleIterator++;
295 }
296}
297
298//______________________________________________________________________________
299template <>
301 size_t batchSize)
302{
303 Event *event = std::get<0>(fData)[0];
304 size_t n = event->GetNVariables();
305 for (size_t i = 0; i < batchSize; i++) {
306 size_t sampleIndex = * sampleIterator++;
307 event = std::get<0>(fData)[sampleIndex];
308 for (size_t j = 0; j < n; j++) {
309 size_t bufferIndex = j * batchSize + i;
310 buffer[bufferIndex] = event->GetValue(j);
311 }
312 }
313}
314
315//______________________________________________________________________________
316template <>
318 IndexIterator_t sampleIterator, size_t batchSize)
319{
320 const DataSetInfo &info = std::get<1>(fData);
321 size_t n = buffer.GetSize() / batchSize;
322
323 // Copy target(s).
324
325 for (size_t i = 0; i < batchSize; i++) {
326 size_t sampleIndex = *sampleIterator++;
327 Event *event = std::get<0>(fData)[sampleIndex];
328 for (size_t j = 0; j < n; j++) {
329 // Copy output matrices.
330 size_t bufferIndex = j * batchSize + i;
331 // Classification
332 if (event->GetNTargets() == 0) {
333 // Binary.
334 if (n == 1) {
335 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
336 } else {
337 // Multiclass.
338 buffer[bufferIndex] = 0.0;
339 if (j == event->GetClass()) {
340 buffer[bufferIndex] = 1.0;
341 }
342 }
343 } else {
344 buffer[bufferIndex] = event->GetTarget(j);
345 }
346 }
347 }
348}
349
350//______________________________________________________________________________
351template <>
353 IndexIterator_t sampleIterator, size_t batchSize)
354{
355 for (size_t i = 0; i < batchSize; i++) {
356 size_t sampleIndex = *sampleIterator++;
357 Event *event = std::get<0>(fData)[sampleIndex];
358 buffer[i] = static_cast<double>(event->GetWeight());
359 }
360}
361
362//______________________________________________________________________________
363template <>
365 IndexIterator_t sampleIterator)
366{
367 const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
368
369 for (size_t i = 0; i < fBatchSize; i++) {
370 size_t sampleIndex = *sampleIterator;
371 for (size_t j = 0; j < fBatchHeight; j++) {
372 for (size_t k = 0; k < fBatchWidth; k++) {
373 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
374 buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
375 }
376 }
377 sampleIterator++;
378 }
379}
380
381//______________________________________________________________________________
382template <>
384 IndexIterator_t sampleIterator)
385{
386 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
387 size_t n = outputMatrix.GetNcols();
388
389 for (size_t i = 0; i < fBatchSize; i++) {
390 size_t sampleIndex = *sampleIterator;
391 for (size_t j = 0; j < n; j++) {
392 size_t bufferIndex = j * fBatchSize + i;
393 buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
394 }
395 sampleIterator++;
396 }
397}
398
399//______________________________________________________________________________
400template <>
402 IndexIterator_t sampleIterator)
403{
404 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
405 for (size_t i = 0; i < fBatchSize; i++) {
406 buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
407 sampleIterator++;
408 }
409}
410
411//______________________________________________________________________________
412template <>
414 IndexIterator_t sampleIterator)
415{
416 // one event, one example in the batch
417
418 if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
419 for (size_t i = 0; i < fBatchHeight; i++) {
420 size_t sampleIndex = *sampleIterator;
421 Event * event = std::get<0>(fData)[sampleIndex];
422 for (size_t j = 0; j < fBatchWidth; j++) {
423 size_t bufferIndex = j * fBatchHeight + i;
424 buffer[bufferIndex] = event->GetValue(j);
425 }
426 sampleIterator++;
427 }
428 } else if (fBatchDepth == fBatchSize) {
429 // batchDepth is batch size
430 for (size_t i = 0; i < fBatchDepth; i++) {
431 size_t sampleIndex = *sampleIterator;
432 Event * event = std::get<0>(fData)[sampleIndex];
433 for (size_t j = 0; j < fBatchHeight; j++) {
434 for (size_t k = 0; k < fBatchWidth; k++) {
435 // because of the column-major ordering
436 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
437 buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
438 }
439 }
440 sampleIterator++;
441 }
442 }
443 else {
444 Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
445 R__ASSERT(0);
446 }
447}
448//______________________________________________________________________________
449template <>
451 IndexIterator_t sampleIterator)
452{
453 const DataSetInfo &info = std::get<1>(fData);
454 size_t n = buffer.GetSize() / fBatchSize;
455
456 // Copy target(s).
457
458 for (size_t i = 0; i < fBatchSize; i++) {
459 size_t sampleIndex = *sampleIterator++;
460 Event *event = std::get<0>(fData)[sampleIndex];
461 for (size_t j = 0; j < n; j++) {
462 // Copy output matrices.
463 size_t bufferIndex = j * fBatchSize + i;
464 // Classification
465 if (event->GetNTargets() == 0) {
466 if (n == 1) {
467 // Binary.
468 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
469 } else {
470 // Multiclass.
471 buffer[bufferIndex] = 0.0;
472 if (j == event->GetClass()) {
473 buffer[bufferIndex] = 1.0;
474 }
475 }
476 } else {
477 buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
478 }
479 }
480 }
481}
482
483//______________________________________________________________________________
484template <>
486 IndexIterator_t sampleIterator)
487{
488 for (size_t i = 0; i < fBatchSize; i++) {
489 size_t sampleIndex = *sampleIterator++;
490 Event *event = std::get<0>(fData)[sampleIndex];
491 buffer[i] = event->GetWeight();
492 }
493}
494
495//______________________________________________________________________________
496template <>
498 IndexIterator_t sampleIterator)
499{
500 const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
501
502 for (size_t i = 0; i < fBatchSize; i++) {
503 size_t sampleIndex = *sampleIterator;
504 for (size_t j = 0; j < fBatchHeight; j++) {
505 for (size_t k = 0; k < fBatchWidth; k++) {
506 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
507 buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
508 }
509 }
510 sampleIterator++;
511 }
512}
513
514//______________________________________________________________________________
515template <>
517 IndexIterator_t sampleIterator)
518{
519 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
520 size_t n = outputMatrix.GetNcols();
521
522 for (size_t i = 0; i < fBatchSize; i++) {
523 size_t sampleIndex = *sampleIterator;
524 for (size_t j = 0; j < n; j++) {
525 size_t bufferIndex = j * fBatchSize + i;
526 buffer[bufferIndex] = outputMatrix(sampleIndex, j);
527 }
528 sampleIterator++;
529 }
530}
531
532//______________________________________________________________________________
533template <>
535 IndexIterator_t sampleIterator)
536{
537 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
538 for (size_t i = 0; i < fBatchSize; i++) {
539 buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
540 sampleIterator++;
541 }
542}
543
544//______________________________________________________________________________
545template <>
547 IndexIterator_t sampleIterator)
548{
549 // one event, one example in the batch
550
551 if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
552 for (size_t i = 0; i < fBatchHeight; i++) {
553 size_t sampleIndex = *sampleIterator;
554 Event * event = std::get<0>(fData)[sampleIndex];
555 for (size_t j = 0; j < fBatchWidth; j++) {
556 size_t bufferIndex = j * fBatchHeight + i;
557 buffer[bufferIndex] = event->GetValue(j);
558 }
559 sampleIterator++;
560 }
561 } else if (fBatchDepth == fBatchSize) {
562 // batchDepth is batch size
563 for (size_t i = 0; i < fBatchDepth; i++) {
564 size_t sampleIndex = *sampleIterator;
565 Event * event = std::get<0>(fData)[sampleIndex];
566 for (size_t j = 0; j < fBatchHeight; j++) {
567 for (size_t k = 0; k < fBatchWidth; k++) {
568 // because of the column-major ordering
569 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
570 buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
571 }
572 }
573 sampleIterator++;
574 }
575 }
576 else {
577 Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
578 R__ASSERT(0);
579 }
580}
581
582//______________________________________________________________________________
583template <>
585 IndexIterator_t sampleIterator)
586{
587 const DataSetInfo &info = std::get<1>(fData);
588 size_t n = buffer.GetSize() / fBatchSize;
589
590 // Copy target(s).
591
592 for (size_t i = 0; i < fBatchSize; i++) {
593 size_t sampleIndex = *sampleIterator++;
594 Event *event = std::get<0>(fData)[sampleIndex];
595 for (size_t j = 0; j < n; j++) {
596 // Copy output matrices.
597 size_t bufferIndex = j * fBatchSize + i;
598 // Classification
599 if (event->GetNTargets() == 0) {
600 if (n == 1) {
601 // Binary.
602 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
603 } else {
604 // Multiclass.
605 buffer[bufferIndex] = 0.0;
606 if (j == event->GetClass()) {
607 buffer[bufferIndex] = 1.0;
608 }
609 }
610 } else {
611 buffer[bufferIndex] = static_cast<Real_t>(event->GetTarget(j));
612 }
613 }
614 }
615}
616
617//______________________________________________________________________________
618template <>
620 IndexIterator_t sampleIterator)
621{
622 for (size_t i = 0; i < fBatchSize; i++) {
623 size_t sampleIndex = *sampleIterator++;
624 Event *event = std::get<0>(fData)[sampleIndex];
625 buffer[i] = event->GetWeight();
626 }
627}
628
629// Explicit Instantiations.
630
631template class TCudaDeviceBuffer<float>;
632template class TCudaDeviceBuffer<double>;
633
634template class TCudaHostBuffer<float>;
635template class TCudaHostBuffer<double>;
636
641
642} // TMVA
643} // DNN
float Real_t
Definition: RtypesCore.h:64
#define R__ASSERT(e)
Definition: TError.h:96
void Error(const char *location, const char *msgfmt,...)
TCudaDeviceBuffer.
Definition: CudaBuffers.h:98
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:101
void CopyFrom(const TCudaHostBuffer< AFloat > &) const
void CopyTo(const TCudaHostBuffer< AFloat > &) const
struct TMVA::DNN::TCudaDeviceBuffer::TDestructor fDestructor
TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:103
std::shared_ptr< AFloat * > fDevicePointer
Pointer to the buffer data.
Definition: CudaBuffers.h:104
TCudaHostBuffer.
Definition: CudaBuffers.h:43
struct TMVA::DNN::TCudaHostBuffer::TDestructor fDestructor
size_t GetSize() const
Definition: CudaBuffers.h:82
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:46
TCudaHostBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Definition: CudaBuffers.cxx:57
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:48
std::shared_ptr< AFloat * > fHostPointer
Pointer to the buffer data.
Definition: CudaBuffers.h:49
Class that contains all the data information.
Definition: DataSetInfo.h:60
Bool_t IsSignal(const Event *ev) const
Int_t GetNcols() const
Definition: TMatrixTBase.h:127
const Int_t n
Definition: legend1.C:16
typename std::vector< size_t >::iterator IndexIterator_t
Definition: DataLoader.h:42
create variable transformations
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:69
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:33