Logo ROOT  
Reference Guide
CudaBuffers.cxx
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 07/08/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12////////////////////////////////////////////////////////////////////////
13// Implementation of device and host buffers for CUDA architectures. //
14////////////////////////////////////////////////////////////////////////
15
16#include "TMVA/DataSetInfo.h"
17#include "TMVA/DNN/DataLoader.h"
18
21#ifdef R__HAS_CUDNN
23#endif
25
26#include "cuda_runtime.h"
27#include <algorithm>
28
29namespace TMVA {
30namespace DNN {
31
32//
33// TCudaHostBuffer
34//______________________________________________________________________________
35template <typename AFloat>
37{
38 cudaFreeHost(*devicePointer);
39 delete[] devicePointer;
40}
41
42//______________________________________________________________________________
43template <typename AFloat>
45{
46 AFloat **pointer = new AFloat *[1];
47 cudaMallocHost(pointer, size * sizeof(AFloat));
48 fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
49}
50
51//______________________________________________________________________________
52template <typename AFloat>
54{
55 return *fHostPointer + fOffset;
56}
57
58//______________________________________________________________________________
59template <typename AFloat>
61{
62 TCudaHostBuffer buffer = *this;
63 buffer.fOffset = offset;
64 buffer.fSize = size;
65 return buffer;
66}
67
68//______________________________________________________________________________
69template <typename AFloat>
70void TCudaHostBuffer<AFloat>::SetConstVal(const AFloat constVal)
71{
72 std::fill(*fHostPointer, *fHostPointer+fSize, constVal);
73}
74
75//
76// TCudaDevicePointer
77//______________________________________________________________________________
78template <typename AFloat>
80{
81 cudaFree(*devicePointer);
82 delete[] devicePointer;
83}
84
85//______________________________________________________________________________
86template <typename AFloat>
88{
89 AFloat **pointer = new AFloat *[1];
90 cudaMalloc(pointer, size * sizeof(AFloat));
91 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
92 cudaStreamCreate(&fComputeStream);
93}
94
95//______________________________________________________________________________
96template <typename AFloat>
97TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
98 : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
99{
100 AFloat **pointer = new AFloat *[1];
101 cudaMalloc(pointer, size * sizeof(AFloat));
102 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
103}
104
105//______________________________________________________________________________
106template <typename AFloat>
107TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
108 : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
109{
110 AFloat **pointer = new AFloat *[1];
111 *pointer = devicePointer;
112 fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
113}
114
115//______________________________________________________________________________
116template <typename AFloat>
118{
119 TCudaDeviceBuffer buffer = *this;
120 buffer.fOffset = offset;
121 buffer.fSize = size;
122 return buffer;
123}
124
125//______________________________________________________________________________
126template <typename AFloat>
128{
129 return *fDevicePointer + fOffset;
130}
131
132//______________________________________________________________________________
133template <typename AFloat>
135{
136 cudaStreamSynchronize(fComputeStream);
137 cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
138}
139
140//______________________________________________________________________________
141template <typename AFloat>
143{
144 cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
145 buffer.fComputeStream = fComputeStream;
146}
147
148//______________________________________________________________________________
149template <>
151 size_t batchSize)
152{
153 const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
154 size_t n = inputMatrix.GetNcols();
155
156 for (size_t i = 0; i < batchSize; i++) {
157 size_t sampleIndex = *sampleIterator;
158 for (size_t j = 0; j < n; j++) {
159 size_t bufferIndex = j * batchSize + i;
160 buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
161 }
162 sampleIterator++;
163 }
164}
165
166//______________________________________________________________________________
167template <>
169 IndexIterator_t sampleIterator, size_t batchSize)
170{
171 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
172 size_t n = outputMatrix.GetNcols();
173
174 for (size_t i = 0; i < batchSize; i++) {
175 size_t sampleIndex = *sampleIterator;
176 for (size_t j = 0; j < n; j++) {
177 size_t bufferIndex = j * batchSize + i;
178 buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
179 }
180 sampleIterator++;
181 }
182}
183
184//______________________________________________________________________________
185template <>
187 IndexIterator_t sampleIterator, size_t batchSize)
188{
189 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
190 for (size_t i = 0; i < batchSize; i++) {
191 buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
192 sampleIterator++;
193 }
194}
195
196//______________________________________________________________________________
197template <>
199 size_t batchSize)
200{
201 Event *event = std::get<0>(fData)[0];
202 size_t n = event->GetNVariables();
203 for (size_t i = 0; i < batchSize; i++) {
204 size_t sampleIndex = * sampleIterator++;
205 event = std::get<0>(fData)[sampleIndex];
206 for (size_t j = 0; j < n; j++) {
207 size_t bufferIndex = j * batchSize + i;
208 buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
209 }
210 }
211}
212
213//______________________________________________________________________________
214template <>
216 size_t batchSize)
217{
218 const DataSetInfo &info = std::get<1>(fData);
219 size_t n = buffer.GetSize() / batchSize;
220
221 // Copy target(s).
222
223 for (size_t i = 0; i < batchSize; i++) {
224 size_t sampleIndex = *sampleIterator++;
225 Event *event = std::get<0>(fData)[sampleIndex];
226 for (size_t j = 0; j < n; j++) {
227 // Copy output matrices.
228 size_t bufferIndex = j * batchSize + i;
229 // Classification
230 if (event->GetNTargets() == 0) {
231 if (n == 1) {
232 // Binary.
233 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
234 } else {
235 // Multiclass.
236 buffer[bufferIndex] = 0.0;
237 if (j == event->GetClass()) {
238 buffer[bufferIndex] = 1.0;
239 }
240 }
241 } else {
242 buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
243 }
244 }
245 }
246}
247
248//______________________________________________________________________________
249template <>
251 size_t batchSize)
252{
253 for (size_t i = 0; i < batchSize; i++) {
254 size_t sampleIndex = *sampleIterator++;
255 Event *event = std::get<0>(fData)[sampleIndex];
256 buffer[i] = static_cast<float>(event->GetWeight());
257 }
258}
259
260//______________________________________________________________________________
261template <>
263 IndexIterator_t sampleIterator, size_t batchSize)
264{
265 const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
266 size_t n = inputMatrix.GetNcols();
267
268 for (size_t i = 0; i < batchSize; i++) {
269 size_t sampleIndex = *sampleIterator;
270 for (size_t j = 0; j < n; j++) {
271 size_t bufferIndex = j * batchSize + i;
272 buffer[bufferIndex] = inputMatrix(sampleIndex, j);
273 }
274 sampleIterator++;
275 }
276}
277
278//______________________________________________________________________________
279template <>
281 IndexIterator_t sampleIterator, size_t batchSize)
282{
283 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
284 size_t n = outputMatrix.GetNcols();
285
286 for (size_t i = 0; i < batchSize; i++) {
287 size_t sampleIndex = *sampleIterator;
288 for (size_t j = 0; j < n; j++) {
289 size_t bufferIndex = j * batchSize + i;
290 buffer[bufferIndex] = outputMatrix(sampleIndex, j);
291 }
292 sampleIterator++;
293 }
294}
295
296//______________________________________________________________________________
297template <>
299 IndexIterator_t sampleIterator, size_t batchSize)
300{
301 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
302 for (size_t i = 0; i < batchSize; i++) {
303 buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
304 sampleIterator++;
305 }
306}
307
308//______________________________________________________________________________
309template <>
311 size_t batchSize)
312{
313 Event *event = std::get<0>(fData)[0];
314 size_t n = event->GetNVariables();
315 for (size_t i = 0; i < batchSize; i++) {
316 size_t sampleIndex = * sampleIterator++;
317 event = std::get<0>(fData)[sampleIndex];
318 for (size_t j = 0; j < n; j++) {
319 size_t bufferIndex = j * batchSize + i;
320 buffer[bufferIndex] = event->GetValue(j);
321 }
322 }
323}
324
325//______________________________________________________________________________
326template <>
328 IndexIterator_t sampleIterator, size_t batchSize)
329{
330 const DataSetInfo &info = std::get<1>(fData);
331 size_t n = buffer.GetSize() / batchSize;
332
333 // Copy target(s).
334
335 for (size_t i = 0; i < batchSize; i++) {
336 size_t sampleIndex = *sampleIterator++;
337 Event *event = std::get<0>(fData)[sampleIndex];
338 for (size_t j = 0; j < n; j++) {
339 // Copy output matrices.
340 size_t bufferIndex = j * batchSize + i;
341 // Classification
342 if (event->GetNTargets() == 0) {
343 // Binary.
344 if (n == 1) {
345 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
346 } else {
347 // Multiclass.
348 buffer[bufferIndex] = 0.0;
349 if (j == event->GetClass()) {
350 buffer[bufferIndex] = 1.0;
351 }
352 }
353 } else {
354 buffer[bufferIndex] = event->GetTarget(j);
355 }
356 }
357 }
358}
359
360//______________________________________________________________________________
361template <>
363 IndexIterator_t sampleIterator, size_t batchSize)
364{
365 for (size_t i = 0; i < batchSize; i++) {
366 size_t sampleIndex = *sampleIterator++;
367 Event *event = std::get<0>(fData)[sampleIndex];
368 buffer[i] = static_cast<double>(event->GetWeight());
369 }
370}
371
372//______________________________________________________________________________
373template <>
375 IndexIterator_t sampleIterator)
376{
377 const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
378
379 if (fBatchDepth == 1) {
380 for (size_t i = 0; i < fBatchHeight; i++) {
381 size_t sampleIndex = *sampleIterator;
382 for (size_t j = 0; j < fBatchWidth; j++) {
383 size_t bufferIndex = j * fBatchHeight + i;
384 buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
385 }
386 sampleIterator++;
387 }
388 } else {
389 for (size_t i = 0; i < fBatchDepth; i++) {
390 size_t sampleIndex = *sampleIterator;
391 for (size_t j = 0; j < fBatchHeight; j++) {
392 for (size_t k = 0; k < fBatchWidth; k++) {
393 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
394 buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
395 }
396 }
397 sampleIterator++;
398 }
399 }
400}
401
402//______________________________________________________________________________
403template <>
405 IndexIterator_t sampleIterator)
406{
407 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
408 size_t n = outputMatrix.GetNcols();
409
410 for (size_t i = 0; i < fBatchSize; i++) {
411 size_t sampleIndex = *sampleIterator;
412 for (size_t j = 0; j < n; j++) {
413 size_t bufferIndex = j * fBatchSize + i;
414 buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
415 }
416 sampleIterator++;
417 }
418}
419
420//______________________________________________________________________________
421template <>
423 IndexIterator_t sampleIterator)
424{
425 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
426 for (size_t i = 0; i < fBatchSize; i++) {
427 buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
428 sampleIterator++;
429 }
430}
431
432//______________________________________________________________________________
433template <>
435 IndexIterator_t sampleIterator)
436{
437 // one event, one example in the batch
438
439 if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
440 for (size_t i = 0; i < fBatchHeight; i++) {
441 size_t sampleIndex = *sampleIterator;
442 Event * event = std::get<0>(fData)[sampleIndex];
443 for (size_t j = 0; j < fBatchWidth; j++) {
444 size_t bufferIndex = j * fBatchHeight + i;
445 buffer[bufferIndex] = event->GetValue(j);
446 }
447 sampleIterator++;
448 }
449 } else if (fBatchDepth == fBatchSize) {
450 // batchDepth is batch size
451 for (size_t i = 0; i < fBatchDepth; i++) {
452 size_t sampleIndex = *sampleIterator;
453 Event * event = std::get<0>(fData)[sampleIndex];
454 for (size_t j = 0; j < fBatchHeight; j++) {
455 for (size_t k = 0; k < fBatchWidth; k++) {
456 // because of the column-major ordering
457 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
458 buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
459 }
460 }
461 sampleIterator++;
462 }
463 }
464 else {
465 std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
466 Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
467 R__ASSERT(0);
468 }
469}
470//______________________________________________________________________________
471template <>
473 IndexIterator_t sampleIterator)
474{
475 const DataSetInfo &info = std::get<1>(fData);
476 size_t n = buffer.GetSize() / fBatchSize;
477
478 // Copy target(s).
479
480 for (size_t i = 0; i < fBatchSize; i++) {
481 size_t sampleIndex = *sampleIterator++;
482 Event *event = std::get<0>(fData)[sampleIndex];
483 for (size_t j = 0; j < n; j++) {
484 // Copy output matrices.
485 size_t bufferIndex = j * fBatchSize + i;
486 // Classification
487 if (event->GetNTargets() == 0) {
488 if (n == 1) {
489 // Binary.
490 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
491 } else {
492 // Multiclass.
493 buffer[bufferIndex] = 0.0;
494 if (j == event->GetClass()) {
495 buffer[bufferIndex] = 1.0;
496 }
497 }
498 } else {
499 buffer[bufferIndex] = static_cast<Float_t>(event->GetTarget(j));
500 }
501 }
502 }
503}
504
505//______________________________________________________________________________
506template <>
508 IndexIterator_t sampleIterator)
509{
510 for (size_t i = 0; i < fBatchSize; i++) {
511 size_t sampleIndex = *sampleIterator++;
512 Event *event = std::get<0>(fData)[sampleIndex];
513 buffer[i] = event->GetWeight();
514 }
515}
516
517//______________________________________________________________________________
518template <>
520 IndexIterator_t sampleIterator)
521{
522 const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);
523
524 if (fBatchDepth == 1) {
525 for (size_t i = 0; i < fBatchHeight; i++) {
526 size_t sampleIndex = *sampleIterator;
527 for (size_t j = 0; j < fBatchWidth; j++) {
528 size_t bufferIndex = j * fBatchHeight + i;
529 buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
530 }
531 sampleIterator++;
532 }
533 } else {
534 for (size_t i = 0; i < fBatchDepth; i++) {
535 size_t sampleIndex = *sampleIterator;
536 for (size_t j = 0; j < fBatchHeight; j++) {
537 for (size_t k = 0; k < fBatchWidth; k++) {
538 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
539 buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
540 }
541 }
542 sampleIterator++;
543 }
544 }
545}
546
547//______________________________________________________________________________
548template <>
550 IndexIterator_t sampleIterator)
551{
552 const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
553 size_t n = outputMatrix.GetNcols();
554
555 for (size_t i = 0; i < fBatchSize; i++) {
556 size_t sampleIndex = *sampleIterator;
557 for (size_t j = 0; j < n; j++) {
558 size_t bufferIndex = j * fBatchSize + i;
559 buffer[bufferIndex] = outputMatrix(sampleIndex, j);
560 }
561 sampleIterator++;
562 }
563}
564
565//______________________________________________________________________________
566template <>
568 IndexIterator_t sampleIterator)
569{
570 const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
571
572 for (size_t i = 0; i < fBatchSize; i++) {
573 buffer[i] = weightMatrix(*sampleIterator, 0);
574 sampleIterator++;
575 }
576}
577
578//______________________________________________________________________________
579template <>
581 IndexIterator_t sampleIterator)
582{
583 // one event, one example in the batch
584
585 if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
586 for (size_t i = 0; i < fBatchHeight; i++) {
587 size_t sampleIndex = *sampleIterator;
588 Event * event = std::get<0>(fData)[sampleIndex];
589 for (size_t j = 0; j < fBatchWidth; j++) {
590 size_t bufferIndex = j * fBatchHeight + i;
591 buffer[bufferIndex] = event->GetValue(j);
592 }
593 sampleIterator++;
594 }
595 } else if (fBatchDepth == fBatchSize) {
596 // batchDepth is batch size
597 for (size_t i = 0; i < fBatchDepth; i++) {
598 size_t sampleIndex = *sampleIterator;
599 Event * event = std::get<0>(fData)[sampleIndex];
600 for (size_t j = 0; j < fBatchHeight; j++) {
601 for (size_t k = 0; k < fBatchWidth; k++) {
602 // because of the column-major ordering
603 size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
604 buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
605 }
606 }
607 sampleIterator++;
608 }
609 }
610 else {
611 std::cout << fBatchDepth << fBatchSize << fBatchHeight << std::endl;
612 Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
613 R__ASSERT(0);
614 }
615}
616
617//______________________________________________________________________________
618template <>
620 IndexIterator_t sampleIterator)
621{
622 const DataSetInfo &info = std::get<1>(fData);
623 size_t n = buffer.GetSize() / fBatchSize;
624
625 // Copy target(s).
626
627 for (size_t i = 0; i < fBatchSize; i++) {
628 size_t sampleIndex = *sampleIterator++;
629 Event *event = std::get<0>(fData)[sampleIndex];
630 for (size_t j = 0; j < n; j++) {
631 // Copy output matrices.
632 size_t bufferIndex = j * fBatchSize + i;
633 // Classification
634 if (event->GetNTargets() == 0) {
635 if (n == 1) {
636 // Binary.
637 buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
638 } else {
639 // Multiclass.
640 buffer[bufferIndex] = 0.0;
641 if (j == event->GetClass()) {
642 buffer[bufferIndex] = 1.0;
643 }
644 }
645 } else {
646 buffer[bufferIndex] = static_cast<Double_t>(event->GetTarget(j));
647 }
648 }
649 }
650}
651
652//______________________________________________________________________________
653template <>
655 IndexIterator_t sampleIterator)
656{
657 for (size_t i = 0; i < fBatchSize; i++) {
658 size_t sampleIndex = *sampleIterator++;
659 Event *event = std::get<0>(fData)[sampleIndex];
660 buffer[i] = event->GetWeight();
661 }
662}
663
664#if 0
665//______________________________________________________________________________
666template <>
668{
669 // After copying the data to the device, wrap the device buffer in the respective
670 // architectures matrix type
671 DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
672
673 std::vector<Matrix_t> inputTensor(std::get<0>(DeviceBuffers), fBatchSize, )
674 size_t jump = fBatchHeight * fBatchWidth;
675 for (size_t i = 0; i < fBatchSize; i++) {
676 DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
677 inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
678 }
679 Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
680 Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
681
682 fBatchIndex++;
683 return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
684}
685
686//______________________________________________________________________________
687template <>
688TTensorBatch<TCuda<double> > TTensorDataLoader<TensorInput, TCuda<double> >::GetTensorBatch()
689{
690 // After copying the data to the device, wrap the device buffer in the respective
691 // architectures matrix type
692 DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
693
694 std::vector<Matrix_t> inputTensor;
695 size_t jump = fBatchHeight * fBatchWidth;
696 for (size_t i = 0; i < fBatchSize; i++) {
697 DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
698 inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
699 }
700 Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
701 Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
702
703 fBatchIndex++;
704 return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
705}
706
707//______________________________________________________________________________
708template <>
709TTensorBatch<TCuda<float> > TTensorDataLoader<TMVAInput_t, TCuda<float> >::GetTensorBatch()
710{
711 // After copying the data to the device, wrap the device buffer in the respective
712 // architectures matrix type
713 DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
714
715 std::vector<Matrix_t> inputTensor;
716 size_t jump = fBatchHeight * fBatchWidth;
717 for (size_t i = 0; i < fBatchSize; i++) {
718 DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
719 inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
720 }
721 Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
722 Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
723
724 fBatchIndex++;
725 return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
726}
727
728//______________________________________________________________________________
729template <>
730TTensorBatch<TCuda<double> > TTensorDataLoader<TMVAInput_t, TCuda<double> >::GetTensorBatch()
731{
732 // After copying the data to the device, wrap the device buffer in the respective
733 // architectures matrix type
734 DeviceBufferTuple DeviceBuffers = CopyTensorBatches();
735
736 std::vector<Matrix_t> inputTensor;
737 size_t jump = fBatchHeight * fBatchWidth;
738 for (size_t i = 0; i < fBatchSize; i++) {
739 DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
740 inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
741 }
742 Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
743 Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);
744
745 fBatchIndex++;
746 return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
747}
748#endif
749
750// see file Cudnn/TensorDataLoader.cxx for Cudnn definitions
751
752//______________________________________________________________________________
753// Explicit Instantiations.
754
755template class TCudaDeviceBuffer<float>;
756template class TCudaDeviceBuffer<double>;
757
758template class TCudaHostBuffer<float>;
759template class TCudaHostBuffer<double>;
760
761template class TDataLoader<MatrixInput_t, TCuda<float>>;
762template class TDataLoader<TMVAInput_t, TCuda<float>>;
763template class TDataLoader<MatrixInput_t, TCuda<double>>;
764template class TDataLoader<TMVAInput_t, TCuda<double>>;
765
766template class TTensorDataLoader<TensorInput, TCuda<float> >;
767template class TTensorDataLoader<TMVAInput_t, TCuda<float> >;
768template class TTensorDataLoader<TensorInput, TCuda<double >>;
769template class TTensorDataLoader<TMVAInput_t, TCuda<double> >;
770
771
772} // TMVA
773} // DNN
double Double_t
Definition: RtypesCore.h:55
float Float_t
Definition: RtypesCore.h:53
#define R__ASSERT(e)
Definition: TError.h:96
void Error(const char *location, const char *msgfmt,...)
TCudaDeviceBuffer.
Definition: CudaBuffers.h:100
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:103
void CopyFrom(const TCudaHostBuffer< AFloat > &) const
void CopyTo(const TCudaHostBuffer< AFloat > &) const
struct TMVA::DNN::TCudaDeviceBuffer::TDestructor fDestructor
TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:105
std::shared_ptr< AFloat * > fDevicePointer
Pointer to the buffer data.
Definition: CudaBuffers.h:106
TCudaHostBuffer.
Definition: CudaBuffers.h:43
struct TMVA::DNN::TCudaHostBuffer::TDestructor fDestructor
size_t GetSize() const
Definition: CudaBuffers.h:84
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:46
TCudaHostBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Definition: CudaBuffers.cxx:60
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:48
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
Definition: CudaBuffers.cxx:70
std::shared_ptr< AFloat * > fHostPointer
Pointer to the buffer data.
Definition: CudaBuffers.h:49
Class that contains all the data information.
Definition: DataSetInfo.h:60
Bool_t IsSignal(const Event *ev) const
Int_t GetNcols() const
Definition: TMatrixTBase.h:127
const Int_t n
Definition: legend1.C:16
for(Int_t i=0;i< n;i++)
Definition: legend1.C:18
typename std::vector< size_t >::iterator IndexIterator_t
Definition: DataLoader.h:42
create variable transformations
fill
Definition: fit1_py.py:6
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:79
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:36