26#include "cuda_runtime.h"
//______________________________________________________________________________
// TCudaHostBuffer
//______________________________________________________________________________
template <typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}

template <typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMallocHost(pointer, size * sizeof(AFloat));
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

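// Note: cudaMallocHost allocates page-locked (pinned) host memory. Pinned
// memory is what allows the cudaMemcpyAsync calls in
// TCudaDeviceBuffer::CopyFrom/CopyTo below to run truly asynchronously;
// transfers from pageable memory silently fall back to synchronous copies.
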
template <typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat *() const
{
   return (fHostPointer) ? *fHostPointer + fOffset : nullptr;
}

template <typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   // Return a sub-buffer of the current buffer: a copy sharing the same
   // underlying allocation, with adjusted offset and size.
   TCudaHostBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}

template <typename AFloat>
void TCudaHostBuffer<AFloat>::SetConstVal(const AFloat constVal)
{
   std::fill(*fHostPointer, *fHostPointer + fSize, constVal);
}

//______________________________________________________________________________
// TCudaDeviceBuffer
//______________________________________________________________________________
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}

template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) : fOffset(0), fSize(size), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   cudaStreamCreate(&fComputeStream);
}

template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **pointer = new AFloat *[1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   // Wrap an externally allocated device pointer; ownership is transferred to
   // this buffer, which frees it through fDestructor.
   AFloat **pointer = new AFloat *[1];
   *pointer = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

template <typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   // Return a sub-buffer of the current buffer: a copy sharing the same
   // underlying allocation, with adjusted offset and size.
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize = size;
   return buffer;
}

template <typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat *() const
{
   return (fDevicePointer) ? *fDevicePointer + fOffset : nullptr;
}

template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaStreamSynchronize(fComputeStream);
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
}

template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
}

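// A minimal usage sketch (illustrative only, not part of the library): fill a
// pinned host buffer, stage it to the device asynchronously, and read it back.
// GetComputeStream() is assumed to be the device buffer's stream accessor.
//
//    TCudaHostBuffer<float>   host(1024);
//    TCudaDeviceBuffer<float> device(1024);
//    host.SetConstVal(1.0f);
//    device.CopyFrom(host);   // async H2D on the buffer's stream
//    device.CopyTo(host);     // async D2H on the same stream
//    cudaStreamSynchronize(device.GetComputeStream());
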
//______________________________________________________________________________
// TDataLoader copy specializations
//______________________________________________________________________________
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

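// The host buffer is filled in column-major order: element (i, j) of the
// batch matrix lands at index j * batchSize + i. For example, with
// batchSize = 2 and n = 3 features, sample 0 occupies indices {0, 2, 4} and
// sample 1 indices {1, 3, 5}, matching the column-major layout that the
// cuBLAS-backed device matrices expect.
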
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer,
                                                       IndexIterator_t sampleIterator, size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}

template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                        IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         // Classification: no regression targets set on the event.
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}

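// Output encoding summary: with no regression targets and a single output
// column the buffer holds a 0/1 signal flag; with multiple columns it holds a
// one-hot class vector; otherwise it holds the event's regression targets.
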
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<float>(event->GetWeight());
   }
}

template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                            IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < batchSize; i++) {
      buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                        IndexIterator_t sampleIterator, size_t batchSize)
{
   Event *event = std::get<0>(fData)[0];
   size_t n = event->GetNVariables();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}

template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / batchSize;

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         // Classification: no regression targets set on the event.
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}

template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = static_cast<double>(event->GetWeight());
   }
}

//______________________________________________________________________________
// TTensorDataLoader copy specializations
//______________________________________________________________________________
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   if (fBatchDepth == 1) {
      // Flat batch: one event per row of the single input matrix.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = static_cast<float>(inputTensor[0](sampleIndex, j));
         }
         sampleIterator++;
      }
   } else {
      // One event per depth slice.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = static_cast<float>(inputTensor[sampleIndex](j, k));
            }
         }
         sampleIterator++;
      }
   }
}

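// Each depth slice is stored as a column-major fBatchHeight x fBatchWidth
// matrix, so element (j, k) of slice i lands at
// i * fBatchHeight * fBatchWidth + k * fBatchHeight + j.
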
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
      sampleIterator++;
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // Flat batch: one event per row.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // One event per depth slice.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      std::cout << fBatchDepth << " " << fBatchSize << " " << fBatchHeight << std::endl;
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         // Classification: no regression targets set on the event.
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<Float_t>(event->GetTarget(j));
         }
      }
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}

template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   if (fBatchDepth == 1) {
      // Flat batch: one event per row of the single input matrix.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = inputTensor[0](sampleIndex, j);
         }
         sampleIterator++;
      }
   } else {
      // One event per depth slice.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
            }
         }
         sampleIterator++;
      }
   }
}

template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

template <>
void TTensorDataLoader<TensorInput, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
   for (size_t i = 0; i < fBatchSize; i++) {
      buffer[i] = weightMatrix(*sampleIterator, 0);
      sampleIterator++;
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // Flat batch: one event per row.
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // One event per depth slice.
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      std::cout << fBatchDepth << " " << fBatchSize << " " << fBatchHeight << std::endl;
      Error("TTensorDataLoader", "Inconsistency between batch depth and batch size");
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   size_t n = buffer.GetSize() / fBatchSize;

   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * fBatchSize + i;
         // Classification: no regression targets set on the event.
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary classification.
               buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
            } else {
               // Multiclass: one-hot encoding of the event class.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<Double_t>(event->GetTarget(j));
         }
      }
   }
}

template <>
void TTensorDataLoader<TMVAInput_t, TCuda<double>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   for (size_t i = 0; i < fBatchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      Event *event = std::get<0>(fData)[sampleIndex];
      buffer[i] = event->GetWeight();
   }
}

template <>
TTensorBatch<TCuda<float>> TTensorDataLoader<TensorInput, TCuda<float>>::GetTensorBatch()
{
   // Copy the current batch to the device, then wrap the device memory into
   // per-slice matrix views.
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
}

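// Note: the matrices assembled in GetTensorBatch are views into the already
// copied device buffer, obtained via GetSubBuffer; no additional device
// memory is allocated per slice.
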
template <>
TTensorBatch<TCuda<double>> TTensorDataLoader<TensorInput, TCuda<double>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
}

template <>
TTensorBatch<TCuda<float>> TTensorDataLoader<TMVAInput_t, TCuda<float>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
}

template <>
TTensorBatch<TCuda<double>> TTensorDataLoader<TMVAInput_t, TCuda<double>>::GetTensorBatch()
{
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   return TTensorBatch<TCuda<double>>(inputTensor, outputMatrix, weightMatrix);
}

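// Minimal consumer sketch (illustrative; the loader constructor arguments and
// the batch accessors GetInput/GetOutput/GetWeights are assumptions about the
// surrounding DataLoader API, not confirmed by this file):
//
//    TTensorDataLoader<TensorInput, TCuda<float>> loader(data, nSamples,
//       batchSize, batchDepth, batchHeight, batchWidth, nOutputFeatures);
//    for (auto batch : loader) {
//       auto &input   = batch.GetInput();   // std::vector of device matrices
//       auto &output  = batch.GetOutput();  // targets on the device
//       auto &weights = batch.GetWeights(); // per-event weights
//       // ... forward/backward pass ...
//    }
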
// Explicit template instantiations.
template class TCudaDeviceBuffer<float>;
template class TCudaDeviceBuffer<double>;

template class TCudaHostBuffer<float>;
template class TCudaHostBuffer<double>;

template class TDataLoader<MatrixInput_t, TCuda<float>>;
template class TDataLoader<TMVAInput_t, TCuda<float>>;
template class TDataLoader<MatrixInput_t, TCuda<double>>;
template class TDataLoader<TMVAInput_t, TCuda<double>>;

template class TTensorDataLoader<TensorInput, TCuda<float>>;
template class TTensorDataLoader<TMVAInput_t, TCuda<float>>;
template class TTensorDataLoader<TensorInput, TCuda<double>>;
template class TTensorDataLoader<TMVAInput_t, TCuda<double>>;

} // namespace DNN
} // namespace TMVA