doc/v632/CudaBuffers_8cxx_source.html

// @(#)root/tmva/tmva/dnn:$Id$

// Author: Simon Pfreundschuh 07/08/16


/*************************************************************************

 * Copyright (C) 2016, Simon Pfreundschuh                                *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


////////////////////////////////////////////////////////////////////////

// Implementation of device and host buffers for CUDA architectures.  //

////////////////////////////////////////////////////////////////////////


#include "TMVA/DataSetInfo.h"

#include "TMVA/DNN/DataLoader.h"


#include "TMVA/DNN/TensorDataLoader.h"

#include "TMVA/DNN/Architectures/Cuda.h"

#ifdef R__HAS_CUDNN

#include "TMVA/DNN/Architectures/TCudnn.h"

#endif

#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"


#include "cuda_runtime.h"

#include <algorithm>


namespace TMVA {

namespace DNN {


//

// TCudaHostBuffer

//______________________________________________________________________________

/// Deleter attached to the shared handle of a host buffer.
///
/// \param devicePointer One-element holder array (allocated with new[] in the
///        constructor) whose single slot stores the address returned by
///        cudaMallocHost.
template <typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   AFloat *hostData = devicePointer[0];
   cudaFreeHost(hostData);   // release the buffer memory first ...
   delete[] devicePointer;   // ... then the holder that stored its address
}


//______________________________________________________________________________

/// Allocate a host buffer of `size` elements with cudaMallocHost.
///
/// The address of the allocation is stored in a one-element holder array that is
/// owned by a shared handle, so copies of this object share the same memory and
/// the last copy releases it through fDestructor.
///
/// \param size Number of AFloat elements to allocate.
///
/// If the allocation fails the error is reported through Error() and the buffer
/// holds a null data pointer instead of an indeterminate one.
template <typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size) : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   // Value-initialise the holder: its slot is nullptr until the allocation succeeds,
   // so the deleter never hands a garbage address to cudaFreeHost.
   AFloat **pointer = new AFloat *[1]();

   cudaError_t status = cudaMallocHost(pointer, size * sizeof(AFloat));
   if (status != cudaSuccess) {
      *pointer = nullptr;
      Error("TCudaHostBuffer", "cudaMallocHost of %lu bytes failed: %s",
            static_cast<unsigned long>(size * sizeof(AFloat)), cudaGetErrorString(status));
   }

   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}


//______________________________________________________________________________

/// Convert the buffer to a raw pointer to its first element.
///
/// \return The stored data pointer advanced by fOffset elements, or nullptr when
///         the buffer has no shared handle.
template <typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat *() const
{
   if (!fHostPointer) {
      return nullptr;
   }
   AFloat *base = *fHostPointer;
   return base + fOffset;
}


//______________________________________________________________________________

/// Create a view onto part of this buffer.
///
/// The returned object is a copy of this one (it shares the same handle to the
/// underlying memory) with its offset and size replaced.
///
/// \param offset Offset, in elements, stored in the returned buffer.
/// \param size   Number of elements stored as the size of the returned buffer.
/// \return The re-positioned copy.
template <typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaHostBuffer view(*this);
   // NOTE(review): the offset is stored as given and is not added to this->fOffset,
   // so it is relative to the start of the allocation even when called on a
   // sub-buffer - confirm that callers expect this.
   view.fSize = size;
   view.fOffset = offset;
   return view;
}


//______________________________________________________________________________

/// Set every element of this buffer to a constant value.
///
/// The filled range is the one exposed by the conversion to AFloat *: it starts
/// fOffset elements into the allocation and spans fSize elements, so that calling
/// this on a sub-buffer only touches the sub-buffer. Buffers without storage
/// (no handle or a null data pointer) are left untouched.
///
/// \param constVal Value written to each element.
template <typename AFloat>
void TCudaHostBuffer<AFloat>::SetConstVal(const AFloat constVal)
{
   if (!fHostPointer || *fHostPointer == nullptr) {
      return; // nothing allocated, nothing to fill
   }
   AFloat *first = *fHostPointer + fOffset;
   std::fill(first, first + fSize, constVal);
}


//

// TCudaDeviceBuffer

//______________________________________________________________________________

/// Deleter attached to the shared handle of a device buffer.
///
/// \param devicePointer One-element holder array (allocated with new[] by the
///        constructors) whose single slot stores the device address.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat **devicePointer)
{
   AFloat *deviceData = devicePointer[0];
   cudaFree(deviceData);     // release the device memory first ...
   delete[] devicePointer;   // ... then the holder that stored its address
}


//______________________________________________________________________________

/// Allocate a device buffer of `size` elements and create a stream for it.
///
/// The device address is stored in a one-element holder array owned by a shared
/// handle; the last copy of this object releases it through fDestructor.
///
/// \param size Number of AFloat elements to allocate with cudaMalloc.
///
/// Failures of cudaMalloc or cudaStreamCreate are reported through Error(). After
/// a failed allocation the buffer holds a null data pointer; after a failed stream
/// creation fComputeStream is set to 0.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size) : fOffset(0), fSize(size), fDestructor()
{
   // Value-initialise the holder so the deleter never sees an indeterminate address.
   AFloat **pointer = new AFloat *[1]();

   cudaError_t status = cudaMalloc(pointer, size * sizeof(AFloat));
   if (status != cudaSuccess) {
      *pointer = nullptr;
      Error("TCudaDeviceBuffer", "cudaMalloc of %lu bytes failed: %s",
            static_cast<unsigned long>(size * sizeof(AFloat)), cudaGetErrorString(status));
   }
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);

   // NOTE(review): no matching cudaStreamDestroy is visible in this file - confirm
   // where the stream created here is released.
   status = cudaStreamCreate(&fComputeStream);
   if (status != cudaSuccess) {
      fComputeStream = 0;
      Error("TCudaDeviceBuffer", "cudaStreamCreate failed: %s", cudaGetErrorString(status));
   }
}


//______________________________________________________________________________

/// Allocate a device buffer of `size` elements that uses an existing stream.
///
/// \param size   Number of AFloat elements to allocate with cudaMalloc.
/// \param stream Stream stored in fComputeStream and used by CopyFrom / CopyTo.
///
/// A failed allocation is reported through Error() and leaves the buffer with a
/// null data pointer instead of an indeterminate one.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   // Value-initialise the holder so the deleter never sees an indeterminate address.
   AFloat **pointer = new AFloat *[1]();

   cudaError_t status = cudaMalloc(pointer, size * sizeof(AFloat));
   if (status != cudaSuccess) {
      *pointer = nullptr;
      Error("TCudaDeviceBuffer", "cudaMalloc of %lu bytes failed: %s",
            static_cast<unsigned long>(size * sizeof(AFloat)), cudaGetErrorString(status));
   }
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}


//______________________________________________________________________________

/// Wrap an already existing device pointer in a buffer object.
///
/// \param devicePointer Device address to wrap; no memory is allocated here.
/// \param size          Number of elements recorded as the buffer size.
/// \param stream        Stream stored in fComputeStream.
///
/// NOTE(review): the handle is created with fDestructor, whose call operator in
/// this file passes the stored address to cudaFree - so the wrapped memory appears
/// to be released when the last copy goes away. Confirm callers intend to hand
/// over ownership.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat *devicePointer, size_t size, cudaStream_t stream)
   : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat **holder = new AFloat *[1];
   holder[0] = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(holder, fDestructor);
}


//______________________________________________________________________________

/// Create a view onto part of this device buffer.
///
/// The returned object is a copy of this one (same handle, same stream) with its
/// offset and size replaced.
///
/// \param offset Offset, in elements, stored in the returned buffer.
/// \param size   Number of elements stored as the size of the returned buffer.
/// \return The re-positioned copy.
template <typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset, size_t size)
{
   TCudaDeviceBuffer view(*this);
   // NOTE(review): the offset is stored as given and is not added to this->fOffset -
   // confirm that callers never take a sub-buffer of a sub-buffer.
   view.fSize = size;
   view.fOffset = offset;
   return view;
}


//______________________________________________________________________________

/// Convert the buffer to a raw device pointer to its first element.
///
/// \return The stored device address advanced by fOffset elements, or nullptr when
///         the buffer has no shared handle.
template <typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat *() const
{
   if (!fDevicePointer) {
      return nullptr;
   }
   AFloat *base = *fDevicePointer;
   return base + fOffset;
}


//______________________________________________________________________________

/// Copy fSize elements from a host buffer into this device buffer.
///
/// The call first waits for all work already queued on fComputeStream, then
/// enqueues an asynchronous host-to-device copy on that stream; it does not wait
/// for the copy itself to finish.
///
/// \param buffer Host buffer to read from.
///
/// CUDA failures are reported through Error(); the function still returns normally.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaError_t status = cudaStreamSynchronize(fComputeStream);
   if (status != cudaSuccess) {
      Error("TCudaDeviceBuffer::CopyFrom", "cudaStreamSynchronize failed: %s", cudaGetErrorString(status));
   }

   // Both buffers convert implicitly to raw pointers that already include their offsets.
   status = cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat), cudaMemcpyHostToDevice, fComputeStream);
   if (status != cudaSuccess) {
      Error("TCudaDeviceBuffer::CopyFrom", "cudaMemcpyAsync (host to device) failed: %s",
            cudaGetErrorString(status));
   }
}


//______________________________________________________________________________

/// Copy fSize elements from this device buffer into a host buffer.
///
/// The copy is enqueued asynchronously on fComputeStream and the function returns
/// without waiting for it. The stream is recorded in the host buffer's
/// fComputeStream member.
///
/// \param buffer Host buffer to write to.
///
/// A CUDA failure is reported through Error(); the function still returns normally.
template <typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   // Both buffers convert implicitly to raw pointers that already include their offsets.
   cudaError_t status =
      cudaMemcpyAsync(buffer, *this, fSize * sizeof(AFloat), cudaMemcpyDeviceToHost, fComputeStream);
   if (status != cudaSuccess) {
      Error("TCudaDeviceBuffer::CopyTo", "cudaMemcpyAsync (device to host) failed: %s",
            cudaGetErrorString(status));
   }
   buffer.fComputeStream = fComputeStream;
}


//______________________________________________________________________________

/// Gather the input features of one batch from the input matrix (float backend).
///
/// \param buffer         Host buffer that receives the batch. Element (i, j) of the
///                       batch is written to buffer[j * batchSize + i], i.e. the
///                       batch is laid out column by column.
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   const TMatrixT<Double_t> &features = std::get<0>(fData);
   const size_t nFeatures = features.GetNcols();

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nFeatures; ++col) {
         // the matrix stores Double_t, the buffer is single precision
         buffer[col * batchSize + row] = static_cast<float>(features(sample, col));
      }
   }
}


//______________________________________________________________________________

/// Gather the target values of one batch from the output matrix (float backend).
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nTargets = targets.GetNcols();

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nTargets; ++col) {
         buffer[col * batchSize + row] = static_cast<float>(targets(sample, col));
      }
   }
}


//______________________________________________________________________________

/// Gather the sample weights of one batch from the weight matrix (float backend).
///
/// \param buffer         Host buffer that receives one weight per sample.
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      // only column 0 of the weight matrix is read
      const size_t sample = *sampleIterator;
      buffer[row] = static_cast<float>(weights(sample, 0));
   }
}


//______________________________________________________________________________

/// Gather the input variables of one batch from a list of events (float backend).
///
/// The number of variables per sample is taken from the first event in the data
/// set, so the event list must not be empty.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy.
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                       size_t batchSize)
{
   Event *current = std::get<0>(fData)[0];
   const size_t nVariables = current->GetNVariables();

   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      current = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nVariables; ++col) {
         buffer[col * batchSize + row] = static_cast<float>(current->GetValue(col));
      }
   }
}


//______________________________________________________________________________

/// Build the target values of one batch from a list of events (float backend).
///
/// The number of output columns is derived as buffer.GetSize() / batchSize.
/// For events without regression targets the output encodes the class:
/// a single column holds 1 for signal and 0 otherwise, several columns hold a
/// one-hot encoding of the event class. Events with targets have them copied.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy (used as divisor, must be non-zero).
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / batchSize;

   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nOutputs; ++col) {
         float value;
         if (event->GetNTargets() != 0) {
            // Regression: copy the target as is.
            value = static_cast<float>(event->GetTarget(col));
         } else if (nOutputs == 1) {
            // Binary classification.
            value = (info.IsSignal(event)) ? 1.0 : 0.0;
         } else {
            // Multiclass classification: one-hot on the class index.
            value = (col == event->GetClass()) ? 1.0 : 0.0;
         }
         buffer[col * batchSize + row] = value;
      }
   }
}


//______________________________________________________________________________

/// Gather the event weights of one batch from a list of events (float backend).
///
/// \param buffer         Host buffer that receives one weight per event.
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy.
template <>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyWeights(TCudaHostBuffer<float> &buffer, IndexIterator_t sampleIterator,
                                                         size_t batchSize)
{
   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];
      buffer[row] = static_cast<float>(event->GetWeight());
   }
}


//______________________________________________________________________________

/// Gather the input features of one batch from the input matrix (double backend).
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &features = std::get<0>(fData);
   const size_t nFeatures = features.GetNcols();

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nFeatures; ++col) {
         buffer[col * batchSize + row] = features(sample, col);
      }
   }
}


//______________________________________________________________________________

/// Gather the target values of one batch from the output matrix (double backend).
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                           IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nTargets = targets.GetNcols();

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nTargets; ++col) {
         buffer[col * batchSize + row] = targets(sample, col);
      }
   }
}


//______________________________________________________________________________

/// Gather the sample weights of one batch from the weight matrix (double backend).
///
/// \param buffer         Host buffer that receives one weight per sample.
/// \param sampleIterator Iterator over the row indices of the samples to copy.
/// \param batchSize      Number of samples to copy.
template <>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                            IndexIterator_t sampleIterator, size_t batchSize)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   for (size_t row = 0; row < batchSize; ++row, ++sampleIterator) {
      // only column 0 of the weight matrix is read
      const size_t sample = *sampleIterator;
      buffer[row] = static_cast<double>(weights(sample, 0));
   }
}


//______________________________________________________________________________

/// Gather the input variables of one batch from a list of events (double backend).
///
/// The number of variables per sample is taken from the first event in the data
/// set, so the event list must not be empty.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy.
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(TCudaHostBuffer<double> &buffer, IndexIterator_t sampleIterator,
                                                        size_t batchSize)
{
   Event *current = std::get<0>(fData)[0];
   const size_t nVariables = current->GetNVariables();

   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      current = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nVariables; ++col) {
         buffer[col * batchSize + row] = current->GetValue(col);
      }
   }
}


//______________________________________________________________________________

/// Build the target values of one batch from a list of events (double backend).
///
/// The number of output columns is derived as buffer.GetSize() / batchSize.
/// For events without regression targets the output encodes the class:
/// a single column holds 1 for signal and 0 otherwise, several columns hold a
/// one-hot encoding of the event class. Events with targets have them copied.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * batchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy (used as divisor, must be non-zero).
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(TCudaHostBuffer<double> &buffer,
                                                         IndexIterator_t sampleIterator, size_t batchSize)
{
   const DataSetInfo &info = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / batchSize;

   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nOutputs; ++col) {
         double value;
         if (event->GetNTargets() != 0) {
            // Regression: copy the target as is.
            value = event->GetTarget(col);
         } else if (nOutputs == 1) {
            // Binary classification.
            value = (info.IsSignal(event)) ? 1.0 : 0.0;
         } else {
            // Multiclass classification: one-hot on the class index.
            value = (col == event->GetClass()) ? 1.0 : 0.0;
         }
         buffer[col * batchSize + row] = value;
      }
   }
}


//______________________________________________________________________________

/// Gather the event weights of one batch from a list of events (double backend).
///
/// \param buffer         Host buffer that receives one weight per event.
/// \param sampleIterator Iterator over the indices of the events to copy.
/// \param batchSize      Number of events to copy.
template <>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyWeights(TCudaHostBuffer<double> &buffer,
                                                          IndexIterator_t sampleIterator, size_t batchSize)
{
   for (size_t row = 0; row < batchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];
      buffer[row] = static_cast<double>(event->GetWeight());
   }
}


//______________________________________________________________________________

/// Gather the input tensor of one batch from a vector of matrices (float backend).
///
/// Two layouts are handled:
///  - fBatchDepth == 1: fBatchHeight samples are read as rows of the first input
///    matrix and written to buffer[j * fBatchHeight + i].
///  - otherwise: fBatchDepth samples are read, each one being a whole matrix of
///    the input vector; entry (j, k) of slice i is written to
///    buffer[i * fBatchHeight * fBatchWidth + k * fBatchHeight + j].
///
/// \param buffer         Host buffer that receives the batch.
/// \param sampleIterator Iterator over the indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &tensor = std::get<0>(fData);

   if (fBatchDepth != 1) {
      const size_t sliceSize = fBatchHeight * fBatchWidth;
      for (size_t slice = 0; slice < fBatchDepth; ++slice, ++sampleIterator) {
         const size_t sample = *sampleIterator;
         for (size_t row = 0; row < fBatchHeight; ++row) {
            for (size_t col = 0; col < fBatchWidth; ++col) {
               buffer[slice * sliceSize + col * fBatchHeight + row] = static_cast<float>(tensor[sample](row, col));
            }
         }
      }
      return;
   }

   for (size_t row = 0; row < fBatchHeight; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < fBatchWidth; ++col) {
         buffer[col * fBatchHeight + row] = static_cast<float>(tensor[0](sample, col));
      }
   }
}


//______________________________________________________________________________

/// Gather the target values of one batch from the output matrix (float backend).
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * fBatchSize + i].
/// \param sampleIterator Iterator over the row indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nTargets = targets.GetNcols();

   for (size_t row = 0; row < fBatchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nTargets; ++col) {
         buffer[col * fBatchSize + row] = static_cast<float>(targets(sample, col));
      }
   }
}


//______________________________________________________________________________

/// Gather the sample weights of one batch from the weight matrix (float backend).
///
/// \param buffer         Host buffer that receives one weight per sample.
/// \param sampleIterator Iterator over the row indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   for (size_t row = 0; row < fBatchSize; ++row, ++sampleIterator) {
      // only column 0 of the weight matrix is read
      const size_t sample = *sampleIterator;
      buffer[row] = static_cast<float>(weights(sample, 0));
   }
}


//______________________________________________________________________________

/// Gather the input tensor of one batch from a list of events (float backend).
///
/// Two layouts are supported:
///  - fBatchDepth == 1 and fBatchHeight == fBatchSize: every event is one row;
///    variable j of sample i is written to buffer[j * fBatchHeight + i].
///  - fBatchDepth == fBatchSize: every event is one fBatchHeight x fBatchWidth
///    slice; the event's variables are read in row-major order
///    (index j * fBatchWidth + k) and written in column-major order within the slice.
///
/// Any other combination is reported through Error(), including the offending
/// dimensions, and then trips R__ASSERT.
///
/// \param buffer         Host buffer that receives the batch.
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                   IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // one event, one example in the batch
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // batchDepth is batch size
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // because of the column-major ordering
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      // Report the dimensions as part of the error message; the previous unseparated
      // stream output ran the three numbers together and was unreadable.
      Error("TTensorDataLoader",
            "Inconsistency between batch depth and batch size (batch depth = %lu, batch size = %lu, batch height = %lu)",
            static_cast<unsigned long>(fBatchDepth), static_cast<unsigned long>(fBatchSize),
            static_cast<unsigned long>(fBatchHeight));
      R__ASSERT(0);
   }
}


//______________________________________________________________________________

/// Build the target values of one batch from a list of events (float backend).
///
/// The number of output columns is derived as buffer.GetSize() / fBatchSize.
/// For events without regression targets the output encodes the class:
/// a single column holds 1 for signal and 0 otherwise, several columns hold a
/// one-hot encoding of the event class. Events with targets have them copied.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * fBatchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / fBatchSize;

   for (size_t row = 0; row < fBatchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nOutputs; ++col) {
         Float_t value;
         if (event->GetNTargets() != 0) {
            // Regression: copy the target as is.
            value = static_cast<Float_t>(event->GetTarget(col));
         } else if (nOutputs == 1) {
            // Binary classification.
            value = (info.IsSignal(event)) ? 1.0 : 0.0;
         } else {
            // Multiclass classification: one-hot on the class index.
            value = (col == event->GetClass()) ? 1.0 : 0.0;
         }
         buffer[col * fBatchSize + row] = value;
      }
   }
}


//______________________________________________________________________________

/// Gather the event weights of one batch from a list of events (float backend).
///
/// \param buffer         Host buffer that receives one weight per event.
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<float>>::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   for (size_t row = 0; row < fBatchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];
      buffer[row] = event->GetWeight();
   }
}


//______________________________________________________________________________

/// Gather the input tensor of one batch from a vector of matrices (double backend).
///
/// Two layouts are handled:
///  - fBatchDepth == 1: fBatchHeight samples are read as rows of the first input
///    matrix and written to buffer[j * fBatchHeight + i].
///  - otherwise: fBatchDepth samples are read, each one being a whole matrix of
///    the input vector; entry (j, k) of slice i is written to
///    buffer[i * fBatchHeight * fBatchWidth + k * fBatchHeight + j].
///
/// Values are copied at full double precision; they are no longer routed through
/// a float cast, which rounded every input to single precision.
///
/// \param buffer         Host buffer that receives the batch.
/// \param sampleIterator Iterator over the indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t>> &inputTensor = std::get<0>(fData);

   if (fBatchDepth == 1) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = inputTensor[0](sampleIndex, j);
         }
         sampleIterator++;
      }
   } else {
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // column-major within each slice
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = inputTensor[sampleIndex](j, k);
            }
         }
         sampleIterator++;
      }
   }
}


//______________________________________________________________________________

/// Gather the target values of one batch from the output matrix (double backend).
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * fBatchSize + i].
/// \param sampleIterator Iterator over the row indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nTargets = targets.GetNcols();

   for (size_t row = 0; row < fBatchSize; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nTargets; ++col) {
         buffer[col * fBatchSize + row] = targets(sample, col);
      }
   }
}


//______________________________________________________________________________

/// Gather the sample weights of one batch from the weight matrix (double backend).
///
/// \param buffer         Host buffer that receives one weight per sample.
/// \param sampleIterator Iterator over the row indices of the samples to copy.
template <>
void TTensorDataLoader<TensorInput, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   for (size_t row = 0; row < fBatchSize; ++row, ++sampleIterator) {
      // only column 0 of the weight matrix is read
      const size_t sample = *sampleIterator;
      buffer[row] = weights(sample, 0);
   }
}


//______________________________________________________________________________

/// Gather the input tensor of one batch from a list of events (double backend).
///
/// Two layouts are supported:
///  - fBatchDepth == 1 and fBatchHeight == fBatchSize: every event is one row;
///    variable j of sample i is written to buffer[j * fBatchHeight + i].
///  - fBatchDepth == fBatchSize: every event is one fBatchHeight x fBatchWidth
///    slice; the event's variables are read in row-major order
///    (index j * fBatchWidth + k) and written in column-major order within the slice.
///
/// Any other combination is reported through Error(), including the offending
/// dimensions, and then trips R__ASSERT.
///
/// \param buffer         Host buffer that receives the batch.
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                    IndexIterator_t sampleIterator)
{
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      // one event, one example in the batch
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   } else if (fBatchDepth == fBatchSize) {
      // batchDepth is batch size
      for (size_t i = 0; i < fBatchDepth; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // because of the column-major ordering
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + k * fBatchHeight + j;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      // Report the dimensions as part of the error message; the previous unseparated
      // stream output ran the three numbers together and was unreadable.
      Error("TTensorDataLoader",
            "Inconsistency between batch depth and batch size (batch depth = %lu, batch size = %lu, batch height = %lu)",
            static_cast<unsigned long>(fBatchDepth), static_cast<unsigned long>(fBatchSize),
            static_cast<unsigned long>(fBatchHeight));
      R__ASSERT(0);
   }
}


//______________________________________________________________________________

/// Build the target values of one batch from a list of events (double backend).
///
/// The number of output columns is derived as buffer.GetSize() / fBatchSize.
/// For events without regression targets the output encodes the class:
/// a single column holds 1 for signal and 0 otherwise, several columns hold a
/// one-hot encoding of the event class. Events with targets have them copied.
///
/// \param buffer         Host buffer that receives the batch; element (i, j) is
///                       written to buffer[j * fBatchSize + i].
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const DataSetInfo &info = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / fBatchSize;

   for (size_t row = 0; row < fBatchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];

      for (size_t col = 0; col < nOutputs; ++col) {
         Double_t value;
         if (event->GetNTargets() != 0) {
            // Regression: copy the target as is.
            value = static_cast<Double_t>(event->GetTarget(col));
         } else if (nOutputs == 1) {
            // Binary classification.
            value = (info.IsSignal(event)) ? 1.0 : 0.0;
         } else {
            // Multiclass classification: one-hot on the class index.
            value = (col == event->GetClass()) ? 1.0 : 0.0;
         }
         buffer[col * fBatchSize + row] = value;
      }
   }
}


//______________________________________________________________________________

/// Gather the event weights of one batch from a list of events (double backend).
///
/// \param buffer         Host buffer that receives one weight per event.
/// \param sampleIterator Iterator over the indices of the events to copy.
template <>
void TTensorDataLoader<TMVAInput_t, TCuda<Double_t>>::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   for (size_t row = 0; row < fBatchSize; ++row) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *event = std::get<0>(fData)[sample];
      buffer[row] = event->GetWeight();
   }
}


#if 0

//______________________________________________________________________________

/// Copy the next batch to the device and wrap it in matrix objects
/// (TensorInput, float backend).
///
/// NOTE: this definition sits inside an `#if 0` region and is currently not compiled.
///
/// The input device buffer is cut into fBatchSize consecutive sub-buffers of
/// fBatchHeight * fBatchWidth elements, each wrapped as one matrix of the input
/// tensor. Advances fBatchIndex by one.
///
/// \return The batch made of the input tensor, the output matrix and the weight matrix.
template <>
TTensorBatch<TCuda<float> > TTensorDataLoader<TensorInput, TCuda<float> >::GetTensorBatch()
{
   // After copying the data to the device, wrap the device buffer in the respective
   // architectures matrix type
   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();

   // Start from an empty vector and append one matrix per sample, as the other
   // GetTensorBatch specialisations do (the previous declaration was malformed).
   std::vector<Matrix_t> inputTensor;
   size_t jump = fBatchHeight * fBatchWidth;
   for (size_t i = 0; i < fBatchSize; i++) {
      DeviceBuffer_t subInputDeviceBuffer = std::get<0>(DeviceBuffers).GetSubBuffer(i * jump, jump);
      inputTensor.emplace_back(subInputDeviceBuffer, fBatchHeight, fBatchWidth);
   }
   Matrix_t outputMatrix(std::get<1>(DeviceBuffers), fBatchSize, fNOutputFeatures);
   Matrix_t weightMatrix(std::get<2>(DeviceBuffers), fBatchSize, fNOutputFeatures);

   fBatchIndex++;
   return TTensorBatch<TCuda<float>>(inputTensor, outputMatrix, weightMatrix);
}


//______________________________________________________________________________

/// Copy the next batch to the device and wrap it in matrix objects
/// (TensorInput, double backend).
///
/// NOTE: this definition sits inside an `#if 0` region and is currently not compiled.
///
/// The input device buffer is cut into fBatchSize consecutive sub-buffers of
/// fBatchHeight * fBatchWidth elements, each wrapped as one matrix of the input
/// tensor. Advances fBatchIndex by one.
///
/// \return The batch made of the input tensor, the output matrix and the weight matrix.
template <>
TTensorBatch<TCuda<double> > TTensorDataLoader<TensorInput, TCuda<double> >::GetTensorBatch()
{
   // Transfer first, then wrap the resulting device buffers.
   DeviceBufferTuple deviceData = CopyTensorBatches();

   const size_t sliceSize = fBatchHeight * fBatchWidth;
   std::vector<Matrix_t> slices;
   for (size_t sample = 0; sample < fBatchSize; ++sample) {
      DeviceBuffer_t sliceBuffer = std::get<0>(deviceData).GetSubBuffer(sample * sliceSize, sliceSize);
      slices.emplace_back(sliceBuffer, fBatchHeight, fBatchWidth);
   }

   Matrix_t outputs(std::get<1>(deviceData), fBatchSize, fNOutputFeatures);
   Matrix_t weights(std::get<2>(deviceData), fBatchSize, fNOutputFeatures);

   fBatchIndex++;
   return TTensorBatch<TCuda<double>>(slices, outputs, weights);
}


//______________________________________________________________________________

/// Copy the next batch to the device and wrap it in matrix objects
/// (TMVAInput_t, float backend).
///
/// NOTE: this definition sits inside an `#if 0` region and is currently not compiled.
///
/// The input device buffer is cut into fBatchSize consecutive sub-buffers of
/// fBatchHeight * fBatchWidth elements, each wrapped as one matrix of the input
/// tensor. Advances fBatchIndex by one.
///
/// \return The batch made of the input tensor, the output matrix and the weight matrix.
template <>
TTensorBatch<TCuda<float> > TTensorDataLoader<TMVAInput_t, TCuda<float> >::GetTensorBatch()
{
   // Transfer first, then wrap the resulting device buffers.
   DeviceBufferTuple deviceData = CopyTensorBatches();

   const size_t sliceSize = fBatchHeight * fBatchWidth;
   std::vector<Matrix_t> slices;
   for (size_t sample = 0; sample < fBatchSize; ++sample) {
      DeviceBuffer_t sliceBuffer = std::get<0>(deviceData).GetSubBuffer(sample * sliceSize, sliceSize);
      slices.emplace_back(sliceBuffer, fBatchHeight, fBatchWidth);
   }

   Matrix_t outputs(std::get<1>(deviceData), fBatchSize, fNOutputFeatures);
   Matrix_t weights(std::get<2>(deviceData), fBatchSize, fNOutputFeatures);

   fBatchIndex++;
   return TTensorBatch<TCuda<float>>(slices, outputs, weights);
}


//______________________________________________________________________________

/// Copy the next batch to the device and wrap it in matrix objects
/// (TMVAInput_t, double backend).
///
/// NOTE: this definition sits inside an `#if 0` region and is currently not compiled.
///
/// The input device buffer is cut into fBatchSize consecutive sub-buffers of
/// fBatchHeight * fBatchWidth elements, each wrapped as one matrix of the input
/// tensor. Advances fBatchIndex by one.
///
/// \return The batch made of the input tensor, the output matrix and the weight matrix.
template <>
TTensorBatch<TCuda<double> > TTensorDataLoader<TMVAInput_t, TCuda<double> >::GetTensorBatch()
{
   // Transfer first, then wrap the resulting device buffers.
   DeviceBufferTuple deviceData = CopyTensorBatches();

   const size_t sliceSize = fBatchHeight * fBatchWidth;
   std::vector<Matrix_t> slices;
   for (size_t sample = 0; sample < fBatchSize; ++sample) {
      DeviceBuffer_t sliceBuffer = std::get<0>(deviceData).GetSubBuffer(sample * sliceSize, sliceSize);
      slices.emplace_back(sliceBuffer, fBatchHeight, fBatchWidth);
   }

   Matrix_t outputs(std::get<1>(deviceData), fBatchSize, fNOutputFeatures);
   Matrix_t weights(std::get<2>(deviceData), fBatchSize, fNOutputFeatures);

   fBatchIndex++;
   return TTensorBatch<TCuda<double>>(slices, outputs, weights);
}

#endif


// see file Cudnn/TensorDataLoader.cxx for Cudnn definitions


//______________________________________________________________________________

// Explicit Instantiations.


// Device buffers, single and double precision.
template class TCudaDeviceBuffer<float>;

template class TCudaDeviceBuffer<double>;


// Host buffers, single and double precision.
template class TCudaHostBuffer<float>;

template class TCudaHostBuffer<double>;


// Matrix data loaders for both input kinds (matrix tuples and TMVA events) and
// both precisions; the Copy* members specialised above belong to these classes.
template class TDataLoader<MatrixInput_t, TCuda<float>>;

template class TDataLoader<TMVAInput_t, TCuda<float>>;

template class TDataLoader<MatrixInput_t, TCuda<double>>;

template class TDataLoader<TMVAInput_t, TCuda<double>>;


// Tensor data loaders for both input kinds and both precisions; the CopyTensor*
// members specialised above belong to these classes.
template class TTensorDataLoader<TensorInput, TCuda<float> >;

template class TTensorDataLoader<TMVAInput_t, TCuda<float> >;

template class TTensorDataLoader<TensorInput, TCuda<double >>;

template class TTensorDataLoader<TMVAInput_t, TCuda<double> >;


} // namespace DNN

} // namespace TMVA

CudaBuffers.h

Cuda.h

DataLoader.h

DataSetInfo.h

fSize
dim_t fSize
Definition DeclareExecutors.h:184

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

Float_t
float Float_t
Definition RtypesCore.h:57

TRangeDynCast
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Definition TCollection.h:358

TCudnn.h

R__ASSERT
#define R__ASSERT(e)
Definition TError.h:118

Error
void Error(const char *location, const char *msgfmt,...)
Use this function in case an error occurred.
Definition TError.cxx:185

offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Definition TGWin32VirtualXProxy.cxx:245

TensorDataLoader.h

ROOT::Detail::TRangeCast
Definition TCollection.h:311

ROOT::Internal::TypedIter
Definition RRangeCast.hxx:78

TMVA::DNN::TCudaDeviceBuffer
TCudaDeviceBuffer.
Definition CudaBuffers.h:100

TMVA::DNN::TCudaDeviceBuffer::fOffset
size_t fOffset
Offset for sub-buffers.
Definition CudaBuffers.h:103

TMVA::DNN::TCudaDeviceBuffer::fSize
size_t fSize
Definition CudaBuffers.h:104

TMVA::DNN::TCudaDeviceBuffer::CopyFrom
void CopyFrom(const TCudaHostBuffer< AFloat > &) const
Definition CudaBuffers.cxx:134

TMVA::DNN::TCudaDeviceBuffer::CopyTo
void CopyTo(const TCudaHostBuffer< AFloat > &) const
Definition CudaBuffers.cxx:142

TMVA::DNN::TCudaDeviceBuffer::fDestructor
struct TMVA::DNN::TCudaDeviceBuffer::TDestructor fDestructor

TMVA::DNN::TCudaDeviceBuffer::GetSubBuffer
TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Definition CudaBuffers.cxx:117

TMVA::DNN::TCudaDeviceBuffer::fComputeStream
cudaStream_t fComputeStream
cudaStream for data transfer
Definition CudaBuffers.h:105

TMVA::DNN::TCudaDeviceBuffer::fDevicePointer
std::shared_ptr< AFloat * > fDevicePointer
Pointer to the buffer data.
Definition CudaBuffers.h:106

TMVA::DNN::TCudaDeviceBuffer::TCudaDeviceBuffer
TCudaDeviceBuffer()=default

TMVA::DNN::TCudaHostBuffer
TCudaHostBuffer.
Definition CudaBuffers.h:43

TMVA::DNN::TCudaHostBuffer::fDestructor
struct TMVA::DNN::TCudaHostBuffer::TDestructor fDestructor

TMVA::DNN::TCudaHostBuffer::fOffset
size_t fOffset
Offset for sub-buffers.
Definition CudaBuffers.h:46

TMVA::DNN::TCudaHostBuffer::GetSubBuffer
TCudaHostBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Definition CudaBuffers.cxx:60

TMVA::DNN::TCudaHostBuffer::TCudaHostBuffer
TCudaHostBuffer()=default

TMVA::DNN::TCudaHostBuffer::fComputeStream
cudaStream_t fComputeStream
cudaStream for data transfer
Definition CudaBuffers.h:48

TMVA::DNN::TCudaHostBuffer::SetConstVal
void SetConstVal(const AFloat constVal)
Sets the entire buffer to a constant value.
Definition CudaBuffers.cxx:70

TMVA::DNN::TCudaHostBuffer::fHostPointer
std::shared_ptr< AFloat * > fHostPointer
Pointer to the buffer data.
Definition CudaBuffers.h:49

TMVA::DNN::TCudaHostBuffer::fSize
size_t fSize
Definition CudaBuffers.h:47

TMVA::DataSetInfo
Class that contains all the data information.
Definition DataSetInfo.h:62

TMVA::Event
Definition Event.h:51

double

n
const Int_t n
Definition legend1.C:16

for
for(Int_t i=0;i< n;i++)
Definition legend1.C:18

TMVA
create variable transformations
Definition GeneticMinimizer.h:22

TMVA::DNN::TCudaDeviceBuffer::TDestructor::operator()
void operator()(AFloat **devicePointer)
Definition CudaBuffers.cxx:79

TMVA::DNN::TCudaHostBuffer::TDestructor::operator()
void operator()(AFloat **devicePointer)
Definition CudaBuffers.cxx:36