doc/hackathon/Cudnn_2TensorDataLoader_8cxx_source.html

// @(#)root/tmva/tmva/dnn:$Id$

// Author: Lorenzo Moneta,


////////////////////////////////////////////////////////////////////////

// Implementation of TensorDataLoader functions for CUDA with CuDNN architecture.  //

////////////////////////////////////////////////////////////////////////


#include "TMVA/DataSetInfo.h"


#include "TMVA/DNN/TensorDataLoader.h"

#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"


#include "TMVA/DNN/Architectures/TCudnn.h"


#include "cuda_runtime.h"

#include <algorithm>


namespace TMVA {

namespace DNN {


//______________________________________________________________________________

//

// cuDNN

//______________________________________________________________________________

// Fill the host buffer with one batch of input values taken from the vector of
// matrices stored as element 0 of fData, converting each value to float.
//
//  - fBatchDepth == 1: every sample index selects a row of the first matrix;
//    element (sample i, feature j) is written at j * fBatchHeight + i, i.e. the
//    (fBatchHeight x fBatchWidth) batch is laid out column-major.
//  - otherwise: every sample index selects a whole matrix of the tensor; slice i
//    starts at i * fBatchHeight * fBatchWidth and is also written column-major.
//
// sampleIterator is advanced once per sample read.
template <>
void TTensorDataLoader<TensorInput, TCudnn<float> >::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t> > &tensor = std::get<0>(fData);

   if (fBatchDepth != 1) {
      // One matrix of the tensor per sample.
      for (size_t slice = 0; slice < fBatchDepth; ++slice, ++sampleIterator) {
         const size_t sample = *sampleIterator;
         for (size_t row = 0; row < fBatchHeight; ++row) {
            for (size_t col = 0; col < fBatchWidth; ++col) {
               const size_t pos = slice * fBatchHeight * fBatchWidth + col * fBatchHeight + row;
               buffer[pos] = static_cast<float>(tensor[sample](row, col));
            }
         }
      }
      return;
   }

   // Single matrix: one row of tensor[0] per sample.
   for (size_t row = 0; row < fBatchHeight; ++row, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < fBatchWidth; ++col) {
         buffer[col * fBatchHeight + row] = static_cast<float>(tensor[0](sample, col));
      }
   }
}


//______________________________________________________________________________

// Fill the host buffer with the target values of one batch, read from the
// matrix stored as element 1 of fData and converted to float.
// Value (batch slot i, output column j) is written at j * fBatchSize + i.
// sampleIterator is advanced once per batch slot.
template <>
void TTensorDataLoader<TensorInput, TCudnn<float> >::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nColumns = targets.GetNcols();

   for (size_t slot = 0; slot < fBatchSize; ++slot, ++sampleIterator) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nColumns; ++col) {
         buffer[col * fBatchSize + slot] = static_cast<float>(targets(sample, col));
      }
   }
}


//______________________________________________________________________________

// Fill the first fBatchSize entries of the host buffer with the sample weights,
// taken from column 0 of the matrix stored as element 2 of fData and converted
// to float. sampleIterator is advanced once per batch slot.
template <>
void TTensorDataLoader<TensorInput, TCudnn<float> >::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   size_t slot = 0;
   while (slot < fBatchSize) {
      buffer[slot] = static_cast<float>(weights(*sampleIterator, 0));
      ++sampleIterator;
      ++slot;
   }
}


//______________________________________________________________________________

// Fill the host buffer with the input values of one batch of TMVA events
// (element 0 of fData is indexed by sample index and yields an Event pointer).
//
// Two layouts are supported:
//  - fBatchDepth == 1 and fBatchHeight == fBatchSize: one event per row of a
//    single (fBatchHeight x fBatchWidth) matrix; value j of batch slot i is
//    written at j * fBatchHeight + i.
//  - fBatchDepth == fBatchSize: one event per slice; value (j, k) of batch slot i
//    is written at i * fBatchHeight * fBatchWidth + j * fBatchWidth + k.
//
// Any other combination is reported through Error() followed by R__ASSERT(0).
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<float> >::CopyTensorInput(TCudaHostBuffer<float> &buffer,
                                                                     IndexIterator_t sampleIterator)
{
   // Image has channel depth 1 -> they are ordered as row-vectors in a matrix (batchHeight = batchSize)
   // one event, one example in the batch
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t i = 0; i < fBatchHeight; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchWidth; j++) {
            size_t bufferIndex = j * fBatchHeight + i;
            buffer[bufferIndex] = event->GetValue(j);
         }
         sampleIterator++;
      }
   // A batch is made up by a single image with its channels
   } else if (fBatchDepth == fBatchSize) {
      for (size_t i = 0; i < fBatchSize; i++) {
         size_t sampleIndex = *sampleIterator;
         Event *event = std::get<0>(fData)[sampleIndex];
         for (size_t j = 0; j < fBatchHeight; j++) {
            for (size_t k = 0; k < fBatchWidth; k++) {
               // Cudnn order is NCHW
               size_t bufferIndex = i * fBatchHeight * fBatchWidth + j * fBatchWidth + k;
               buffer[bufferIndex] = event->GetValue(j * fBatchWidth + k);
            }
         }
         sampleIterator++;
      }
   } else {
      // Report the offending sizes as part of the error itself instead of
      // dumping them, unseparated, on stdout.
      Error("TTensorDataLoader",
            "Inconsistency between batch depth and batch size (depth = %lu, size = %lu, height = %lu)",
            static_cast<unsigned long>(fBatchDepth), static_cast<unsigned long>(fBatchSize),
            static_cast<unsigned long>(fBatchHeight));
      R__ASSERT(0);
   }
}


//______________________________________________________________________________

// Fill the host buffer with the target values of one batch of TMVA events.
// The number of outputs per event is buffer.GetSize() / fBatchSize, and output j
// of batch slot i is written at j * fBatchSize + i.
//
//  - event has targets (GetNTargets() != 0): target j is copied, cast to Float_t.
//  - event has no targets and there is a single output: 1 if the DataSetInfo
//    (element 1 of fData) reports the event as signal, 0 otherwise.
//  - event has no targets and several outputs: 1 in the slot whose index equals
//    event->GetClass(), 0 elsewhere.
//
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<float> >::CopyTensorOutput(TCudaHostBuffer<float> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const DataSetInfo &dsInfo = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / fBatchSize;

   for (size_t slot = 0; slot < fBatchSize; ++slot) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *ev = std::get<0>(fData)[sample];

      for (size_t out = 0; out < nOutputs; ++out) {
         const size_t pos = out * fBatchSize + slot;

         // Regression: copy the target value as is.
         if (ev->GetNTargets() != 0) {
            buffer[pos] = static_cast<Float_t>(ev->GetTarget(out));
            continue;
         }

         // Classification.
         if (nOutputs == 1) {
            // Binary.
            buffer[pos] = (dsInfo.IsSignal(ev)) ? 1.0 : 0.0;
         } else {
            // Multiclass.
            buffer[pos] = (out == ev->GetClass()) ? 1.0 : 0.0;
         }
      }
   }
}


//______________________________________________________________________________

// Fill the first fBatchSize entries of the host buffer with the weights of the
// events selected by sampleIterator (event->GetWeight()).
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<float> >::CopyTensorWeights(TCudaHostBuffer<float> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   for (size_t slot = 0; slot < fBatchSize; ++slot) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *ev = std::get<0>(fData)[sample];
      buffer[slot] = ev->GetWeight();
   }
}


//______________________________________________________________________________

// Fill the host buffer with one batch of input values taken from the vector of
// matrices stored as element 0 of fData (double precision).
//
//  - fBatchDepth == 1: every sample index selects a row of the first matrix;
//    element (sample i, feature j) is written at j * fBatchHeight + i.
//  - otherwise: every sample index selects a whole matrix of the tensor; slice i
//    starts at i * fBatchHeight * fBatchWidth and element (j, k) is written at
//    offset k * fBatchHeight + j inside the slice.
//
// sampleIterator is advanced once per sample read.
template <>
void TTensorDataLoader<TensorInput, TCudnn<double> >::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   const std::vector<TMatrixT<Double_t> > &tensor = std::get<0>(fData);
   const bool singleMatrix = (fBatchDepth == 1);

   if (singleMatrix) {
      size_t row = 0;
      while (row < fBatchHeight) {
         const size_t sample = *sampleIterator;
         for (size_t col = 0; col < fBatchWidth; ++col) {
            buffer[col * fBatchHeight + row] = static_cast<double>(tensor[0](sample, col));
         }
         ++sampleIterator;
         ++row;
      }
   } else {
      size_t slice = 0;
      while (slice < fBatchDepth) {
         const size_t sample = *sampleIterator;
         for (size_t row = 0; row < fBatchHeight; ++row) {
            for (size_t col = 0; col < fBatchWidth; ++col) {
               const size_t pos = slice * fBatchHeight * fBatchWidth + col * fBatchHeight + row;
               buffer[pos] = static_cast<double>(tensor[sample](row, col));
            }
         }
         ++sampleIterator;
         ++slice;
      }
   }
}


//______________________________________________________________________________

// Fill the host buffer with the target values of one batch, read from the
// matrix stored as element 1 of fData (double precision).
// Value (batch slot i, output column j) is written at j * fBatchSize + i.
// sampleIterator is advanced once per batch slot.
template <>
void TTensorDataLoader<TensorInput, TCudnn<double> >::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &targets = std::get<1>(fData);
   const size_t nColumns = targets.GetNcols();

   size_t slot = 0;
   while (slot < fBatchSize) {
      const size_t sample = *sampleIterator;
      for (size_t col = 0; col < nColumns; ++col) {
         buffer[col * fBatchSize + slot] = targets(sample, col);
      }
      ++sampleIterator;
      ++slot;
   }
}


//______________________________________________________________________________

// Fill the first fBatchSize entries of the host buffer with the sample weights,
// taken from column 0 of the matrix stored as element 2 of fData.
// sampleIterator is advanced once per batch slot.
template <>
void TTensorDataLoader<TensorInput, TCudnn<double> >::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                        IndexIterator_t sampleIterator)
{
   const TMatrixT<Double_t> &weights = std::get<2>(fData);

   for (size_t slot = 0; slot < fBatchSize; ++slot, ++sampleIterator) {
      buffer[slot] = weights(*sampleIterator, 0);
   }
}


//______________________________________________________________________________

// Fill the host buffer with the input values of one batch of TMVA events
// (element 0 of fData is indexed by sample index and yields an Event pointer).
//
//  - fBatchDepth == 1 and fBatchHeight == fBatchSize: one event per row; value j
//    of batch slot i is written at j * fBatchHeight + i.
//  - fBatchDepth == fBatchSize: one event per slice; value (j, k) of batch slot i
//    is written at i * fBatchHeight * fBatchWidth + j * fBatchWidth + k, i.e.
//    each slice is stored row by row.
//
// Any other combination is reported through Error() followed by R__ASSERT(0).
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<double> >::CopyTensorInput(TCudaHostBuffer<double> &buffer,
                                                                      IndexIterator_t sampleIterator)
{
   // one event, one example in the batch
   if (fBatchDepth == 1 && fBatchHeight == fBatchSize) {
      for (size_t row = 0; row < fBatchHeight; ++row, ++sampleIterator) {
         Event *ev = std::get<0>(fData)[*sampleIterator];
         for (size_t col = 0; col < fBatchWidth; ++col) {
            buffer[col * fBatchHeight + row] = ev->GetValue(col);
         }
      }
      return;
   }

   if (fBatchDepth != fBatchSize) {
      Error("TTensorDataLoader","Inconsistency between batch depth and batch size");
      R__ASSERT(0);
      return;
   }

   // batchDepth is batch size: one event fills one (height x width) slice
   for (size_t slice = 0; slice < fBatchDepth; ++slice, ++sampleIterator) {
      Event *ev = std::get<0>(fData)[*sampleIterator];
      for (size_t row = 0; row < fBatchHeight; ++row) {
         for (size_t col = 0; col < fBatchWidth; ++col) {
            // the event values are read in the same row-by-row order they are written
            const size_t pos = slice * fBatchHeight * fBatchWidth + row * fBatchWidth + col;
            buffer[pos] = ev->GetValue(row * fBatchWidth + col);
         }
      }
   }
}


//______________________________________________________________________________

// Fill the host buffer with the target values of one batch of TMVA events.
// The number of outputs per event is buffer.GetSize() / fBatchSize, and output j
// of batch slot i is written at j * fBatchSize + i.
//
//  - event has targets (GetNTargets() != 0): target j is copied, cast to Double_t.
//  - event has no targets and there is a single output: 1 if the DataSetInfo
//    (element 1 of fData) reports the event as signal, 0 otherwise.
//  - event has no targets and several outputs: 1 in the slot whose index equals
//    event->GetClass(), 0 elsewhere.
//
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<double> >::CopyTensorOutput(TCudaHostBuffer<double> &buffer,
                                                                       IndexIterator_t sampleIterator)
{
   const DataSetInfo &dsInfo = std::get<1>(fData);
   const size_t nOutputs = buffer.GetSize() / fBatchSize;

   // Copy target(s).
   size_t slot = 0;
   while (slot < fBatchSize) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *ev = std::get<0>(fData)[sample];

      for (size_t out = 0; out < nOutputs; ++out) {
         const size_t pos = out * fBatchSize + slot;

         if (ev->GetNTargets() != 0) {
            // Regression.
            buffer[pos] = static_cast<Double_t>(ev->GetTarget(out));
         } else if (nOutputs == 1) {
            // Binary classification.
            buffer[pos] = (dsInfo.IsSignal(ev)) ? 1.0 : 0.0;
         } else {
            // Multiclass classification.
            buffer[pos] = (out == ev->GetClass()) ? 1.0 : 0.0;
         }
      }
      ++slot;
   }
}


//______________________________________________________________________________

// Fill the first fBatchSize entries of the host buffer with the weights of the
// events selected by sampleIterator (event->GetWeight()).
// sampleIterator is advanced once per event read.
template <>
void TTensorDataLoader<TMVAInput_t, TCudnn<double> >::CopyTensorWeights(TCudaHostBuffer<double> &buffer,
                                                                        IndexIterator_t sampleIterator)
{
   size_t slot = 0;
   while (slot < fBatchSize) {
      const size_t sample = *sampleIterator;
      ++sampleIterator;
      Event *ev = std::get<0>(fData)[sample];
      buffer[slot] = ev->GetWeight();
      ++slot;
   }
}


#if 0

//______________________________________________________________________________

// NOTE: this specialization sits inside the surrounding `#if 0` section and is
// therefore not compiled.
// Wraps the three device buffers returned by CopyTensorBatches() into an input
// tensor, an output tensor and a weight tensor, increments fBatchIndex and
// returns them as a TTensorBatch.
template <>

TTensorBatch<TCudnn<float> > TTensorDataLoader<TensorInput, TCudnn<float> >::GetTensorBatch()

{

   // Get buffer tuple on device that contains the data

   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();


   // Shapes handed to the output and weight tensors
   // (presumably in N, C, H, W order - TODO confirm against TCudaTensor).

   std::vector<size_t> outputShape  {fBatchSize, 1, fNOutputFeatures, 1};

   std::vector<size_t> wheightShape {fBatchSize, 1, 1, 1};

   // The input is returned as a vector holding a single tensor.

   std::vector<TCudaTensor<float> > inputTensor(1, TCudaTensor<float>(std::get<0>(DeviceBuffers),

                                                this->GetTensorDim(),  fInputShape));

   TCudaTensor<float> outputMatrix(std::get<1>(DeviceBuffers), this->GetTensorDim(), outputShape);

   TCudaTensor<float> weightMatrix(std::get<2>(DeviceBuffers), this->GetTensorDim(), wheightShape);


   fBatchIndex++;

   return TTensorBatch<TCudnn<float> >(inputTensor, outputMatrix, weightMatrix);

}


//______________________________________________________________________________

// NOTE: this specialization sits inside the surrounding `#if 0` section and is
// therefore not compiled.
// Double-precision counterpart: wraps the three device buffers returned by
// CopyTensorBatches() into an input tensor, an output tensor and a weight
// tensor, increments fBatchIndex and returns them as a TTensorBatch.
template <>

TTensorBatch<TCudnn<double> > TTensorDataLoader<TensorInput, TCudnn<double> >::GetTensorBatch()

{

   // Get buffer tuple on device that contains the data

   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();


   // Shapes handed to the output and weight tensors
   // (presumably in N, C, H, W order - TODO confirm against TCudaTensor).

   std::vector<size_t> outputShape  {fBatchSize, 1, fNOutputFeatures, 1};

   std::vector<size_t> wheightShape {fBatchSize, 1, 1, 1};

   // The input is returned as a vector holding a single tensor.

   std::vector<TCudaTensor<double> > inputTensor(1, TCudaTensor<double>(std::get<0>(DeviceBuffers),

                                                 this->GetTensorDim(),  fInputShape));

   TCudaTensor<double> outputMatrix(std::get<1>(DeviceBuffers), this->GetTensorDim(), outputShape);

   TCudaTensor<double> weightMatrix(std::get<2>(DeviceBuffers), this->GetTensorDim(), wheightShape);


   fBatchIndex++;

   return TTensorBatch<TCudnn<double> >(inputTensor, outputMatrix, weightMatrix);

}


//______________________________________________________________________________

// NOTE: this specialization sits inside the surrounding `#if 0` section and is
// therefore not compiled.
// TMVAInput_t / float variant: wraps the three device buffers returned by
// CopyTensorBatches() into an input tensor, an output tensor and a weight
// tensor, increments fBatchIndex and returns them as a TTensorBatch.
template <>

TTensorBatch<TCudnn<float> > TTensorDataLoader<TMVAInput_t, TCudnn<float> >::GetTensorBatch()

{

   // Get buffer tuple on device that contains the data

   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();


   // Shapes handed to the output and weight tensors
   // (presumably in N, C, H, W order - TODO confirm against TCudaTensor).

   std::vector<size_t> outputShape  {fBatchSize, 1, fNOutputFeatures, 1};

   std::vector<size_t> wheightShape {fBatchSize, 1, 1, 1};

   // The input is returned as a vector holding a single tensor.

   std::vector<TCudaTensor<float> > inputTensor(1, TCudaTensor<float>(std::get<0>(DeviceBuffers),

                                                this->GetTensorDim(),  fInputShape));

   TCudaTensor<float> outputMatrix(std::get<1>(DeviceBuffers), this->GetTensorDim(), outputShape);

   TCudaTensor<float> weightMatrix(std::get<2>(DeviceBuffers), this->GetTensorDim(), wheightShape);


   fBatchIndex++;

   return TTensorBatch<TCudnn<float> >(inputTensor, outputMatrix, weightMatrix);

}


//______________________________________________________________________________

// NOTE: this specialization sits inside the surrounding `#if 0` section and is
// therefore not compiled.
// TMVAInput_t / double variant: wraps the three device buffers returned by
// CopyTensorBatches() into an input tensor, an output tensor and a weight
// tensor, increments fBatchIndex and returns them as a TTensorBatch.
template <>

TTensorBatch<TCudnn<double> > TTensorDataLoader<TMVAInput_t, TCudnn<double> >::GetTensorBatch()

{

   // Get buffer tuple on device that contains the data

   DeviceBufferTuple DeviceBuffers = CopyTensorBatches();


   // Shapes handed to the output and weight tensors
   // (presumably in N, C, H, W order - TODO confirm against TCudaTensor).

   std::vector<size_t> outputShape  {fBatchSize, 1, fNOutputFeatures, 1};

   std::vector<size_t> wheightShape {fBatchSize, 1, 1, 1};

   // The input is returned as a vector holding a single tensor.

   std::vector<TCudaTensor<double> > inputTensor(1, TCudaTensor<double>(std::get<0>(DeviceBuffers),

                                                 this->GetTensorDim(),  fInputShape));

   // NOTE(review): the other GetTensorBatch specializations in this disabled
   // section pass this->GetTensorDim() as the second constructor argument for the
   // output and weight tensors; here `fNOutputFeatures + 2` and the literal `3`
   // are passed instead. Confirm which is intended before re-enabling this code.

   TCudaTensor<double> outputMatrix(std::get<1>(DeviceBuffers), fNOutputFeatures + 2, outputShape);

   TCudaTensor<double> weightMatrix(std::get<2>(DeviceBuffers), 3, wheightShape);


   fBatchIndex++;

   return TTensorBatch<TCudnn<double> >(inputTensor, outputMatrix, weightMatrix);

}

#endif


//______________________________________________________________________________

// Explicit Instantiations.
// The data loader is instantiated for both input kinds (TensorInput and
// TMVAInput_t) combined with both cuDNN precisions (float and double), matching
// the member specializations defined above in this file.


template class TTensorDataLoader<TensorInput, TCudnn<float> >;

template class TTensorDataLoader<TMVAInput_t, TCudnn<float> >;

template class TTensorDataLoader<TensorInput, TCudnn<double> >;

template class TTensorDataLoader<TMVAInput_t, TCudnn<double> >;


} // namespace DNN

} // namespace TMVA

CudaBuffers.h

DataSetInfo.h

Float_t
float Float_t
Float 4 bytes (float).
Definition RtypesCore.h:71

Double_t
double Double_t
Double 8 bytes.
Definition RtypesCore.h:73

TCudnn.h

Error
Error("WriteTObject","The current directory (%s) is not associated with a file. The object (%s) has not been written.", GetName(), objname)

R__ASSERT
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125

TensorDataLoader.h

TMVA::DNN::TCudaHostBuffer
TCudaHostBuffer.
Definition CudaBuffers.h:43

TMVA::DNN::TCudaHostBuffer::GetSize
size_t GetSize() const
Definition CudaBuffers.h:85

TMVA::DNN::TCudaTensor
TCudaTensor Class.
Definition CudaTensor.h:84

TMVA::DNN::TTensorBatch
TTensorBatch.
Definition TensorDataLoader.h:59

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::weightMatrix
TMatrixT< AReal > weightMatrix
The matrix used to keep the batch weights.
Definition TensorDataLoader.h:65

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::TTensorDataLoader
TTensorDataLoader(const AData &data, size_t nSamples, size_t batchDepth, size_t batchHeight, size_t batchWidth, size_t nOutputFeatures, std::vector< size_t > inputShape, size_t nStreams=1)
Constructor.
Definition TensorDataLoader.h:109

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::fBatchHeight
size_t fBatchHeight
The number of rows in each matrix.
Definition TensorDataLoader.h:56

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::fData
const AData & fData
The data that should be loaded in the batches.
Definition TensorDataLoader.h:51

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::inputTensor
std::vector< TMatrixT< AReal > > inputTensor
The 3D tensor used to keep the input data.
Definition TensorDataLoader.h:63

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::CopyTensorInput
void CopyTensorInput(std::vector< TMatrixT< AReal > > &tensor, IndexIterator_t sampleIterator)
Copy input tensor into the given host buffer.

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::outputMatrix
TMatrixT< AReal > outputMatrix
The matrix used to keep the output.
Definition TensorDataLoader.h:64

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::fBatchDepth
size_t fBatchDepth
The number of matrices in the tensor.
Definition TensorDataLoader.h:55

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::fBatchWidth
size_t fBatchWidth
The number of columns in each matrix.
Definition TensorDataLoader.h:57

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::CopyTensorWeights
void CopyTensorWeights(TMatrixT< AReal > &matrix, IndexIterator_t sampleIterator)
Copy weight matrix into the given host buffer.

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::CopyTensorOutput
void CopyTensorOutput(TMatrixT< AReal > &matrix, IndexIterator_t sampleIterator)
Copy output matrix into the given host buffer.

TMVA::DNN::TTensorDataLoader
TTensorDataLoader.
Definition TensorDataLoader.h:133

TMVA::DNN::TTensorDataLoader< AData, TReference< AReal > >::fBatchSize
size_t fBatchSize
Definition TensorDataLoader.h:144

TMVA::DataSetInfo
Class that contains all the data information.
Definition DataSetInfo.h:62

TMVA::DataSetInfo::IsSignal
Bool_t IsSignal(const Event *ev) const
Definition DataSetInfo.cxx:167

TMVA::Event
Definition Event.h:51

TMatrixT
TMatrixT.
Definition TMatrixT.h:40

n
const Int_t n
Definition legend1.C:16

TMVA::DNN
Definition Adadelta.h:36

TMVA::DNN::IndexIterator_t
typename std::vector< size_t >::iterator IndexIterator_t
Definition DataLoader.h:42

TMVA
create variable transformations
Definition GeneticMinimizer.h:22