DataLoader.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 08/08/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

/////////////////////////////////////////////////////////////////////
// Generic data loader for neural network input data. Provides a   //
// high level abstraction for the transfer of training data to the //
// device.                                                          //
/////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_DATALOADER
#define TMVA_DNN_DATALOADER

#include "TMatrix.h"
#include <vector>
#include <iostream>
#include <algorithm>

#include "TMVA/Event.h"

namespace TMVA {
namespace DNN {

//
// Input Data Types
//______________________________________________________________________________
using MatrixInput_t = std::pair<const TMatrixT<Double_t> &,
                                const TMatrixT<Double_t> &>;
using TMVAInput_t   = std::vector<Event*>;

using IndexIterator_t = typename std::vector<size_t>::iterator;

/** TBatch
 *
 * Class representing training batches consisting of a matrix of input data
 * and a matrix of output data. The input and output data can be accessed using
 * the GetInput() and GetOutput() member functions.
 *
 * \tparam AArchitecture The underlying architecture.
 */
//______________________________________________________________________________
template <typename AArchitecture>
class TBatch
{
private:

   using Matrix_t = typename AArchitecture::Matrix_t;

   Matrix_t fInputMatrix;
   Matrix_t fOutputMatrix;

public:

   TBatch(Matrix_t &, Matrix_t &);
   TBatch(const TBatch  &) = default;
   TBatch(      TBatch &&) = default;
   TBatch & operator=(const TBatch  &) = default;
   TBatch & operator=(      TBatch &&) = default;

   /** Return the matrix representing the input data. */
   Matrix_t & GetInput()  {return fInputMatrix;}
   /** Return the matrix representing the output data. */
   Matrix_t & GetOutput() {return fOutputMatrix;}
};
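
// Usage sketch (illustrative, not part of the original header): given a batch
// obtained from a TDataLoader (see below), the two accessors expose the device
// matrices. `Architecture_t` and `loader` are assumptions standing in for a
// concrete backend class and a TDataLoader instance.
//
//    TBatch<Architecture_t> batch = loader.GetBatch();
//    auto &input  = batch.GetInput();   // batchSize x nInputFeatures matrix
//    auto &output = batch.GetOutput();  // batchSize x nOutputFeatures matrix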

template<typename Data_t, typename AArchitecture> class TDataLoader;

/** TBatchIterator
 *
 * Simple iterator class for iterating over the training batches in
 * a given data set represented by a TDataLoader object.
 *
 * \tparam Data_t The input data type.
 * \tparam AArchitecture The underlying architecture type.
 */
template<typename Data_t, typename AArchitecture>
class TBatchIterator
{
private:

   TDataLoader<Data_t, AArchitecture> & fDataLoader;
   size_t fBatchIndex;

public:

   TBatchIterator(TDataLoader<Data_t, AArchitecture> & dataLoader, size_t index = 0)
      : fDataLoader(dataLoader), fBatchIndex(index)
   {
      // Nothing to do here.
   }

   TBatch<AArchitecture> operator*() {return fDataLoader.GetBatch();}
   TBatchIterator operator++() {fBatchIndex++; return *this;}
   bool operator!=(const TBatchIterator & other) {
      return fBatchIndex != other.fBatchIndex;
   }
};
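
// Usage sketch (illustrative): TDataLoader::begin() and TDataLoader::end()
// return TBatchIterator objects, so one epoch can be traversed with a
// range-based for loop; `dataLoader` is a hypothetical TDataLoader instance.
//
//    for (auto batch : dataLoader) {
//       // batch is a TBatch<AArchitecture>; use GetInput() / GetOutput().
//    }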

/** TDataLoader
 *
 * Service class managing the streaming of the training data from the input data
 * type to the accelerator device or the CPU. A TDataLoader object manages a number
 * of host and device buffer pairs that are used in a round-robin manner for the
 * transfer of batches to the device.
 *
 * Each TDataLoader object has an associated batch size and a total number of
 * samples in the dataset. One epoch consists of the batches required to transfer
 * the complete training set. The begin() and end() member functions allow the
 * user to iterate over the batches of one epoch; a usage sketch follows the
 * class definition below.
 *
 * \tparam Data_t The input data type.
 * \tparam AArchitecture The architecture class of the underlying architecture.
 */
template<typename Data_t, typename AArchitecture>
class TDataLoader
{
private:

   using HostBuffer_t    = typename AArchitecture::HostBuffer_t;
   using DeviceBuffer_t  = typename AArchitecture::DeviceBuffer_t;
   using Matrix_t        = typename AArchitecture::Matrix_t;
   using BatchIterator_t = TBatchIterator<Data_t, AArchitecture>;

   const Data_t & fData;

   size_t fNSamples;
   size_t fBatchSize;
   size_t fNInputFeatures;
   size_t fNOutputFeatures;
   size_t fBatchIndex;

   size_t fNStreams;                           ///< Number of buffer pairs.
   std::vector<DeviceBuffer_t> fDeviceBuffers;
   std::vector<HostBuffer_t>   fHostBuffers;

   std::vector<size_t> fSampleIndices; ///< Ordering of the samples in the epoch.

public:

   TDataLoader(const Data_t & data, size_t nSamples, size_t batchSize,
               size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams = 1);
   TDataLoader(const TDataLoader  &) = default;
   TDataLoader(      TDataLoader &&) = default;
   TDataLoader & operator=(const TDataLoader  &) = default;
   TDataLoader & operator=(      TDataLoader &&) = default;

   /** Copy input matrix into the given host buffer. Function to be specialized
    *  by the architecture-specific backend. */
   void CopyInput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);
   /** Copy output matrix into the given host buffer. Function to be specialized
    *  by the architecture-specific backend. */
   void CopyOutput(HostBuffer_t &buffer, IndexIterator_t begin, size_t batchSize);

   BatchIterator_t begin() {return TBatchIterator<Data_t, AArchitecture>(*this);}
   BatchIterator_t end()
   {
      return TBatchIterator<Data_t, AArchitecture>(*this, fNSamples / fBatchSize);
   }

   /** Shuffle the order of the samples in the training set. The shuffling is
    *  indirect, i.e. only the indices are shuffled. No input data is moved by
    *  this routine. */
   void Shuffle();

   /** Return the next batch from the training set. The TDataLoader object
    *  keeps an internal counter that cycles over the batches in the training
    *  set. */
   TBatch<AArchitecture> GetBatch();

};
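
// Usage sketch (illustrative, under assumptions): `Architecture_t` stands for a
// concrete backend providing Matrix_t, HostBuffer_t and DeviceBuffer_t, and
// CopyInput / CopyOutput are assumed to be specialized for the
// <MatrixInput_t, Architecture_t> pair by that backend. The `net` object and its
// methods are hypothetical and stand for whatever consumes the batches.
//
//    TMatrixT<Double_t> X(nSamples, nInputFeatures);   // input features
//    TMatrixT<Double_t> Y(nSamples, nOutputFeatures);  // targets
//    MatrixInput_t trainingData(X, Y);
//
//    TDataLoader<MatrixInput_t, Architecture_t> loader(
//       trainingData, nSamples, batchSize, nInputFeatures, nOutputFeatures);
//
//    for (size_t epoch = 0; epoch < nEpochs; epoch++) {
//       loader.Shuffle();                  // reshuffle the sample indices
//       for (auto batch : loader) {        // one pass over all batches
//          net.Forward(batch.GetInput());
//          net.Backward(batch.GetInput(), batch.GetOutput());
//       }
//    }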

//
// TBatch Class.
//______________________________________________________________________________
template<typename AArchitecture>
TBatch<AArchitecture>::TBatch(Matrix_t & inputMatrix, Matrix_t & outputMatrix)
   : fInputMatrix(inputMatrix), fOutputMatrix(outputMatrix)
{
   // Nothing to do here.
}

//
// TDataLoader Class.
//______________________________________________________________________________
template<typename Data_t, typename AArchitecture>
TDataLoader<Data_t, AArchitecture>::TDataLoader(
   const Data_t & data, size_t nSamples, size_t batchSize,
   size_t nInputFeatures, size_t nOutputFeatures, size_t nStreams)
   : fData(data), fNSamples(nSamples), fBatchSize(batchSize),
     fNInputFeatures(nInputFeatures), fNOutputFeatures(nOutputFeatures),
     fBatchIndex(0), fNStreams(nStreams), fDeviceBuffers(), fHostBuffers(),
     fSampleIndices()
{
   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;

   // Allocate one host/device buffer pair per stream, each large enough to
   // hold both the input and the output matrix of a batch.
   for (size_t i = 0; i < fNStreams; i++)
   {
      fHostBuffers.push_back(HostBuffer_t(inputMatrixSize + outputMatrixSize));
      fDeviceBuffers.push_back(DeviceBuffer_t(inputMatrixSize + outputMatrixSize));
   }

   fSampleIndices.reserve(fNSamples);
   for (size_t i = 0; i < fNSamples; i++) {
      fSampleIndices.push_back(i);
   }
}

//______________________________________________________________________________
template<typename Data_t, typename AArchitecture>
TBatch<AArchitecture> TDataLoader<Data_t, AArchitecture>::GetBatch()
{
   fBatchIndex %= (fNSamples / fBatchSize); // Cycle through the batches of one epoch.

   size_t inputMatrixSize  = fBatchSize * fNInputFeatures;
   size_t outputMatrixSize = fBatchSize * fNOutputFeatures;

   // Pick the host/device buffer pair for this batch in round-robin fashion.
   size_t streamIndex = fBatchIndex % fNStreams;
   HostBuffer_t   & hostBuffer   = fHostBuffers[streamIndex];
   DeviceBuffer_t & deviceBuffer = fDeviceBuffers[streamIndex];

   HostBuffer_t inputHostBuffer  = hostBuffer.GetSubBuffer(0, inputMatrixSize);
   HostBuffer_t outputHostBuffer = hostBuffer.GetSubBuffer(inputMatrixSize,
                                                           outputMatrixSize);

   DeviceBuffer_t inputDeviceBuffer  = deviceBuffer.GetSubBuffer(0, inputMatrixSize);
   DeviceBuffer_t outputDeviceBuffer = deviceBuffer.GetSubBuffer(inputMatrixSize,
                                                                 outputMatrixSize);
   size_t sampleIndex = fBatchIndex * fBatchSize;
   IndexIterator_t sampleIndexIterator = fSampleIndices.begin() + sampleIndex;

   // Fill the host buffer and transfer it to the device in a single copy.
   CopyInput(inputHostBuffer, sampleIndexIterator, fBatchSize);
   CopyOutput(outputHostBuffer, sampleIndexIterator, fBatchSize);

   deviceBuffer.CopyFrom(hostBuffer);
   Matrix_t inputMatrix(inputDeviceBuffer, fBatchSize, fNInputFeatures);
   Matrix_t outputMatrix(outputDeviceBuffer, fBatchSize, fNOutputFeatures);

   fBatchIndex++;
   return TBatch<AArchitecture>(inputMatrix, outputMatrix);
}

//______________________________________________________________________________
template<typename Data_t, typename AArchitecture>
void TDataLoader<Data_t, AArchitecture>::Shuffle()
{
   std::random_shuffle(fSampleIndices.begin(), fSampleIndices.end());
}
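
// Note: std::random_shuffle is deprecated since C++14 and removed in C++17.
// A minimal equivalent sketch using std::shuffle (requires <random>):
//
//    std::random_device rd;
//    std::mt19937 g(rd());
//    std::shuffle(fSampleIndices.begin(), fSampleIndices.end(), g);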

} // namespace DNN
} // namespace TMVA

#endif