Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
ROOT::Experimental::Internal::ML::RClusterLoader< Args > Class Template Reference

template<typename... Args>
class ROOT::Experimental::Internal::ML::RClusterLoader< Args >

Loads TTree/RNTuple clusters from one or more RDataFrames into RFlat2DMatrix buffers for ML training and validation.

Overview

At construction the loader scans the cluster boundaries of every provided RDataFrame and stores them as a flat list of RClusterRange objects. SplitDataset() then partitions those ranges into training and validation sets according to validationSplit.

The split strategy depends on whether shuffling is enabled:

  • Unshuffled: one cut is made so that the first (1 - validationSplit) fraction of entries goes to training. At most one cluster is split at the boundary.
  • Shuffled: each cluster is split proportionally (according to validationSplit) so both sets draw entries from every part of the dataset. ShuffleTrainingClusters() and ShuffleValidationClusters() re-order the cluster lists at the start of each epoch. A second shuffling step, at the entries level, happens inside LoadTrainingClusterInto() and LoadValidationClusterInto() when loading the data into the tensors.

Filtered RDataFrames

When any RDataFrame carries a filter, the true entry count is not known until the computation graph is executed. In this case SplitDataset() is a no-op and the split is discovered lazily inside LoadTrainingClusterInto() during the first epoch. After the first epoch FinaliseSplitDiscovery() marks the split as stable and all subsequent epochs use the same pre-computed ranges.

Definition at line 149 of file RClusterLoader.hxx.

Public Member Functions

 RClusterLoader (std::vector< ROOT::RDF::RNode > &rdfs, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes, float vecPadding, float validationSplit, bool shuffle, std::size_t setSeed)
 
void FinaliseSplitDiscovery ()
 Mark the train/val split as finalised after the first epoch.
 
std::size_t GetNmTotalClusters () const
 
std::size_t GetNumChunkCols () const
 
std::size_t GetNumTrainingClusters () const
 
std::size_t GetNumTrainingEntries () const
 
std::size_t GetNumValidationClusters () const
 
std::size_t GetNumValidationEntries () const
 
const std::vector< RClusterRange > & GetTrainingClusters () const
 
const std::vector< RClusterRange > & GetValidationClusters () const
 
bool IsSplitDiscovered () const
 
void LoadClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
 
std::size_t LoadTrainingClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
 Load one training cluster and return the number of rows written.
 
void LoadValidationClusterInto (RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
 Load one validation cluster into dest starting at rowOffset.
 
void ShuffleTrainingClusters (std::size_t epochIdx)
 Re-order training clusters for the upcoming epoch.
 
void ShuffleValidationClusters (std::size_t epochIdx)
 Re-order validation clusters for the upcoming epoch.
 
void SplitDataset ()
 Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames: the split is discovered lazily during the first epoch.
 

Private Attributes

std::size_t fAccumulatedFilteredForTrain {0}
 
std::vector< RClusterRange > fAllClusters
 
std::vector< std::string > fCols
 
bool fIsFiltered {false}
 
std::size_t fNumChunkCols
 
std::size_t fNumCols
 
std::size_t fNumTrainingEntries {0}
 
std::size_t fNumValidationEntries {0}
 
std::vector< ROOT::RDF::RNode > & fRdfs
 
std::vector< std::size_t > fRdfSizes
 
std::size_t fSetSeed
 
bool fShuffle
 
bool fSplitDiscovered {false}
 
std::size_t fSumVecSizes
 
std::size_t fTotalEntries {0}
 
std::vector< RClusterRange > fTrainingClusters
 
std::vector< RClusterRange > fValidationClusters
 
float fValidationSplit
 
float fVecPadding
 
std::vector< std::size_t > fVecSizes
 

#include <ROOT/ML/RClusterLoader.hxx>

Constructor & Destructor Documentation

◆ RClusterLoader()

template<typename... Args>
ROOT::Experimental::Internal::ML::RClusterLoader< Args >::RClusterLoader ( std::vector< ROOT::RDF::RNode > & rdfs,
const std::vector< std::string > & cols,
const std::vector< std::size_t > & vecSizes,
float vecPadding,
float validationSplit,
bool shuffle,
std::size_t setSeed )
inline

Definition at line 177 of file RClusterLoader.hxx.

Member Function Documentation

◆ FinaliseSplitDiscovery()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::FinaliseSplitDiscovery ( )
inline

Mark the train/val split as finalised after the first epoch.

Definition at line 422 of file RClusterLoader.hxx.

◆ GetNmTotalClusters()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNmTotalClusters ( ) const
inline

Definition at line 447 of file RClusterLoader.hxx.

◆ GetNumChunkCols()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumChunkCols ( ) const
inline

Definition at line 434 of file RClusterLoader.hxx.

◆ GetNumTrainingClusters()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumTrainingClusters ( ) const
inline

Definition at line 442 of file RClusterLoader.hxx.

◆ GetNumTrainingEntries()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumTrainingEntries ( ) const
inline

Definition at line 432 of file RClusterLoader.hxx.

◆ GetNumValidationClusters()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumValidationClusters ( ) const
inline

Definition at line 446 of file RClusterLoader.hxx.

◆ GetNumValidationEntries()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetNumValidationEntries ( ) const
inline

Definition at line 433 of file RClusterLoader.hxx.

◆ GetTrainingClusters()

template<typename... Args>
const std::vector< RClusterRange > & ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetTrainingClusters ( ) const
inline

Definition at line 436 of file RClusterLoader.hxx.

◆ GetValidationClusters()

template<typename... Args>
const std::vector< RClusterRange > & ROOT::Experimental::Internal::ML::RClusterLoader< Args >::GetValidationClusters ( ) const
inline

Definition at line 440 of file RClusterLoader.hxx.

◆ IsSplitDiscovered()

template<typename... Args>
bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::IsSplitDiscovered ( ) const
inline

Definition at line 428 of file RClusterLoader.hxx.

◆ LoadClusterInto()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadClusterInto ( RFlat2DMatrix & dest,
std::size_t rdfIdx,
std::uint64_t startRow,
std::uint64_t endRow,
std::size_t rowOffset = 0 )
inline

Definition at line 316 of file RClusterLoader.hxx.

◆ LoadTrainingClusterInto()

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadTrainingClusterInto ( RFlat2DMatrix & dest,
std::size_t rdfIdx,
std::uint64_t startRow,
std::uint64_t endRow,
std::size_t rowOffset = 0 )
inline

Load one training cluster and return the number of rows written.

Unfiltered: delegates directly to LoadClusterInto(). Filtered, epoch 1 (!fSplitDiscovered):

  • On the first call, Count() is called across all RDFs to obtain the total filtered entry count; fNumTrainingEntries and fNumValidationEntries are then set as targets.
  • A single Foreach on the full raw cluster range loads data and captures rdfentry_ simultaneously. The real train/val boundary is computed from the accumulated filtered count vs the target, then the train sub-range is pushed to fTrainingClusters and the val sub-range to fValidationClusters.
  • Only the train rows are written into dest.

All subsequent epochs: delegates directly to LoadClusterInto().

Definition at line 340 of file RClusterLoader.hxx.

◆ LoadValidationClusterInto()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::LoadValidationClusterInto ( RFlat2DMatrix & dest,
std::size_t rdfIdx,
std::uint64_t startRow,
std::uint64_t endRow,
std::size_t rowOffset = 0 )
inline

Load one validation cluster into dest starting at rowOffset.

Definition at line 414 of file RClusterLoader.hxx.

◆ ShuffleTrainingClusters()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::ShuffleTrainingClusters ( std::size_t epochIdx)
inline

Re-order training clusters for the upcoming epoch.

Definition at line 295 of file RClusterLoader.hxx.

◆ ShuffleValidationClusters()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::ShuffleValidationClusters ( std::size_t epochIdx)
inline

Re-order validation clusters for the upcoming epoch.

Definition at line 307 of file RClusterLoader.hxx.

◆ SplitDataset()

template<typename... Args>
void ROOT::Experimental::Internal::ML::RClusterLoader< Args >::SplitDataset ( )
inline

Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames: the split is discovered lazily during the first epoch.

Definition at line 217 of file RClusterLoader.hxx.

Member Data Documentation

◆ fAccumulatedFilteredForTrain

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fAccumulatedFilteredForTrain {0}
private

Definition at line 174 of file RClusterLoader.hxx.

◆ fAllClusters

template<typename... Args>
std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fAllClusters
private

Definition at line 164 of file RClusterLoader.hxx.

◆ fCols

template<typename... Args>
std::vector<std::string> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fCols
private

Definition at line 153 of file RClusterLoader.hxx.

◆ fIsFiltered

template<typename... Args>
bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fIsFiltered {false}
private

Definition at line 172 of file RClusterLoader.hxx.

◆ fNumChunkCols

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumChunkCols
private

Definition at line 162 of file RClusterLoader.hxx.

◆ fNumCols

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumCols
private

Definition at line 160 of file RClusterLoader.hxx.

◆ fNumTrainingEntries

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumTrainingEntries {0}
private

Definition at line 169 of file RClusterLoader.hxx.

◆ fNumValidationEntries

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fNumValidationEntries {0}
private

Definition at line 170 of file RClusterLoader.hxx.

◆ fRdfs

template<typename... Args>
std::vector<ROOT::RDF::RNode>& ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fRdfs
private

Definition at line 151 of file RClusterLoader.hxx.

◆ fRdfSizes

template<typename... Args>
std::vector<std::size_t> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fRdfSizes
private

Definition at line 152 of file RClusterLoader.hxx.

◆ fSetSeed

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSetSeed
private

Definition at line 158 of file RClusterLoader.hxx.

◆ fShuffle

template<typename... Args>
bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fShuffle
private

Definition at line 157 of file RClusterLoader.hxx.

◆ fSplitDiscovered

template<typename... Args>
bool ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSplitDiscovered {false}
private

Definition at line 173 of file RClusterLoader.hxx.

◆ fSumVecSizes

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fSumVecSizes
private

Definition at line 161 of file RClusterLoader.hxx.

◆ fTotalEntries

template<typename... Args>
std::size_t ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fTotalEntries {0}
private

Definition at line 168 of file RClusterLoader.hxx.

◆ fTrainingClusters

template<typename... Args>
std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fTrainingClusters
private

Definition at line 165 of file RClusterLoader.hxx.

◆ fValidationClusters

template<typename... Args>
std::vector<RClusterRange> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fValidationClusters
private

Definition at line 166 of file RClusterLoader.hxx.

◆ fValidationSplit

template<typename... Args>
float ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fValidationSplit
private

Definition at line 156 of file RClusterLoader.hxx.

◆ fVecPadding

template<typename... Args>
float ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fVecPadding
private

Definition at line 155 of file RClusterLoader.hxx.

◆ fVecSizes

template<typename... Args>
std::vector<std::size_t> ROOT::Experimental::Internal::ML::RClusterLoader< Args >::fVecSizes
private

Definition at line 154 of file RClusterLoader.hxx.

  • tree/ml/inc/ROOT/ML/RClusterLoader.hxx