15#ifndef ROOT_INTERNAL_ML_RCLUSTERLOADER
16#define ROOT_INTERNAL_ML_RCLUSTERLOADER
47 static_cast<std::size_t
>(
end -
start)};
148template <
typename... Args>
151 std::vector<ROOT::RDF::RNode> &
fRdfs;
194 if (!
rdf.GetFilterNames().empty()) {
204 for (std::size_t rdfIdx = 0; rdfIdx <
fRdfs.size(); ++rdfIdx) {
207 auto numEntries =
r.second -
r.first;
220 throw std::runtime_error(
"RClusterLoader::SplitDataset: no clusters found.");
232 const std::size_t
sz =
c.GetNumEntries();
285 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for training after split. "
286 "Reduce validation_split.");
289 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for validation after split. "
290 "Increase validation_split.");
346 std::vector<ROOT::RDF::RResultPtr<ULong64_t>>
counts;
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t dest
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Functor invoked by RDataFrame::Foreach to fill one row of an RFlat2DMatrix.
std::size_t fNumChunkCols
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of vectors.
RFlat2DMatrix & fChunkTensor
RClusterLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i, std::size_t rowOffset=0)
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of scalar values.
void operator()(const ColTypes &...cols)
std::vector< std::size_t > fMaxVecSizes
Loads TTree/RNTuple clusters from one or more RDataFrames into RFlat2DMatrix buffers for ML training ...
std::size_t GetNumValidationClusters() const
std::size_t fNumValidationEntries
void ShuffleTrainingClusters(std::size_t epochIdx)
Re-order training clusters for the upcoming epoch.
void FinaliseSplitDiscovery()
Mark the train/val split as finalised after the first epoch.
std::size_t fAccumulatedFilteredForTrain
std::size_t fNumTrainingEntries
void LoadClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
std::vector< std::size_t > fRdfSizes
void ShuffleValidationClusters(std::size_t epochIdx)
Re-order validation clusters for the upcoming epoch.
std::size_t fNumChunkCols
std::size_t LoadTrainingClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one training cluster and return the number of rows written.
std::size_t GetNumTrainingClusters() const
std::vector< RClusterRange > fAllClusters
void SplitDataset()
Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames,...
void LoadValidationClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one validation cluster into dest starting at rowOffset.
const std::vector< RClusterRange > & GetTrainingClusters() const
std::size_t fTotalEntries
std::size_t GetNmTotalClusters() const
RClusterLoader(std::vector< ROOT::RDF::RNode > &rdfs, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes, float vecPadding, float validationSplit, bool shuffle, std::size_t setSeed)
const std::vector< RClusterRange > & GetValidationClusters() const
bool IsSplitDiscovered() const
std::size_t GetNumValidationEntries() const
std::size_t GetNumTrainingEntries() const
std::vector< std::size_t > fVecSizes
std::vector< RClusterRange > fValidationClusters
std::vector< ROOT::RDF::RNode > & fRdfs
std::vector< std::string > fCols
std::size_t GetNumChunkCols() const
std::vector< RClusterRange > fTrainingClusters
The public interface to the RDataFrame federation of classes.
const_iterator begin() const
const_iterator end() const
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
unsigned int RunGraphs(std::vector< RResultHandle > handles)
Run the event loops of multiple RDataFrames concurrently.
Describes a contiguous range of entries within a single RDataFrame, corresponding to one TTree/RNTuple...
std::size_t GetNumEntries() const
void SetNumEntries(std::size_t num)
Wrapper around ROOT::RVec<float> representing a 2D matrix.