15#ifndef ROOT_INTERNAL_ML_RCLUSTERLOADER
16#define ROOT_INTERNAL_ML_RCLUSTERLOADER
47 static_cast<std::size_t
>(
end -
start)};
59template <
typename... ColTypes>
74 template <typename T, std::enable_if_t<ROOT::Internal::RDF::IsDataContainer<T>::value,
int> = 0>
78 std::size_t vec_size =
vec.size();
81 if (vec_size < max_vec_size)
83 std::copy(
vec.begin(),
vec.end(), dst);
84 std::fill(dst + vec_size, dst + max_vec_size,
fVecPadding);
87 std::copy(
vec.begin(),
vec.begin() + max_vec_size, dst);
94 template <typename T, std::enable_if_t<!ROOT::Internal::RDF::IsDataContainer<T>::value,
int> = 0>
103 const std::vector<std::size_t> &maxVecSizes,
float vecPadding,
int i,
104 std::size_t rowOffset = 0)
110 fOffset(rowOffset * numColumns)
148template <
typename... Args>
151 std::vector<ROOT::RDF::RNode> &
fRdfs;
177 RClusterLoader(std::vector<ROOT::RDF::RNode> &rdfs,
const std::vector<std::string> &cols,
178 const std::vector<std::size_t> &vecSizes,
float vecPadding,
float validationSplit,
bool shuffle,
192 for (
auto &rdf :
fRdfs) {
194 if (!rdf.GetFilterNames().empty()) {
204 for (std::size_t rdfIdx = 0; rdfIdx <
fRdfs.size(); ++rdfIdx) {
207 auto numEntries =
r.second -
r.first;
220 throw std::runtime_error(
"RClusterLoader::SplitDataset: no clusters found.");
232 const std::size_t sz =
c.GetNumEntries();
233 const std::size_t trainSz =
static_cast<std::size_t
>((1.0f -
fValidationSplit) * sz);
234 const std::size_t valSz = sz - trainSz;
237 fTrainingClusters.push_back({
c.rdfIdx,
c.start,
c.start +
static_cast<std::uint64_t
>(trainSz)});
252 std::size_t accumulated = 0;
253 std::size_t splitIdx = 0;
255 const std::size_t sz =
fAllClusters[splitIdx].GetNumEntries();
256 if (accumulated + sz > targetTraining) {
266 if (splitIdx <
fAllClusters.size() && accumulated < targetTraining) {
269 const std::uint64_t splitPoint = boundary.
start +
static_cast<std::uint64_t
>(targetTraining - accumulated);
285 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for training after split. "
286 "Reduce validation_split.");
289 throw std::runtime_error(
"RClusterLoader::SplitDataset: no entries for validation after split. "
290 "Increase validation_split.");
317 std::size_t rowOffset = 0)
341 std::uint64_t endRow, std::size_t rowOffset = 0)
346 std::vector<ROOT::RDF::RResultPtr<ULong64_t>> counts;
347 counts.reserve(
fRdfs.size());
348 for (
auto &rdf :
fRdfs) {
349 counts.push_back(rdf.Count());
353 std::size_t totalFiltered = 0;
354 for (
auto &
c : counts) {
355 totalFiltered +=
c.GetValue();
364 std::vector<ULong64_t> rdfEntries;
365 rdfEntries.reserve(endRow - startRow);
370 std::vector<std::string> colsWithEntry;
371 colsWithEntry.reserve(
fCols.size() + 1);
372 colsWithEntry.push_back(
"rdfentry_");
373 colsWithEntry.insert(colsWithEntry.end(),
fCols.begin(),
fCols.end());
376 [&](
ULong64_t entry,
const Args &...cols) {
377 rdfEntries.push_back(entry);
384 const std::size_t totalFiltered = rdfEntries.size();
385 if (totalFiltered == 0) {
388 std::sort(rdfEntries.begin(), rdfEntries.end());
391 const std::size_t trainCount =
392 std::min(
static_cast<std::size_t
>(totalFiltered * (1.0f -
fValidationSplit)), trainRemaining);
393 const std::size_t valCount = totalFiltered - trainCount;
397 const std::uint64_t boundary = (valCount > 0) ? rdfEntries[trainCount] : endRow;
409 return endRow - startRow;
415 std::size_t rowOffset = 0)
unsigned long long ULong64_t
Portable unsigned long integer (8 bytes).
Functor invoked by RDataFrame::Foreach to fill one row of an RFlat2DMatrix.
std::size_t fNumChunkCols
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of vectors.
RFlat2DMatrix & fChunkTensor
RClusterLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i, std::size_t rowOffset=0)
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into the current tensor when the column consists of scalar values.
void operator()(const ColTypes &...cols)
std::vector< std::size_t > fMaxVecSizes
std::size_t GetNumValidationClusters() const
std::size_t fNumValidationEntries
void ShuffleTrainingClusters(std::size_t epochIdx)
Re-order training clusters for the upcoming epoch.
void FinaliseSplitDiscovery()
Mark the train/val split as finalised after the first epoch.
std::size_t fAccumulatedFilteredForTrain
std::size_t fNumTrainingEntries
void LoadClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
std::vector< std::size_t > fRdfSizes
void ShuffleValidationClusters(std::size_t epochIdx)
Re-order validation clusters for the upcoming epoch.
std::size_t fNumChunkCols
std::size_t LoadTrainingClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one training cluster and return the number of rows written.
std::size_t GetNumTrainingClusters() const
std::vector< RClusterRange > fAllClusters
void SplitDataset()
Distribute the clusters into training and validation datasets. No-op for filtered RDataFrames,...
void LoadValidationClusterInto(RFlat2DMatrix &dest, std::size_t rdfIdx, std::uint64_t startRow, std::uint64_t endRow, std::size_t rowOffset=0)
Load one validation cluster into dest starting at rowOffset.
const std::vector< RClusterRange > & GetTrainingClusters() const
std::size_t fTotalEntries
std::size_t GetNmTotalClusters() const
RClusterLoader(std::vector< ROOT::RDF::RNode > &rdfs, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes, float vecPadding, float validationSplit, bool shuffle, std::size_t setSeed)
const std::vector< RClusterRange > & GetValidationClusters() const
bool IsSplitDiscovered() const
std::size_t GetNumValidationEntries() const
std::size_t GetNumTrainingEntries() const
std::vector< std::size_t > fVecSizes
std::vector< RClusterRange > fValidationClusters
std::vector< ROOT::RDF::RNode > & fRdfs
std::vector< std::string > fCols
std::size_t GetNumChunkCols() const
std::vector< RClusterRange > fTrainingClusters
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
unsigned int RunGraphs(std::vector< RResultHandle > handles)
Run the event loops of multiple RDataFrames concurrently.
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
Describes a contiguous range of entries within a single RDataFrame, corresponding to one TTree/RNTupl...
std::size_t GetNumEntries() const
void SetNumEntries(std::size_t num)
Wrapper around ROOT::RVec<float> representing a 2D matrix.