Logo ROOT  
Reference Guide
Loading...
Searching...
No Matches
RDatasetLoader.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026
2
3/*************************************************************************
4 * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_INTERNAL_ML_RDATASETLOADER
12#define ROOT_INTERNAL_ML_RDATASETLOADER
13
14#include <algorithm>
15#include <memory>
16#include <numeric>
17#include <string>
18#include <type_traits>
19#include <vector>
20
23#include "ROOT/RDataFrame.hxx"
24#include "ROOT/RDF/Utils.hxx"
25
27
28/**
29\class ROOT::Experimental::Internal::ML::RDatasetLoaderFunctor
30
31\brief Functor that copies the column values of each RDataFrame entry into the dataset tensor built by RDatasetLoader.
32*/
33
34template <typename... ColTypes>
36 std::size_t fOffset{};
37 std::size_t fVecSizeIdx{};
38 float fVecPadding{};
39 std::vector<std::size_t> fMaxVecSizes{};
41
42 std::size_t fNumDatasetCols;
43
44 int fI;
46
47 //////////////////////////////////////////////////////////////////////////
48 /// \brief Copy the content of a column into RTensor when the column consists of vectors
49 template <typename T, std::enable_if_t<ROOT::Internal::RDF::IsDataContainer<T>::value, int> = 0>
50 void AssignToTensor(const T &vec, int i, int numColumns)
51 {
52 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
53 std::size_t vec_size = vec.size();
54 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
55 {
56 std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]);
57 std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size],
58 &fDatasetTensor.GetData()[fOffset + numColumns * i + max_vec_size], fVecPadding);
59 } else // Copy only max_vec_size length from vector column
60 {
61 std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]);
62 }
63 fOffset += max_vec_size;
64 }
65
66 //////////////////////////////////////////////////////////////////////////
67 /// \brief Copy the content of a column into RTensor when the column consists of single values
68 template <typename T, std::enable_if_t<!ROOT::Internal::RDF::IsDataContainer<T>::value, int> = 0>
69 void AssignToTensor(const T &val, int i, int numColumns)
70 {
71 fDatasetTensor.GetData()[fOffset + numColumns * i] = val;
72 fOffset++;
73 }
74
75public:
76 RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns,
77 const std::vector<std::size_t> &maxVecSizes, float vecPadding, int i)
78 : fDatasetTensor(datasetTensor),
79 fMaxVecSizes(maxVecSizes),
80 fVecPadding(vecPadding),
81 fI(i),
82 fNumColumns(numColumns)
83 {
84 }
85
86 void operator()(const ColTypes &...cols)
87 {
88 fVecSizeIdx = 0;
89 (AssignToTensor(cols, fI, fNumColumns), ...);
90 }
91};
92
93/**
94\class ROOT::Experimental::Internal::ML::RDatasetLoader
95
96\brief Load the whole dataset into memory.
97
98In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and
99validation sets with the user-defined validation split fraction.
100*/
101
102template <typename... Args>
104private:
105 std::size_t fNumEntries;
107
std::vector<std::size_t> fVecSizes; ///< Number of elements kept per vector column, as passed to the constructor
std::size_t fSumVecSizes;           ///< Sum of all entries of fVecSizes
// Kept as float: the value comes from a float constructor argument and is handed on to
// RDatasetLoaderFunctor as a float. Storing it in an unsigned integer would truncate fractional
// padding values and cannot represent negative ones.
float fVecPadding;                  ///< Value used to pad vector columns shorter than their maximum size
std::size_t fNumDatasetCols;        ///< Number of columns of the dataset tensors
112
113 std::vector<RFlat2DMatrix> fTrainingDatasets;
114 std::vector<RFlat2DMatrix> fValidationDatasets;
115
118
121 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
122
123 std::vector<ROOT::RDF::RNode> f_rdfs;
124 std::vector<std::string> fCols;
125 std::size_t fNumCols;
126 std::size_t fSetSeed;
127
130
132
133public:
/// \brief Store the loader configuration and create the tensor operators.
/// \param[in] rdfs Dataframes to load; SplitDatasets() processes them one after the other
/// \param[in] validationSplit Fraction of the entries of each dataframe that goes into the validation set
/// \param[in] cols Names of the columns that are read from the dataframes
/// \param[in] vecSizes Number of elements kept per vector column
/// \param[in] vecPadding Value used to pad vector columns that are shorter than their entry in vecSizes
/// \param[in] shuffle Forwarded to RFlat2DMatrixOperators
/// \param[in] setSeed Forwarded to RFlat2DMatrixOperators
134 RDatasetLoader(const std::vector<ROOT::RDF::RNode> &rdfs, const float validationSplit,
135 const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
136 const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0)
// NOTE(review): the initializers are not written in declaration order (fVecSizes is declared before
// f_rdfs, for example). Members are always initialized in declaration order, so compilers may warn.
137 : f_rdfs(rdfs),
138 fCols(cols),
139 fVecSizes(vecSizes),
140 fVecPadding(vecPadding),
141 fValidationSplit(validationSplit),
142 fShuffle(shuffle),
143 fSetSeed(setSeed)
144 {
145 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
146 fNumCols = fCols.size();
// NOTE(review): the initial value 0 is an int, so std::accumulate sums the std::size_t elements in an
// int before the result is stored; std::size_t{0} as initial value would avoid the narrowing.
147 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
148
// NOTE(review): listing line 149 is not part of this excerpt; fNumDatasetCols is not assigned in any
// visible line and is presumably set there - confirm against the original header.
150 }
151
152 //////////////////////////////////////////////////////////////////////////
153 /// \brief Split an individual dataframe into a training and validation dataset
154 /// \param[in] rdf Dataframe that will be split into training and validation
155 /// \param[out] TrainingDataset Tensor that receives the training dataset
156 /// \param[out] ValidationDataset Tensor that receives the validation dataset
157 void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
158 {
// NOTE(review): listing line 159, which defines `Entries`, is not part of this excerpt. From its use
// below it is a pointer-like handle to a vector of entry numbers - confirm against the original header.
160 const std::size_t NumEntries = Entries->size();
161
// NOTE(review): if Entries is empty, NumEntries - 1 wraps around and the element access below is out
// of range.
162 // add the last element in entries to not go out of range when filling chunks
163 Entries->push_back((*Entries)[NumEntries - 1] + 1);
164
// NOTE(review): a validation split above 1 makes NumValidationEntries exceed NumEntries and the
// subtraction wrap around; nothing in the visible code restricts the range of fValidationSplit.
165 // number of training and validation entries after the split
166 std::size_t NumValidationEntries = static_cast<std::size_t>(fValidationSplit * NumEntries);
167 std::size_t NumTrainingEntries = NumEntries - NumValidationEntries;
168
// One row per entry, filled by RDatasetLoaderFunctor below.
169 RFlat2DMatrix Dataset({NumEntries, fNumDatasetCols});
170
171 bool NotFiltered = rdf.GetFilterNames().empty();
// Without filters a single Foreach fills all rows, starting at row 0.
172 if (NotFiltered) {
173 RDatasetLoaderFunctor<Args...> func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, 0);
174 rdf.Foreach(func, fCols);
175 }
176
// With filters every entry is handled on its own: the range of the dataframe is restricted to
// [Entries[j], Entries[j + 1]) and a fresh functor starting at row datasetEntry writes that entry.
// NOTE(review): Foreach is an instant action, so it is run once per entry in this branch.
// NOTE(review): datasetEntry is a std::size_t but the functor takes the row index as int.
177 else {
178 std::size_t datasetEntry = 0;
179 for (std::size_t j = 0; j < NumEntries; j++) {
180 RDatasetLoaderFunctor<Args...> func(Dataset, fNumDatasetCols, fVecSizes, fVecPadding, datasetEntry);
181 ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[j], (*Entries)[j + 1]);
182 rdf.Foreach(func, fCols);
183 datasetEntry++;
184 }
185 }
186
// The range is set to [first entry, last entry + 1), using the sentinel appended above. This is done
// in both branches.
187 // reset dataframe
188 ROOT::Internal::RDF::ChangeBeginAndEndEntries(rdf, (*Entries)[0], (*Entries)[NumEntries]);
189
// Shuffle into a second tensor, then cut it into the first NumTrainingEntries rows (training) and the
// remaining rows (validation), keeping all columns.
190 RFlat2DMatrix ShuffledDataset({NumEntries, fNumDatasetCols});
191 fTensorOperators->ShuffleTensor(ShuffledDataset, Dataset);
192 fTensorOperators->SliceTensor(TrainingDataset, ShuffledDataset, {{0, NumTrainingEntries}, {0, fNumDatasetCols}});
193 fTensorOperators->SliceTensor(ValidationDataset, ShuffledDataset,
194 {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}});
195 }
196
197 //////////////////////////////////////////////////////////////////////////
198 /// \brief Split the dataframes in a training and validation dataset
// NOTE(review): the function header (listing line 199) and listing lines 202-203 are not part of this
// excerpt; presumably the latter reset fNumTrainingEntries and fNumValidationEntries - confirm.
200 {
201 fNumEntries = 0;
204
// Each dataframe is split on its own; the resulting tensors are appended in the order of f_rdfs.
205 for (auto &rdf : f_rdfs) {
206 RFlat2DMatrix TrainingDataset;
207 RFlat2DMatrix ValidationDataset;
208
209 SplitDataframe(rdf, TrainingDataset, ValidationDataset);
// NOTE(review): the tensors are copied into the vectors, and the vectors are only appended to in the
// visible lines, so a second call would add the datasets again unless they are cleared elsewhere.
210 fTrainingDatasets.push_back(TrainingDataset);
211 fValidationDatasets.push_back(ValidationDataset);
212
// Accumulate the row counts over all dataframes.
213 fNumTrainingEntries += TrainingDataset.GetRows();
214 fNumValidationEntries += ValidationDataset.GetRows();
215 fNumEntries += TrainingDataset.GetRows() + ValidationDataset.GetRows();
216 }
217 }
218
219 //////////////////////////////////////////////////////////////////////////
220 /// \brief Concatenate the datasets to a dataset
226
227 std::vector<RFlat2DMatrix> GetTrainingDatasets() { return fTrainingDatasets; }
228 std::vector<RFlat2DMatrix> GetValidationDatasets() { return fValidationDatasets; }
229
232
233 std::size_t GetNumTrainingEntries() { return fTrainingDataset.GetRows(); }
234 std::size_t GetNumValidationEntries() { return fValidationDataset.GetRows(); }
235};
236
237} // namespace ROOT::Experimental::Internal::ML
238#endif // ROOT_INTERNAL_ML_RDATASETLOADER
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
Loading chunks made in RDatasetLoader into tensors from data from RDataFrame.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
Split an individual dataframe into a training and validation dataset.
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< RFlat2DMatrix > GetTrainingDatasets()
void SplitDatasets()
Split the dataframes in a training and validation dataset.
std::vector< RFlat2DMatrix > GetValidationDatasets()
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
RDatasetLoader(const std::vector< ROOT::RDF::RNode > &rdfs, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void ConcatenateDatasets()
Concatenate the datasets to a dataset.
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
Smart pointer for the return type of actions.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
Wrapper around ROOT::RVec<float> representing a 2D matrix.