Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDatasetLoader.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026
2
3/*************************************************************************
4 * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_INTERNAL_ML_RDATASETLOADER
12#define ROOT_INTERNAL_ML_RDATASETLOADER
13
14#include <vector>
15#include <random>
16
17#include "ROOT/RDataFrame.hxx"
20#include "ROOT/RDF/Utils.hxx"
21#include "ROOT/RVec.hxx"
22
23#include "ROOT/RLogger.hxx"
24
26
27/**
28\class ROOT::Experimental::Internal::ML::RDatasetLoaderFunctor
29
30\brief Loading chunks made in RDatasetLoader into tensors from data from RDataFrame.
31*/
32
33template <typename... ColTypes>
35 std::size_t fOffset{};
36 std::size_t fVecSizeIdx{};
37 float fVecPadding{};
38 std::vector<std::size_t> fMaxVecSizes{};
40
41 std::size_t fNumDatasetCols;
42
43 int fI;
45
46 //////////////////////////////////////////////////////////////////////////
47 /// \brief Copy the content of a column into RTensor when the column consists of vectors
49 void AssignToTensor(const T &vec, int i, int numColumns)
50 {
51 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
52 std::size_t vec_size = vec.size();
53 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
54 {
55 std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]);
56 std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size],
58 } else // Copy only max_vec_size length from vector column
59 {
60 std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]);
61 }
63 }
64
65 //////////////////////////////////////////////////////////////////////////
66 /// \brief Copy the content of a column into RTensor when the column consists of single values
68 void AssignToTensor(const T &val, int i, int numColumns)
69 {
71 fOffset++;
72 }
73
74public:
76 const std::vector<std::size_t> &maxVecSizes, float vecPadding, int i)
80 fI(i),
82 {
83 }
84
85 void operator()(const ColTypes &...cols)
86 {
87 fVecSizeIdx = 0;
89 }
90};
91
92/**
93\class ROOT::Experimental::Internal::ML::RDatasetLoader
94
95\brief Load the whole dataset into memory.
96
97In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and
98validation sets with the user-defined validation split fraction.
99*/
100
101template <typename... Args>
103private:
104 std::size_t fNumEntries;
106
107 std::vector<std::size_t> fVecSizes;
108 std::size_t fSumVecSizes;
109 std::size_t fVecPadding;
110 std::size_t fNumDatasetCols;
111
112 std::vector<RFlat2DMatrix> fTrainingDatasets;
113 std::vector<RFlat2DMatrix> fValidationDatasets;
114
117
120 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
121
122 std::vector<ROOT::RDF::RNode> f_rdfs;
123 std::vector<std::string> fCols;
124 std::size_t fNumCols;
125 std::size_t fSetSeed;
126
129
131
132public:
133 RDatasetLoader(const std::vector<ROOT::RDF::RNode> &rdfs, const float validationSplit,
134 const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
135 const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0)
136 : f_rdfs(rdfs),
137 fCols(cols),
143 {
144 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
145 fNumCols = fCols.size();
146 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
147
149 }
150
151 //////////////////////////////////////////////////////////////////////////
152 /// \brief Split an individual dataframe into a training and validation dataset
153 /// \param[in] rdf Dataframe that will be split into training and validation
154 /// \param[in] TrainingDataset Tensor for the training dataset
155 /// \param[in] ValidationDataset Tensor for the validation dataset
157 {
159 const std::size_t NumEntries = Entries->size();
160
161 // add the last element in entries to not go out of range when filling chunks
162 Entries->push_back((*Entries)[NumEntries - 1] + 1);
163
164 // number of training and validation entries after the split
165 std::size_t NumValidationEntries = static_cast<std::size_t>(fValidationSplit * NumEntries);
166 std::size_t NumTrainingEntries = NumEntries - NumValidationEntries;
167
169
170 bool NotFiltered = rdf.GetFilterNames().empty();
171 if (NotFiltered) {
173 rdf.Foreach(func, fCols);
174 }
175
176 else {
177 std::size_t datasetEntry = 0;
178 for (std::size_t j = 0; j < NumEntries; j++) {
181 rdf.Foreach(func, fCols);
182 datasetEntry++;
183 }
184 }
185
186 // reset dataframe
188
193 {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}});
194 }
195
196 //////////////////////////////////////////////////////////////////////////
197 /// \brief Split the dataframes into a training and validation dataset
217
218 //////////////////////////////////////////////////////////////////////////
219 /// \brief Concatenate the datasets into a single dataset
225
/// \brief Return the training datasets stored in fTrainingDatasets.
/// \return The fTrainingDatasets vector, returned by value (i.e. the caller receives a copy).
226 std::vector<RFlat2DMatrix> GetTrainingDatasets() { return fTrainingDatasets; }
/// \brief Return the validation datasets stored in fValidationDatasets.
/// \return The fValidationDatasets vector, returned by value (i.e. the caller receives a copy).
227 std::vector<RFlat2DMatrix> GetValidationDatasets() { return fValidationDatasets; }
228
231
234};
235
236} // namespace ROOT::Experimental::Internal::ML
237#endif // ROOT_INTERNAL_ML_RDATASETLOADER
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Loading chunks made in RDatasetLoader into tensors from data from RDataFrame.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
Split an individual dataframe into a training and validation dataset.
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< RFlat2DMatrix > GetTrainingDatasets()
void SplitDatasets()
Split the dataframes into a training and validation dataset.
std::vector< RFlat2DMatrix > GetValidationDatasets()
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
RDatasetLoader(const std::vector< ROOT::RDF::RNode > &rdfs, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void ConcatenateDatasets()
Concatenate the datasets into a single dataset.
The public interface to the RDataFrame federation of classes.
Smart pointer for the return type of actions.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
Wrapper around ROOT::RVec<float> representing a 2D matrix.