Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDatasetLoader.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026
2
3/*************************************************************************
4 * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_INTERNAL_ML_RDATASETLOADER
12#define ROOT_INTERNAL_ML_RDATASETLOADER
13
14#include <algorithm>
15#include <memory>
16#include <numeric>
17#include <string>
18#include <type_traits>
19#include <vector>
20
23#include "ROOT/RDataFrame.hxx"
24#include "ROOT/RDF/Utils.hxx"
25
27
28/**
29\class ROOT::Experimental::Internal::ML::RDatasetLoaderFunctor
30
31\brief Load the chunks made in RDatasetLoader into tensors, using data from an RDataFrame.
32*/
33
34template <typename... ColTypes>
36 std::size_t fOffset{};
37 std::size_t fVecSizeIdx{};
38 float fVecPadding{};
39 std::vector<std::size_t> fMaxVecSizes{};
41
42 std::size_t fNumDatasetCols;
43
44 int fI;
46
47 //////////////////////////////////////////////////////////////////////////
48 /// \brief Copy the content of a column into RTensor when the column consists of vectors
50 void AssignToTensor(const T &vec, int i, int numColumns)
51 {
52 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
53 std::size_t vec_size = vec.size();
54 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
55 {
56 std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]);
57 std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size],
59 } else // Copy only max_vec_size length from vector column
60 {
61 std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]);
62 }
64 }
65
66 //////////////////////////////////////////////////////////////////////////
67 /// \brief Copy the content of a column into RTensor when the column consists of single values
69 void AssignToTensor(const T &val, int i, int numColumns)
70 {
72 fOffset++;
73 }
74
75public:
77 const std::vector<std::size_t> &maxVecSizes, float vecPadding, int i)
81 fI(i),
83 {
84 }
85
86 void operator()(const ColTypes &...cols)
87 {
88 fVecSizeIdx = 0;
90 }
91};
92
93/**
94\class ROOT::Experimental::Internal::ML::RDatasetLoader
95
96\brief Load the whole dataset into memory.
97
98In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and
99validation sets with the user-defined validation split fraction.
100*/
101
102template <typename... Args>
104private:
105 std::size_t fNumEntries;
107
108 std::vector<std::size_t> fVecSizes;
109 std::size_t fSumVecSizes;
110 std::size_t fVecPadding;
111 std::size_t fNumDatasetCols;
112
113 std::vector<RFlat2DMatrix> fTrainingDatasets;
114 std::vector<RFlat2DMatrix> fValidationDatasets;
115
118
121 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
122
123 std::vector<ROOT::RDF::RNode> f_rdfs;
124 std::vector<std::string> fCols;
125 std::size_t fNumCols;
126 std::size_t fSetSeed;
127
130
132
133public:
134 RDatasetLoader(const std::vector<ROOT::RDF::RNode> &rdfs, const float validationSplit,
135 const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
136 const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0)
137 : f_rdfs(rdfs),
138 fCols(cols),
144 {
145 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
146 fNumCols = fCols.size();
147 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
148
150 }
151
152 //////////////////////////////////////////////////////////////////////////
153 /// \brief Split an individual dataframe into a training and validation dataset
154 /// \param[in] rdf Dataframe that will be split into training and validation
155 /// \param[in] TrainingDataset Tensor for the training dataset
156 /// \param[in] ValidationDataset Tensor for the validation dataset
158 {
160 const std::size_t NumEntries = Entries->size();
161
162 // add the last element in entries to not go out of range when filling chunks
163 Entries->push_back((*Entries)[NumEntries - 1] + 1);
164
165 // number of training and validation entries after the split
166 std::size_t NumValidationEntries = static_cast<std::size_t>(fValidationSplit * NumEntries);
167 std::size_t NumTrainingEntries = NumEntries - NumValidationEntries;
168
170
171 bool NotFiltered = rdf.GetFilterNames().empty();
172 if (NotFiltered) {
174 rdf.Foreach(func, fCols);
175 }
176
177 else {
178 std::size_t datasetEntry = 0;
179 for (std::size_t j = 0; j < NumEntries; j++) {
182 rdf.Foreach(func, fCols);
183 datasetEntry++;
184 }
185 }
186
187 // reset dataframe
189
194 {{NumTrainingEntries, NumEntries}, {0, fNumDatasetCols}});
195 }
196
197 //////////////////////////////////////////////////////////////////////////
198 /// \brief Split the dataframes into a training and validation dataset
218
219 //////////////////////////////////////////////////////////////////////////
220 /// \brief Concatenate the datasets into a single dataset
226
227 std::vector<RFlat2DMatrix> GetTrainingDatasets() { return fTrainingDatasets; } // returns fTrainingDatasets by value, i.e. the caller gets a copy of the vector
228 std::vector<RFlat2DMatrix> GetValidationDatasets() { return fValidationDatasets; } // returns fValidationDatasets by value, i.e. the caller gets a copy of the vector
229
232
235};
236
237} // namespace ROOT::Experimental::Internal::ML
238#endif // ROOT_INTERNAL_ML_RDATASETLOADER
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Load the chunks made in RDatasetLoader into tensors, using data from an RDataFrame.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
Split an individual dataframe into a training and validation dataset.
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< RFlat2DMatrix > GetTrainingDatasets()
void SplitDatasets()
Split the dataframes into a training and validation dataset.
std::vector< RFlat2DMatrix > GetValidationDatasets()
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
RDatasetLoader(const std::vector< ROOT::RDF::RNode > &rdfs, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void ConcatenateDatasets()
Concatenate the datasets into a single dataset.
The public interface to the RDataFrame federation of classes.
Smart pointer for the return type of actions.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
Wrapper around ROOT::RVec<float> representing a 2D matrix.