Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDatasetLoader.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026
2
3/*************************************************************************
4 * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef TMVA_RDATASETLOADER
12#define TMVA_RDATASETLOADER
13
14#include <vector>
15#include <random>
16
17#include "TMVA/RTensor.hxx"
18#include "ROOT/RDataFrame.hxx"
21#include "ROOT/RDF/Utils.hxx"
22#include "ROOT/RVec.hxx"
23
24#include "ROOT/RLogger.hxx"
25
26namespace TMVA {
27namespace Experimental {
28namespace Internal {
29
30// clang-format off
31/**
32\class ROOT::TMVA::Experimental::Internal::RDatasetLoaderFunctor
33\ingroup tmva
34\brief Loads chunks made in RDatasetLoader into tensors, using data from RDataFrame.
35*/
36
37template <typename... ColTypes>
39 // clang-format on
40 std::size_t fOffset{};
41 std::size_t fVecSizeIdx{};
42 float fVecPadding{};
43 std::vector<std::size_t> fMaxVecSizes{};
45
46 std::size_t fNumDatasetCols;
47
48 int fI;
50
51 //////////////////////////////////////////////////////////////////////////
52 /// \brief Copy the content of a column into RTensor when the column consists of vectors
54 void AssignToTensor(const T &vec, int i, int numColumns)
55 {
56 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++]; // read the size limit for this vector column, then move on to the next limit
57 std::size_t vec_size = vec.size();
58 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
59 {
60 std::copy(vec.begin(), vec.end(), &fDatasetTensor.GetData()[fOffset + numColumns * i]); // the whole vector fits: copy all vec_size elements
61 std::fill(&fDatasetTensor.GetData()[fOffset + numColumns * i + vec_size], // the filled range starts right after the copied elements
63 } else // Copy only max_vec_size length from vector column
64 {
65 std::copy(vec.begin(), vec.begin() + max_vec_size, &fDatasetTensor.GetData()[fOffset + numColumns * i]); // elements beyond max_vec_size are not copied
66 }
68 }
69
70 //////////////////////////////////////////////////////////////////////////
71 /// \brief Copy the content of a column into RTensor when the column consists of single values
73 void AssignToTensor(const T &val, int i, int numColumns)
74 {
76 fOffset++; // step fOffset forward by one position after handling this single value
77 }
78
79public:
85
86 void operator()(const ColTypes &...cols)
87 {
88 fVecSizeIdx = 0; // every call starts again at the first entry of fMaxVecSizes
90 }
91};
92
93// clang-format off
94/**
95\class ROOT::TMVA::Experimental::Internal::RDatasetLoader
96\ingroup tmva
97\brief Load the whole dataset into memory.
98
99In this class the whole dataset is loaded into memory. The dataset is further shuffled and split into training and validation sets with the user-defined validation split fraction.
100*/
101
102template <typename... Args>
104private:
105 // clang-format on
106 std::size_t fNumEntries;
108
109 std::vector<std::size_t> fVecSizes;
110 std::size_t fSumVecSizes;
111 std::size_t fVecPadding;
112 std::size_t fNumDatasetCols;
113
114 std::vector<RFlat2DMatrix> fTrainingDatasets;
115 std::vector<RFlat2DMatrix> fValidationDatasets;
116
119 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
120
121 std::vector<ROOT::RDF::RNode> f_rdfs;
122 std::vector<std::string> fCols;
123 std::size_t fNumCols;
124 std::size_t fSetSeed;
125
128
130
131public:
//////////////////////////////////////////////////////////////////////////
/// \brief Keep the input dataframes and column names, create the matrix-operator helper and cache the column count and the sum of the vector sizes
// NOTE(review): vecPadding is a float parameter while the fVecPadding member is declared std::size_t - confirm the conversion is intended.
132 RDatasetLoader(const std::vector<ROOT::RDF::RNode> &rdfs, const float validationSplit,
133 const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
134 const float vecPadding = 0.0, bool shuffle = true, const std::size_t setSeed = 0)
135 : f_rdfs(rdfs),
136 fCols(cols),
142 {
143 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed); // helper is built from the shuffle flag and the seed
144 fNumCols = fCols.size();
// NOTE(review): the init value `0` is an int, so the sum is accumulated as int before being stored in the std::size_t member;
// also confirm that <numeric> (which declares std::accumulate) is included.
145 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
146
148 }
149
150 //////////////////////////////////////////////////////////////////////////
151 /// \brief Split an individual dataframe into a training and validation dataset
152 /// \param[in] rdf Dataframe that will be split into training and validation
153 /// \param[in] TrainingDataset Tensor for the training dataset
154 /// \param[in] ValidationDataset Tensor for the validation dataset
156 {
157 std::size_t NumEntries = rdf.Count().GetValue(); // GetValue() makes the entry count available here
159
160 // add the last element in entries to not go out of range when filling chunks
// NOTE(review): NumEntries is unsigned, so NumEntries - 1 wraps around when the dataframe has 0 entries - confirm that case cannot reach this line.
161 Entries->push_back((*Entries)[NumEntries - 1] + 1);
162
163 // number of training and validation entries after the split
164 std::size_t NumValidationEntries = static_cast<std::size_t>(fValidationSplit * NumEntries); // the cast drops the fractional part
165 std::size_t NumTrainingEntries = NumEntries - NumValidationEntries; // training gets whatever is not used for validation
166
168
169 bool NotFiltered = rdf.GetFilterNames().empty(); // an empty filter-name list is treated as "no filter applied to rdf"
170 if (NotFiltered) {
172 rdf.Foreach(func, fCols); // unfiltered case: a single pass over the columns in fCols
173 }
174
175 else {
176 std::size_t datasetEntry = 0;
// filtered case: Foreach is invoked once per loop iteration, and datasetEntry counts those iterations
177 for (std::size_t j = 0; j < NumEntries; j++) {
180 rdf.Foreach(func, fCols);
181 datasetEntry++;
182 }
183 }
188 }
189
190 //////////////////////////////////////////////////////////////////////////
191 /// \brief Split the dataframes in a training and validation dataset
211
212 std::vector<RFlat2DMatrix> GetTrainingDatasets() {return fTrainingDatasets;} // returns fTrainingDatasets by value, i.e. the caller receives a copy
213 std::vector<RFlat2DMatrix> GetValidationDatasets() {return fValidationDatasets;} // returns fValidationDatasets by value, i.e. the caller receives a copy
214
215};
216
217} // namespace Internal
218} // namespace Experimental
219} // namespace TMVA
220#endif // TMVA_RDATASETLOADER
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
The public interface to the RDataFrame federation of classes.
Smart pointer for the return type of actions.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RDatasetLoaderFunctor(RFlat2DMatrix &datasetTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
std::vector< RFlat2DMatrix > GetValidationDatasets()
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< RFlat2DMatrix > GetTrainingDatasets()
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
void SplitDatasets()
Split the dataframes in a training and validation dataset.
std::vector< RFlat2DMatrix > fValidationDatasets
std::vector< RFlat2DMatrix > fTrainingDatasets
void SplitDataframe(ROOT::RDF::RNode &rdf, RFlat2DMatrix &TrainingDataset, RFlat2DMatrix &ValidationDataset)
Split an individual dataframe into a training and validation dataset.
RDatasetLoader(const std::vector< ROOT::RDF::RNode > &rdfs, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
create variable transformations
Wrapper around ROOT::RVec<float> representing a 2D matrix.