Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef ROOT_INTERNAL_ML_RCHUNKLOADER
16#define ROOT_INTERNAL_ML_RCHUNKLOADER
17
18#include <vector>
19#include <random>
20
22#include "ROOT/RDataFrame.hxx"
23#include "ROOT/RDF/Utils.hxx"
26
27#include "ROOT/RLogger.hxx"
28
30/**
31\class ROOT::Experimental::Internal::ML::RChunkLoaderFunctor
32
33\brief Loads the chunks made in RChunkLoader into tensors using data from RDataFrame.
34*/
35
36template <typename... ColTypes>
38 std::size_t fOffset{};
39 std::size_t fVecSizeIdx{};
40 float fVecPadding{};
41 std::vector<std::size_t> fMaxVecSizes{};
43
44 std::size_t fNumChunkCols;
45
46 int fI;
48
49 //////////////////////////////////////////////////////////////////////////
50 /// \brief Copy the content of a column into RTensor when the column consists of vectors
52 void AssignToTensor(const T &vec, int i, int numColumns)
53 {
54 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
55 std::size_t vec_size = vec.size();
56
57 float *dst = fChunkTensor.GetData() + fOffset + numColumns * i;
58 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
59 {
60 std::copy(vec.begin(), vec.end(), dst);
61 std::fill(dst + vec_size, dst + max_vec_size, fVecPadding);
62 } else // Copy only max_vec_size length from vector column
63 {
64 std::copy(vec.begin(), vec.begin() + max_vec_size, dst);
65 }
67 }
68
69 //////////////////////////////////////////////////////////////////////////
70 /// \brief Copy the content of a column into RTensor when the column consists of single values
72 void AssignToTensor(const T &val, int i, int numColumns)
73 {
75 fOffset++;
76 // fChunkTensor.GetData()[numColumns * i] = val;
77 }
78
79public:
85
86 void operator()(const ColTypes &...cols)
87 {
88 fVecSizeIdx = 0;
90 }
91};
92
93/**
94\class ROOT::Experimental::Internal::ML::RChunkLoader
95
96\brief Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor
97
98In this class the blocks are stitched together to form chunks that are loaded into memory. The blocks used to create each
99chunk come from different parts of the dataset. This is achieved by shuffling the blocks before distributing them into
100chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well
101mixed. The dataset is also split into training and validation sets with the user-defined validation split fraction.
102*/
103
104template <typename... Args>
106private:
107 std::size_t fNumEntries;
108 std::size_t fChunkSize;
109 std::size_t fBlockSize;
111
112 std::vector<std::size_t> fVecSizes;
113 std::size_t fSumVecSizes;
114 std::size_t fVecPadding;
115 std::size_t fNumChunkCols;
116
117 std::size_t fNumTrainEntries;
119 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
120
122 std::vector<std::string> fCols;
123 std::size_t fNumCols;
124 std::size_t fSetSeed;
125
128
130
131 std::unique_ptr<RChunkConstructor> fTraining;
132 std::unique_ptr<RChunkConstructor> fValidation;
133
134public:
135 RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize,
136 const float validationSplit, const std::vector<std::string> &cols,
137 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true,
138 const std::size_t setSeed = 0)
139 : f_rdf(rdf),
140 fCols(cols),
144 fBlockSize(blockSize),
146 fNotFiltered(f_rdf.GetFilterNames().empty()),
149 {
150 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
151
152 fEntries = f_rdf.Take<ULong64_t>("rdfentry_");
153 fNumEntries = fEntries->size();
154
155 // add the last element in entries to not go out of range when filling chunks
156 fEntries->push_back((*fEntries)[fNumEntries - 1] + 1);
157
158 fNumCols = fCols.size();
159 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
160
162
163 // number of training and validation entries after the split
164 fNumValidationEntries = static_cast<std::size_t>(fValidationSplit * fNumEntries);
166
167 fTraining = std::make_unique<RChunkConstructor>(fNumTrainEntries, fChunkSize, fBlockSize);
168 fValidation = std::make_unique<RChunkConstructor>(fNumValidationEntries, fChunkSize, fBlockSize);
169 }
170
171 //////////////////////////////////////////////////////////////////////////
172 /// \brief Distribute the blocks into training and validation datasets
174 {
175 std::random_device rd;
176 std::mt19937 g;
177
178 if (fSetSeed == 0) {
179 g.seed(rd());
180 } else {
181 g.seed(fSetSeed);
182 }
183
184 std::vector<Long_t> BlockSizes = {};
185
186 // fill the training and validation block sizes
187 for (size_t i = 0; i < fTraining->NumberOfDifferentBlocks.size(); i++) {
188 BlockSizes.insert(BlockSizes.end(), fTraining->NumberOfDifferentBlocks[i], fTraining->SizeOfBlocks[i]);
189 }
190
191 for (size_t i = 0; i < fValidation->NumberOfDifferentBlocks.size(); i++) {
192 BlockSizes.insert(BlockSizes.end(), fValidation->NumberOfDifferentBlocks[i], fValidation->SizeOfBlocks[i]);
193 }
194
195 // make an identity permutation map
196 std::vector<Long_t> indices(BlockSizes.size());
197
198 for (int i = 0; i < indices.size(); ++i) {
199 indices[i] = i;
200 }
201
202 // shuffle the identity permutation to create a new permutation
203 if (fShuffle) {
204 std::shuffle(indices.begin(), indices.end(), g);
205 }
206
207 // use the permutation to shuffle the vector of block sizes
208 std::vector<Long_t> PermutedBlockSizes(BlockSizes.size());
209 for (int i = 0; i < BlockSizes.size(); ++i) {
211 }
212
213 // create a vector for storing the boundaries of the blocks
214 std::vector<Long_t> BlockBoundaries(BlockSizes.size());
215
216 // get the boundaries of the blocks with the partial sum of the block sizes
217 // insert 0 at the beginning for the lower boundary of the first block
220
221 // distribute the neighbouring block boundaries into pairs to get the intervals for the blocks
222 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
223 for (size_t i = 0; i < BlockBoundaries.size() - 1; ++i) {
224 BlockIntervals.emplace_back(BlockBoundaries[i], BlockBoundaries[i + 1]);
225 }
226
227 // use the inverse of the permutation above to order the block intervals in the same order as
228 // the original vector of block sizes
229 std::vector<std::pair<Long_t, Long_t>> UnpermutedBlockIntervals(BlockIntervals.size());
230 for (int i = 0; i < BlockIntervals.size(); ++i) {
231 UnpermutedBlockIntervals[indices[i]] = BlockIntervals[i];
232 }
233
234 // distribute the block intervals between training and validation
235 fTraining->BlockIntervals.insert(fTraining->BlockIntervals.begin(), UnpermutedBlockIntervals.begin(),
236 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks);
237 fValidation->BlockIntervals.insert(fValidation->BlockIntervals.begin(),
238 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks,
240
241 // distribute the different block intervals types for training and validation
242 fTraining->DistributeBlockIntervals();
243 fValidation->DistributeBlockIntervals();
244 }
245
246 //////////////////////////////////////////////////////////////////////////
247 /// \brief Create training chunks consisting of block intervals of different types
249 {
250
251 std::random_device rd;
252 std::mt19937 g;
253
254 if (fSetSeed == 0) {
255 g.seed(rd());
256 } else {
257 g.seed(fSetSeed);
258 }
259
260 // shuffle the block intervals within each type of block
261 if (fShuffle) {
262 std::shuffle(fTraining->FullBlockIntervalsInFullChunks.begin(),
263 fTraining->FullBlockIntervalsInFullChunks.end(), g);
264 std::shuffle(fTraining->LeftoverBlockIntervalsInFullChunks.begin(),
265 fTraining->LeftoverBlockIntervalsInFullChunks.end(), g);
266 std::shuffle(fTraining->FullBlockIntervalsInLeftoverChunks.begin(),
267 fTraining->FullBlockIntervalsInLeftoverChunks.end(), g);
268 std::shuffle(fTraining->LeftoverBlockIntervalsInLeftoverChunks.begin(),
269 fTraining->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
270 }
271
272 // reset the chunk intervals and sizes before each epoch
273 fTraining->ChunksIntervals = {};
274 fTraining->ChunksSizes = {};
275
276 // create the chunks, each consisting of block intervals
277 fTraining->CreateChunksIntervals();
278
279 if (fShuffle) {
280 std::shuffle(fTraining->ChunksIntervals.begin(), fTraining->ChunksIntervals.end(), g);
281 }
282
283 fTraining->SizeOfChunks();
284 }
285
286 //////////////////////////////////////////////////////////////////////////
287 /// \brief Create validation chunks consisting of block intervals of different types
289 {
290 std::random_device rd;
291 std::mt19937 g;
292
293 if (fSetSeed == 0) {
294 g.seed(rd());
295 } else {
296 g.seed(fSetSeed);
297 }
298
299 if (fShuffle) {
300 std::shuffle(fValidation->FullBlockIntervalsInFullChunks.begin(),
301 fValidation->FullBlockIntervalsInFullChunks.end(), g);
302 std::shuffle(fValidation->LeftoverBlockIntervalsInFullChunks.begin(),
303 fValidation->LeftoverBlockIntervalsInFullChunks.end(), g);
304 std::shuffle(fValidation->FullBlockIntervalsInLeftoverChunks.begin(),
305 fValidation->FullBlockIntervalsInLeftoverChunks.end(), g);
306 std::shuffle(fValidation->LeftoverBlockIntervalsInLeftoverChunks.begin(),
307 fValidation->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
308 }
309
310 fValidation->ChunksIntervals = {};
311 fValidation->ChunksSizes = {};
312
313 fValidation->CreateChunksIntervals();
314
315 if (fShuffle) {
316 std::shuffle(fValidation->ChunksIntervals.begin(), fValidation->ChunksIntervals.end(), g);
317 }
318
319 fValidation->SizeOfChunks();
320 }
321
322 //////////////////////////////////////////////////////////////////////////
323 /// \brief Load the nth chunk from the training dataset into a tensor
324 /// \param[out] TrainChunkTensor Matrix (RFlat2DMatrix) that the training chunk is loaded into
325 /// \param[in] chunk Index of the chunk in the dataset
327 {
328
329 std::size_t chunkSize = fTraining->ChunksSizes[chunk];
330
331 if (chunk < fTraining->Chunks) {
333
334 // fill a chunk by looping over the blocks in a chunk (see RChunkConstructor)
335 std::size_t chunkEntry = 0;
336 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fTraining->ChunksIntervals[chunk];
337
338 std::sort(
340 [](const std::pair<Long_t, Long_t> &a, const std::pair<Long_t, Long_t> &b) { return a.first < b.first; });
341
342 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
343
344 // Use the block start and end entry to load into the chunk if the dataframe is not filtered
345 if (fNotFiltered) {
348
349 f_rdf.Foreach(func, fCols);
350 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
351 }
352
353 // use the entry column of the dataframe as a map to load the entries that passed the filters
354 else {
355 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
356 for (std::size_t j = 0; j < blockSize; j++) {
359 (*fEntries)[BlocksInChunk[i].first + j + 1]);
360 f_rdf.Foreach(func, fCols);
361 chunkEntry++;
362 }
363 }
364 }
365
366 // reset dataframe
368
369 // shuffle the data in the chunk tensor
371 }
372 }
373
374 //////////////////////////////////////////////////////////////////////////
375 /// \brief Load the nth chunk from the validation dataset into a tensor
376 /// \param[out] ValidationChunkTensor Matrix (RFlat2DMatrix) that the validation chunk is loaded into
377 /// \param[in] chunk Index of the chunk in the dataset
379 {
380
381 std::size_t chunkSize = fValidation->ChunksSizes[chunk];
382
383 if (chunk < fValidation->Chunks) {
385
386 std::size_t chunkEntry = 0;
387 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fValidation->ChunksIntervals[chunk];
388
389 std::sort(
391 [](const std::pair<Long_t, Long_t> &a, const std::pair<Long_t, Long_t> &b) { return a.first < b.first; });
392
393 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
394
395 // use the block start and end entry to load into the chunk if the dataframe is not filtered
396 if (fNotFiltered) {
399 f_rdf.Foreach(func, fCols);
400 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
401 }
402
403 // use the entry column of the dataframe as a map to load the entries that passed the filters
404 else {
405 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
406 for (std::size_t j = 0; j < blockSize; j++) {
409 (*fEntries)[BlocksInChunk[i].first + j + 1]);
410
411 f_rdf.Foreach(func, fCols);
412 chunkEntry++;
413 }
414 }
415 }
416
417 // reset dataframe
419
420 // shuffle the data in the chunk tensor
422 }
423 }
424
426
427 std::vector<std::size_t> GetTrainingChunkSizes() { return fTraining->ChunksSizes; }
428 std::vector<std::size_t> GetValidationChunkSizes() { return fValidation->ChunksSizes; }
429
430 std::size_t GetNumTrainingEntries() { return fNumTrainEntries; }
432
434 {
435 const auto &rvec = Tensor.fRVec;
436 if (std::set<float>(rvec.begin(), rvec.end()).size() == rvec.size()) {
437 std::cout << "Tensor consists of only unique elements" << std::endl;
438 }
439 };
440
442 {
443 std::set<float> result;
444
445 // Call the set_intersection(), which computes the
446 // intersection of set1 and set2 and
447 // inserts the result into the 'result' set
448 std::set<float> set1(Tensor1.fRVec.begin(), Tensor1.fRVec.end());
449 std::set<float> set2(Tensor2.fRVec.begin(), Tensor2.fRVec.end());
450 std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
451 // std::list<int> result = intersection(allEntries1, allEntries2);
452
453 if (result.size() == 0) {
454 std::cout << "No overlap between the tensors" << std::endl;
455 } else {
456 std::cout << "Intersection between tensors: ";
457 for (auto num : result) {
458 std::cout << num << " ";
459 }
460 std::cout << std::endl;
461 }
462 };
463
464 std::size_t GetNumTrainingChunks() { return fTraining->Chunks; }
465
466 std::size_t GetNumValidationChunks() { return fValidation->Chunks; }
467};
468
469} // namespace ROOT::Experimental::Internal::ML
470#endif // ROOT_INTERNAL_ML_RCHUNKLOADER
#define b(i)
Definition RSha256.hxx:100
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Loads the chunks made in RChunkLoader into tensors using data from RDataFrame.
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RChunkLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor.
RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void CreateTrainingChunksIntervals()
Create training chunks consisting of block intervals of different types.
void SplitDataset()
Distribute the blocks into training and validation datasets.
std::vector< std::size_t > GetValidationChunkSizes()
void CreateValidationChunksIntervals()
Create validation chunks consisting of block intervals of different types.
void LoadTrainingChunk(RFlat2DMatrix &TrainChunkTensor, std::size_t chunk)
Load the nth chunk from the training dataset into a tensor.
std::unique_ptr< RChunkConstructor > fTraining
std::unique_ptr< RChunkConstructor > fValidation
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
void CheckIfOverlap(RFlat2DMatrix &Tensor1, RFlat2DMatrix &Tensor2)
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< std::size_t > GetTrainingChunkSizes()
void LoadValidationChunk(RFlat2DMatrix &ValidationChunkTensor, std::size_t chunk)
Load the nth chunk from the validation dataset into a tensor.
The public interface to the RDataFrame federation of classes.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
Smart pointer for the return type of actions.
const_iterator begin() const
const_iterator end() const
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
Wrapper around ROOT::RVec<float> representing a 2D matrix.