Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef ROOT_INTERNAL_ML_RCHUNKLOADER
16#define ROOT_INTERNAL_ML_RCHUNKLOADER
17
18#include <algorithm>
19#include <iostream>
20#include <iterator>
21#include <memory>
22#include <numeric>
23#include <random>
24#include <set>
25#include <string>
26#include <type_traits>
27#include <utility>
28#include <vector>
29
33#include "ROOT/RDataFrame.hxx"
34#include "ROOT/RDF/Utils.hxx"
35
37/**
38\class ROOT::Experimental::Internal::ML::RChunkLoaderFunctor
39
40\brief Loading chunks made in RChunkLoader into tensors from data from RDataFrame.
41*/
42
43template <typename... ColTypes>
45 std::size_t fOffset{};
46 std::size_t fVecSizeIdx{};
47 float fVecPadding{};
48 std::vector<std::size_t> fMaxVecSizes{};
50
51 std::size_t fNumChunkCols;
52
53 int fI;
55
56 //////////////////////////////////////////////////////////////////////////
57 /// \brief Copy the content of a column into RTensor when the column consists of vectors
59 void AssignToTensor(const T &vec, int i, int numColumns)
60 {
61 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
62 std::size_t vec_size = vec.size();
63
64 float *dst = fChunkTensor.GetData() + fOffset + numColumns * i;
65 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
66 {
67 std::copy(vec.begin(), vec.end(), dst);
68 std::fill(dst + vec_size, dst + max_vec_size, fVecPadding);
69 } else // Copy only max_vec_size length from vector column
70 {
71 std::copy(vec.begin(), vec.begin() + max_vec_size, dst);
72 }
74 }
75
76 //////////////////////////////////////////////////////////////////////////
77 /// \brief Copy the content of a column into RTensor when the column consists of single values
79 void AssignToTensor(const T &val, int i, int numColumns)
80 {
82 fOffset++;
83 // fChunkTensor.GetData()[numColumns * i] = val;
84 }
85
86public:
92
93 void operator()(const ColTypes &...cols)
94 {
95 fVecSizeIdx = 0;
97 }
98};
99
100/**
101\class ROOT::Experimental::Internal::ML::RChunkLoader
102
103\brief Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor
104
105In this class the blocks are stitched together to form chunks that are loaded into memory. The blocks used to create each
106chunk come from different parts of the dataset. This is achieved by shuffling the blocks before distributing them into
107chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well
108mixed. The dataset is also split into training and validation sets with the user-defined validation split fraction.
109*/
110
111template <typename... Args>
113private:
114 std::size_t fNumEntries;
115 std::size_t fChunkSize;
116 std::size_t fBlockSize;
118
119 std::vector<std::size_t> fVecSizes;
120 std::size_t fSumVecSizes;
121 std::size_t fVecPadding;
122 std::size_t fNumChunkCols;
123
124 std::size_t fNumTrainEntries;
126 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
127
129 std::vector<std::string> fCols;
130 std::size_t fNumCols;
131 std::size_t fSetSeed;
132
135
137
138 std::unique_ptr<RChunkConstructor> fTraining;
139 std::unique_ptr<RChunkConstructor> fValidation;
140
141public:
142 RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize,
143 const float validationSplit, const std::vector<std::string> &cols,
144 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true,
145 const std::size_t setSeed = 0)
146 : f_rdf(rdf),
147 fCols(cols),
151 fBlockSize(blockSize),
153 fNotFiltered(f_rdf.GetFilterNames().empty()),
156 {
157 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
158
159 fEntries = f_rdf.Take<ULong64_t>("rdfentry_");
160 fNumEntries = fEntries->size();
161
162 // add the last element in entries to not go out of range when filling chunks
163 fEntries->push_back((*fEntries)[fNumEntries - 1] + 1);
164
165 fNumCols = fCols.size();
166 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
167
169
170 // number of training and validation entries after the split
171 fNumValidationEntries = static_cast<std::size_t>(fValidationSplit * fNumEntries);
173
174 fTraining = std::make_unique<RChunkConstructor>(fNumTrainEntries, fChunkSize, fBlockSize);
175 fValidation = std::make_unique<RChunkConstructor>(fNumValidationEntries, fChunkSize, fBlockSize);
176 }
177
178 //////////////////////////////////////////////////////////////////////////
179 /// \brief Distribute the blocks into training and validation datasets
181 {
182 std::random_device rd;
183 std::mt19937 g;
184
185 if (fSetSeed == 0) {
186 g.seed(rd());
187 } else {
188 g.seed(fSetSeed);
189 }
190
191 std::vector<Long_t> BlockSizes = {};
192
193 // fill the training and validation block sizes
194 for (size_t i = 0; i < fTraining->NumberOfDifferentBlocks.size(); i++) {
195 BlockSizes.insert(BlockSizes.end(), fTraining->NumberOfDifferentBlocks[i], fTraining->SizeOfBlocks[i]);
196 }
197
198 for (size_t i = 0; i < fValidation->NumberOfDifferentBlocks.size(); i++) {
199 BlockSizes.insert(BlockSizes.end(), fValidation->NumberOfDifferentBlocks[i], fValidation->SizeOfBlocks[i]);
200 }
201
202 // make an identity permutation map
203 std::vector<Long_t> indices(BlockSizes.size());
204
205 for (int i = 0; i < indices.size(); ++i) {
206 indices[i] = i;
207 }
208
209 // shuffle the identity permutation to create a new permutation
210 if (fShuffle) {
211 std::shuffle(indices.begin(), indices.end(), g);
212 }
213
214 // use the permutation to shuffle the vector of block sizes
215 std::vector<Long_t> PermutedBlockSizes(BlockSizes.size());
216 for (int i = 0; i < BlockSizes.size(); ++i) {
218 }
219
220 // create a vector for storing the boundaries of the blocks
221 std::vector<Long_t> BlockBoundaries(BlockSizes.size());
222
223 // get the boundaries of the blocks with the partial sum of the block sizes
224 // insert 0 at the beginning for the lower boundary of the first block
227
228 // distribute the neighbouring block boundaries into pairs to get the intervals for the blocks
229 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
230 for (size_t i = 0; i < BlockBoundaries.size() - 1; ++i) {
231 BlockIntervals.emplace_back(BlockBoundaries[i], BlockBoundaries[i + 1]);
232 }
233
234 // use the inverse of the permutation above to order the block intervals in the same order as
235 // the original vector of block sizes
236 std::vector<std::pair<Long_t, Long_t>> UnpermutedBlockIntervals(BlockIntervals.size());
237 for (int i = 0; i < BlockIntervals.size(); ++i) {
238 UnpermutedBlockIntervals[indices[i]] = BlockIntervals[i];
239 }
240
241 // distribute the block intervals between training and validation
242 fTraining->BlockIntervals.insert(fTraining->BlockIntervals.begin(), UnpermutedBlockIntervals.begin(),
243 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks);
244 fValidation->BlockIntervals.insert(fValidation->BlockIntervals.begin(),
245 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks,
247
248 // distribute the different block intervals types for training and validation
249 fTraining->DistributeBlockIntervals();
250 fValidation->DistributeBlockIntervals();
251 }
252
253 //////////////////////////////////////////////////////////////////////////
254 /// \brief Create training chunks consisting of block intervals of different types
256 {
257
258 std::random_device rd;
259 std::mt19937 g;
260
261 if (fSetSeed == 0) {
262 g.seed(rd());
263 } else {
264 g.seed(fSetSeed);
265 }
266
267 // shuffle the block intervals within each type of block
268 if (fShuffle) {
269 std::shuffle(fTraining->FullBlockIntervalsInFullChunks.begin(),
270 fTraining->FullBlockIntervalsInFullChunks.end(), g);
271 std::shuffle(fTraining->LeftoverBlockIntervalsInFullChunks.begin(),
272 fTraining->LeftoverBlockIntervalsInFullChunks.end(), g);
273 std::shuffle(fTraining->FullBlockIntervalsInLeftoverChunks.begin(),
274 fTraining->FullBlockIntervalsInLeftoverChunks.end(), g);
275 std::shuffle(fTraining->LeftoverBlockIntervalsInLeftoverChunks.begin(),
276 fTraining->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
277 }
278
279 // reset the chunk intervals and sizes before each epoch
280 fTraining->ChunksIntervals = {};
281 fTraining->ChunksSizes = {};
282
283 // create the chunks each consisting of block intervals
284 fTraining->CreateChunksIntervals();
285
286 if (fShuffle) {
287 std::shuffle(fTraining->ChunksIntervals.begin(), fTraining->ChunksIntervals.end(), g);
288 }
289
290 fTraining->SizeOfChunks();
291 }
292
293 //////////////////////////////////////////////////////////////////////////
294 /// \brief Create validation chunks consisting of block intervals of different types
296 {
297 std::random_device rd;
298 std::mt19937 g;
299
300 if (fSetSeed == 0) {
301 g.seed(rd());
302 } else {
303 g.seed(fSetSeed);
304 }
305
306 if (fShuffle) {
307 std::shuffle(fValidation->FullBlockIntervalsInFullChunks.begin(),
308 fValidation->FullBlockIntervalsInFullChunks.end(), g);
309 std::shuffle(fValidation->LeftoverBlockIntervalsInFullChunks.begin(),
310 fValidation->LeftoverBlockIntervalsInFullChunks.end(), g);
311 std::shuffle(fValidation->FullBlockIntervalsInLeftoverChunks.begin(),
312 fValidation->FullBlockIntervalsInLeftoverChunks.end(), g);
313 std::shuffle(fValidation->LeftoverBlockIntervalsInLeftoverChunks.begin(),
314 fValidation->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
315 }
316
317 fValidation->ChunksIntervals = {};
318 fValidation->ChunksSizes = {};
319
320 fValidation->CreateChunksIntervals();
321
322 if (fShuffle) {
323 std::shuffle(fValidation->ChunksIntervals.begin(), fValidation->ChunksIntervals.end(), g);
324 }
325
326 fValidation->SizeOfChunks();
327 }
328
329 //////////////////////////////////////////////////////////////////////////
330 /// \brief Load the nth chunk from the training dataset into a tensor
331 /// \param[in] TrainChunkTensor RTensor for the training chunk
332 /// \param[in] chunk Index of the chunk in the dataset
334 {
335
336 std::size_t chunkSize = fTraining->ChunksSizes[chunk];
337
338 if (chunk < fTraining->Chunks) {
340
341 // fill a chunk by looping over the blocks in a chunk (see RChunkConstructor)
342 std::size_t chunkEntry = 0;
343 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fTraining->ChunksIntervals[chunk];
344
345 std::sort(
347 [](const std::pair<Long_t, Long_t> &a, const std::pair<Long_t, Long_t> &b) { return a.first < b.first; });
348
349 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
350
351 // Use the block start and end entry to load into the chunk if the dataframe is not filtered
352 if (fNotFiltered) {
355
356 f_rdf.Foreach(func, fCols);
357 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
358 }
359
360 // use the entry column of the dataframe as a map to load the entries that passed the filters
361 else {
362 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
363 for (std::size_t j = 0; j < blockSize; j++) {
366 (*fEntries)[BlocksInChunk[i].first + j + 1]);
367 f_rdf.Foreach(func, fCols);
368 chunkEntry++;
369 }
370 }
371 }
372
373 // reset dataframe
375
376 // shuffle the data in the chunk tensor
378 }
379 }
380
381 //////////////////////////////////////////////////////////////////////////
382 /// \brief Load the nth chunk from the validation dataset into a tensor
383 /// \param[in] ValidationChunkTensor RTensor for the validation chunk
384 /// \param[in] chunk Index of the chunk in the dataset
386 {
387
388 std::size_t chunkSize = fValidation->ChunksSizes[chunk];
389
390 if (chunk < fValidation->Chunks) {
392
393 std::size_t chunkEntry = 0;
394 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fValidation->ChunksIntervals[chunk];
395
396 std::sort(
398 [](const std::pair<Long_t, Long_t> &a, const std::pair<Long_t, Long_t> &b) { return a.first < b.first; });
399
400 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
401
402 // use the block start and end entry to load into the chunk if the dataframe is not filtered
403 if (fNotFiltered) {
406 f_rdf.Foreach(func, fCols);
407 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
408 }
409
410 // use the entry column of the dataframe as a map to load the entries that passed the filters
411 else {
412 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
413 for (std::size_t j = 0; j < blockSize; j++) {
416 (*fEntries)[BlocksInChunk[i].first + j + 1]);
417
418 f_rdf.Foreach(func, fCols);
419 chunkEntry++;
420 }
421 }
422 }
423
424 // reset dataframe
426
427 // shuffle the data in the chunk tensor
429 }
430 }
431
433
434 std::vector<std::size_t> GetTrainingChunkSizes() { return fTraining->ChunksSizes; }
435 std::vector<std::size_t> GetValidationChunkSizes() { return fValidation->ChunksSizes; }
436
437 std::size_t GetNumTrainingEntries() { return fNumTrainEntries; }
439
441 {
442 const auto &rvec = Tensor.fRVec;
443 if (std::set<float>(rvec.begin(), rvec.end()).size() == rvec.size()) {
444 std::cout << "Tensor consists of only unique elements" << std::endl;
445 }
446 };
447
449 {
450 std::set<float> result;
451
452 // Call the set_intersection(), which computes the
453 // intersection of set1 and set2 and
454 // inserts the result into the 'result' set
455 std::set<float> set1(Tensor1.fRVec.begin(), Tensor1.fRVec.end());
456 std::set<float> set2(Tensor2.fRVec.begin(), Tensor2.fRVec.end());
457 std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
458 // std::list<int> result = intersection(allEntries1, allEntries2);
459
460 if (result.size() == 0) {
461 std::cout << "No overlap between the tensors" << std::endl;
462 } else {
463 std::cout << "Intersection between tensors: ";
464 for (auto num : result) {
465 std::cout << num << " ";
466 }
467 std::cout << std::endl;
468 }
469 };
470
471 std::size_t GetNumTrainingChunks() { return fTraining->Chunks; }
472
473 std::size_t GetNumValidationChunks() { return fValidation->Chunks; }
474};
475
476} // namespace ROOT::Experimental::Internal::ML
477#endif // ROOT_INTERNAL_ML_RCHUNKLOADER
#define b(i)
Definition RSha256.hxx:100
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Loading chunks made in RChunkLoader into tensors from data from RDataFrame.
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RChunkLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor.
RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void CreateTrainingChunksIntervals()
Create training chunks consisting of block intervals of different types.
void SplitDataset()
Distribute the blocks into training and validation datasets.
std::vector< std::size_t > GetValidationChunkSizes()
void CreateValidationChunksIntervals()
Create validation chunks consisting of block intervals of different types.
void LoadTrainingChunk(RFlat2DMatrix &TrainChunkTensor, std::size_t chunk)
Load the nth chunk from the training dataset into a tensor.
std::unique_ptr< RChunkConstructor > fTraining
std::unique_ptr< RChunkConstructor > fValidation
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
void CheckIfOverlap(RFlat2DMatrix &Tensor1, RFlat2DMatrix &Tensor2)
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::vector< std::size_t > GetTrainingChunkSizes()
void LoadValidationChunk(RFlat2DMatrix &ValidationChunkTensor, std::size_t chunk)
Load the nth chunk from the validation dataset into a tensor.
The public interface to the RDataFrame federation of classes.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
Smart pointer for the return type of actions.
const_iterator begin() const
const_iterator end() const
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
Wrapper around ROOT::RVec<float> representing a 2D matrix.