Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef TMVA_RCHUNKLOADER
16#define TMVA_RCHUNKLOADER
17
18#include <vector>
19#include <random>
20
22#include "ROOT/RDataFrame.hxx"
23#include "ROOT/RDF/Utils.hxx"
26
27#include "ROOT/RLogger.hxx"
28
29namespace TMVA {
30namespace Experimental {
31namespace Internal {
32
33// clang-format off
34/**
35\class TMVA::Experimental::Internal::RChunkLoaderFunctor
36\ingroup tmva
37\brief Loading chunks made in RChunkLoader into tensors from data from RDataFrame.
38*/
39
40template <typename... ColTypes>
42 // clang-format on
43 std::size_t fOffset{};
44 std::size_t fVecSizeIdx{};
45 float fVecPadding{};
46 std::vector<std::size_t> fMaxVecSizes{};
48
49 std::size_t fNumChunkCols;
50
51 int fI;
53
54 //////////////////////////////////////////////////////////////////////////
55 /// \brief Copy the content of a column into RTensor when the column consists of vectors
57 void AssignToTensor(const T &vec, int i, int numColumns)
58 {
59 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
60 std::size_t vec_size = vec.size();
61
62 float *dst = fChunkTensor.GetData() + fOffset + numColumns * i;
63 if (vec_size < max_vec_size) // Padding vector column to max_vec_size with fVecPadding
64 {
65 std::copy(vec.begin(), vec.end(), dst);
66 std::fill(dst + vec_size, dst + max_vec_size, fVecPadding);
67 } else // Copy only max_vec_size length from vector column
68 {
69 std::copy(vec.begin(), vec.begin() + max_vec_size, dst);
70 }
72 }
73
74 //////////////////////////////////////////////////////////////////////////
75 /// \brief Copy the content of a column into RTensor when the column consists of single values
77 void AssignToTensor(const T &val, int i, int numColumns)
78 {
80 fOffset++;
81 // fChunkTensor.GetData()[numColumns * i] = val;
82 }
83
84public:
90
91 void operator()(const ColTypes &...cols)
92 {
93 fVecSizeIdx = 0;
95 }
96};
97
98// clang-format off
99/**
100\class TMVA::Experimental::Internal::RChunkLoader
101\ingroup tmva
102\brief Building and loading the chunks from the blocks and chunks constructed in RChunkConstructor
103
104In this class the blocks are stitched together to form chunks that are loaded into memory. The blocks used to create each chunk come from different parts of the dataset. This is achieved by shuffling the blocks before distributing them into chunks. The purpose of this process is to reduce bias during machine learning training by ensuring that the data is well mixed. The dataset is also split into training and validation sets with the user-defined validation split fraction.
105*/
106
107template <typename... Args>
109private:
110 // clang-format on
111 std::size_t fNumEntries;
112 std::size_t fChunkSize;
113 std::size_t fBlockSize;
115
116 std::vector<std::size_t> fVecSizes;
117 std::size_t fSumVecSizes;
118 std::size_t fVecPadding;
119 std::size_t fNumChunkCols;
120
121 std::size_t fNumTrainEntries;
123 std::unique_ptr<RFlat2DMatrixOperators> fTensorOperators;
124
126 std::vector<std::string> fCols;
127 std::size_t fNumCols;
128 std::size_t fSetSeed;
129
132
134
135 std::unique_ptr<RChunkConstructor> fTraining;
136 std::unique_ptr<RChunkConstructor> fValidation;
137
138public:
139 RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize,
140 const float validationSplit, const std::vector<std::string> &cols,
141 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true,
142 const std::size_t setSeed = 0)
143 : f_rdf(rdf),
144 fCols(cols),
148 fBlockSize(blockSize),
150 fNotFiltered(f_rdf.GetFilterNames().empty()),
153 {
154 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
155
156 fNumEntries = f_rdf.Count().GetValue();
157 fEntries = f_rdf.Take<ULong64_t>("rdfentry_");
158
159 // add the last element in entries to not go out of range when filling chunks
160 fEntries->push_back((*fEntries)[fNumEntries - 1] + 1);
161
162 fNumCols = fCols.size();
163 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
164
166
167 // number of training and validation entries after the split
168 fNumValidationEntries = static_cast<std::size_t>(fValidationSplit * fNumEntries);
170
171 fTraining = std::make_unique<RChunkConstructor>(fNumTrainEntries, fChunkSize, fBlockSize);
172 fValidation = std::make_unique<RChunkConstructor>(fNumValidationEntries, fChunkSize, fBlockSize);
173 }
174
175 //////////////////////////////////////////////////////////////////////////
176 /// \brief Distribute the blocks into training and validation datasets
178 {
179 std::random_device rd;
180 std::mt19937 g;
181
182 if (fSetSeed == 0) {
183 g.seed(rd());
184 } else {
185 g.seed(fSetSeed);
186 }
187
188 std::vector<Long_t> BlockSizes = {};
189
190 // fill the training and validation block sizes
191 for (size_t i = 0; i < fTraining->NumberOfDifferentBlocks.size(); i++) {
192 BlockSizes.insert(BlockSizes.end(), fTraining->NumberOfDifferentBlocks[i], fTraining->SizeOfBlocks[i]);
193 }
194
195 for (size_t i = 0; i < fValidation->NumberOfDifferentBlocks.size(); i++) {
196 BlockSizes.insert(BlockSizes.end(), fValidation->NumberOfDifferentBlocks[i], fValidation->SizeOfBlocks[i]);
197 }
198
199 // make an identity permutation map
200 std::vector<Long_t> indices(BlockSizes.size());
201
202 for (int i = 0; i < indices.size(); ++i) {
203 indices[i] = i;
204 }
205
206 // shuffle the identity permutation to create a new permutation
207 if (fShuffle) {
208 std::shuffle(indices.begin(), indices.end(), g);
209 }
210
211 // use the permutation to shuffle the vector of block sizes
212 std::vector<Long_t> PermutedBlockSizes(BlockSizes.size());
213 for (int i = 0; i < BlockSizes.size(); ++i) {
214 PermutedBlockSizes[i] = BlockSizes[indices[i]];
215 }
216
217 // create a vector for storing the boundaries of the blocks
218 std::vector<Long_t> BlockBoundaries(BlockSizes.size());
219
220 // get the boundaries of the blocks with the partial sum of the block sizes
221 // insert 0 at the beginning for the lower boundary of the first block
224
225 // distribute the neighbouring block boundaries into pairs to get the intervals for the blocks
226 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
227 for (size_t i = 0; i < BlockBoundaries.size() - 1; ++i) {
228 BlockIntervals.emplace_back(BlockBoundaries[i], BlockBoundaries[i + 1]);
229 }
230
231 // use the inverse of the permutation above to order the block intervals in the same order as
232 // the original vector of block sizes
233 std::vector<std::pair<Long_t, Long_t>> UnpermutedBlockIntervals(BlockIntervals.size());
234 for (int i = 0; i < BlockIntervals.size(); ++i) {
235 UnpermutedBlockIntervals[indices[i]] = BlockIntervals[i];
236 }
237
238 // distribute the block intervals between training and validation
239 fTraining->BlockIntervals.insert(fTraining->BlockIntervals.begin(), UnpermutedBlockIntervals.begin(),
240 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks);
241 fValidation->BlockIntervals.insert(fValidation->BlockIntervals.begin(),
242 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks,
244
245 // distribute the different block intervals types for training and validation
246 fTraining->DistributeBlockIntervals();
247 fValidation->DistributeBlockIntervals();
248 }
249
250 //////////////////////////////////////////////////////////////////////////
251 /// \brief Create training chunks consisting of block intervals of different types
253 {
254
255 std::random_device rd;
256 std::mt19937 g;
257
258 if (fSetSeed == 0) {
259 g.seed(rd());
260 } else {
261 g.seed(fSetSeed);
262 }
263
264 // shuffle the block intervals within each type of block
265 if (fShuffle) {
266 std::shuffle(fTraining->FullBlockIntervalsInFullChunks.begin(),
267 fTraining->FullBlockIntervalsInFullChunks.end(), g);
268 std::shuffle(fTraining->LeftoverBlockIntervalsInFullChunks.begin(),
269 fTraining->LeftoverBlockIntervalsInFullChunks.end(), g);
270 std::shuffle(fTraining->FullBlockIntervalsInLeftoverChunks.begin(),
271 fTraining->FullBlockIntervalsInLeftoverChunks.end(), g);
272 std::shuffle(fTraining->LeftoverBlockIntervalsInLeftoverChunks.begin(),
273 fTraining->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
274 }
275
276 // reset the chunk intervals and sizes before each epoch
277 fTraining->ChunksIntervals = {};
278 fTraining->ChunksSizes = {};
279
280 // create the chunks each consisting of block intervals
281 fTraining->CreateChunksIntervals();
282
283 if (fShuffle) {
284 std::shuffle(fTraining->ChunksIntervals.begin(), fTraining->ChunksIntervals.end(), g);
285 }
286
287 fTraining->SizeOfChunks();
288 }
289
290 //////////////////////////////////////////////////////////////////////////
291 /// \brief Create validation chunks consisting of block intervals of different types
293 {
294 std::random_device rd;
295 std::mt19937 g;
296
297 if (fSetSeed == 0) {
298 g.seed(rd());
299 } else {
300 g.seed(fSetSeed);
301 }
302
303 if (fShuffle) {
304 std::shuffle(fValidation->FullBlockIntervalsInFullChunks.begin(),
305 fValidation->FullBlockIntervalsInFullChunks.end(), g);
306 std::shuffle(fValidation->LeftoverBlockIntervalsInFullChunks.begin(),
307 fValidation->LeftoverBlockIntervalsInFullChunks.end(), g);
308 std::shuffle(fValidation->FullBlockIntervalsInLeftoverChunks.begin(),
309 fValidation->FullBlockIntervalsInLeftoverChunks.end(), g);
310 std::shuffle(fValidation->LeftoverBlockIntervalsInLeftoverChunks.begin(),
311 fValidation->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
312 }
313
314 fValidation->ChunksIntervals = {};
315 fValidation->ChunksSizes = {};
316
317 fValidation->CreateChunksIntervals();
318
319 if (fShuffle) {
320 std::shuffle(fValidation->ChunksIntervals.begin(), fValidation->ChunksIntervals.end(), g);
321 }
322
323 fValidation->SizeOfChunks();
324 }
325
326 //////////////////////////////////////////////////////////////////////////
327 /// \brief Load the nth chunk from the training dataset into a tensor
328 /// \param[in] TrainChunkTensor RTensor for the training chunk
329 /// \param[in] chunk Index of the chunk in the dataset
331 {
332
333 std::size_t chunkSize = fTraining->ChunksSizes[chunk];
334
335 if (chunk < fTraining->Chunks) {
337
338 // fill a chunk by looping over the blocks in a chunk (see RChunkConstructor)
339 std::size_t chunkEntry = 0;
340 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fTraining->ChunksIntervals[chunk];
341
342 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
343 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
344 return a.first < b.first;
345 });
346
347 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
348
349 // Use the block start and end entry to load into the chunk if the dataframe is not filtered
350 if (fNotFiltered) {
353
354 f_rdf.Foreach(func, fCols);
355 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
356 }
357
358 // use the entry column of the dataframe as a map to load the entries that passed the filters
359 else {
360 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
361 for (std::size_t j = 0; j < blockSize; j++) {
364 (*fEntries)[BlocksInChunk[i].first + j + 1]);
365 f_rdf.Foreach(func, fCols);
366 chunkEntry++;
367 }
368 }
369 }
370
371 // shuffle the data in the chunk tensor
373 }
374 }
375
376 //////////////////////////////////////////////////////////////////////////
377 /// \brief Load the nth chunk from the validation dataset into a tensor
378 /// \param[in] ValidationChunkTensor RTensor for the validation chunk
379 /// \param[in] chunk Index of the chunk in the dataset
381 {
382
383 std::size_t chunkSize = fValidation->ChunksSizes[chunk];
384
385 if (chunk < fValidation->Chunks) {
387
388 std::size_t chunkEntry = 0;
389 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fValidation->ChunksIntervals[chunk];
390
391 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
392 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
393 return a.first < b.first;
394 });
395
396 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
397
398 // use the block start and end entry to load into the chunk if the dataframe is not filtered
399 if (fNotFiltered) {
402 f_rdf.Foreach(func, fCols);
403 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
404 }
405
406 // use the entry column of the dataframe as a map to load the entries that passed the filters
407 else {
408 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
409 for (std::size_t j = 0; j < blockSize; j++) {
412 (*fEntries)[BlocksInChunk[i].first + j + 1]);
413
414 f_rdf.Foreach(func, fCols);
415 chunkEntry++;
416 }
417 }
418 }
419
420 // shuffle the data in the chunk tensor
422 }
423 }
424
429
430 std::vector<std::size_t> GetTrainingChunkSizes() { return fTraining->ChunksSizes; }
431 std::vector<std::size_t> GetValidationChunkSizes() { return fValidation->ChunksSizes; }
432
433 std::size_t GetNumTrainingEntries() { return fNumTrainEntries; }
435
437 {
438 const auto &rvec = Tensor.fRVec;
439 if(std::set<float>(rvec.begin(), rvec.end()).size() == rvec.size()) {
440 std::cout << "Tensor consists of only unique elements" << std::endl;
441 }
442 };
443
445 {
446 std::set<float> result;
447
448 // Call the set_intersection(), which computes the
449 // intersection of set1 and set2 and
450 // inserts the result into the 'result' set
451 std::set<float> set1(Tensor1.fRVec.begin(), Tensor1.fRVec.end());
452 std::set<float> set2(Tensor2.fRVec.begin(), Tensor2.fRVec.end());
453 std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), std::inserter(result, result.begin()));
454 // std::list<int> result = intersection(allEntries1, allEntries2);
455
456 if (result.size() == 0) {
457 std::cout << "No overlap between the tensors" << std::endl;
458 } else {
459 std::cout << "Intersection between tensors: ";
460 for (auto num : result) {
461 std::cout << num << " ";
462 }
463 std::cout << std::endl;
464 }
465 };
466
467 std::size_t GetNumTrainingChunks() { return fTraining->Chunks; }
468
469 std::size_t GetNumValidationChunks() { return fValidation->Chunks; }
470};
471
472} // namespace Internal
473} // namespace Experimental
474} // namespace TMVA
475#endif // TMVA_RCHUNKLOADER
#define b(i)
Definition RSha256.hxx:100
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
The public interface to the RDataFrame federation of classes.
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action).
Smart pointer for the return type of actions.
const_iterator begin() const
const_iterator end() const
void AssignToTensor(const T &vec, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of vectors.
void AssignToTensor(const T &val, int i, int numColumns)
Copy the content of a column into RTensor when the column consists of single values.
RChunkLoaderFunctor(RFlat2DMatrix &chunkTensor, std::size_t numColumns, const std::vector< std::size_t > &maxVecSizes, float vecPadding, int i)
void CheckIfUnique(RFlat2DMatrix &Tensor)
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
std::unique_ptr< RChunkConstructor > fValidation
void LoadTrainingChunk(RFlat2DMatrix &TrainChunkTensor, std::size_t chunk)
Load the nth chunk from the training dataset into a tensor.
void CheckIfOverlap(RFlat2DMatrix &Tensor1, RFlat2DMatrix &Tensor2)
void LoadValidationChunk(RFlat2DMatrix &ValidationChunkTensor, std::size_t chunk)
Load the nth chunk from the validation dataset into a tensor.
std::vector< std::size_t > GetTrainingChunkSizes()
ROOT::RDF::RResultPtr< std::vector< ULong64_t > > fEntries
std::vector< std::size_t > GetValidationChunkSizes()
RChunkLoader(ROOT::RDF::RNode &rdf, const std::size_t chunkSize, const std::size_t blockSize, const float validationSplit, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, const float vecPadding=0.0, bool shuffle=true, const std::size_t setSeed=0)
void SplitDataset()
Distribute the blocks into training and validation datasets.
void CreateValidationChunksIntervals()
Create validation chunks consisting of block intervals of different types.
void CreateTrainingChunksIntervals()
Create training chunks consisting of block intervals of different types.
std::unique_ptr< RChunkConstructor > fTraining
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
create variable transformations
Wrapper around ROOT::RVec<float> representing a 2D matrix.