Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkConstructor.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
2
3/*************************************************************************
4 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR
12#define ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR
13
14#include <vector>
15
16#include "ROOT/RDataFrame.hxx"
17#include "ROOT/RDF/Utils.hxx"
18#include "ROOT/RVec.hxx"
19
20#include "ROOT/RLogger.hxx"
21
23/**
24\class ROOT::Experimental::Internal::ML::RChunkConstructor
25
26\brief The logic for constructing chunks from a dataset.
27
28This struct handles the logic for splitting a dataset into smaller subsets
29known as chunks, which are constructed from blocks.
30
31A chunk is the largest portion of the dataset loaded into memory at once,
32and each chunk is further divided into batches for machine learning training.
33
34The dataset is split into disjoint chunks based on a user-defined chunk size.
35There are two types of chunks:
36 - Full chunks: contain exactly the number of entries specified by the chunk size.
37 - Leftover chunk: contains any remaining entries that don't make up a full chunk.
38
39Each chunk is constructed from blocks based on a user-defined block size.
40There are two types of blocks:
41 - Full blocks: contain exactly the number of entries specified by the block size.
42 - Leftover block: contains any remaining entries that don't make up a full block.
43
44The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number
45of entries.
46*/
47
49 std::size_t fNumEntries{};
50 std::size_t fChunkSize{};
51 std::size_t fBlockSize{};
52
53 // size of full and leftover chunks
54 std::size_t SizeOfFullChunk;
56
57 // size of full and leftover blocks in a full and leftover chunk
62
63 // number of full, leftover and total chunks
64 std::size_t FullChunks;
65 std::size_t LeftoverChunks;
66 std::size_t Chunks;
67
68 // number of full, leftover and total blocks in a full chunk
71 std::size_t BlockPerFullChunk;
72
73 // number of full, leftover and total blocks in the leftover chunk
77
78 // total number of full and leftover blocks in the full chunks
81
82 // total number of full and leftover blocks in the leftover chunks
85
86 // vector of the different block sizes
87 std::vector<std::size_t> SizeOfBlocks;
88
89 // vector with the number of blocks of each type
90 std::vector<std::size_t> NumberOfDifferentBlocks;
91
92 // total number of blocks
93 std::size_t NumberOfBlocks;
94
95 // pair of start and end entries in the different block types
96 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
97
98 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks;
99 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInFullChunks;
100
101 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInLeftoverChunks;
102 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInLeftoverChunks;
103
104 std::vector<std::vector<std::pair<Long_t, Long_t>>> ChunksIntervals;
105
106 std::vector<std::size_t> ChunksSizes;
107
108 RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
109 : fNumEntries(numEntries), fChunkSize(chunkSize), fBlockSize(blockSize)
110 {
111 // size of full and leftover chunks
114
115 // size of full and leftover blocks in a full and leftover chunk
116 SizeOfFullBlockInFullChunk = blockSize;
120
121 // number of full, leftover and total chunks
122 FullChunks = numEntries / SizeOfFullChunk;
123 LeftoverChunks = SizeOfLeftoverChunk == 0 ? 0 : 1;
125
126 // number of full, leftover and total blocks in a full chunk
130
131 // number of full, leftover and total blocks in the leftover chunk
135
136 // total number of full and leftover blocks in the full chunks
139
140 // total number of full and leftover blocks in the leftover chunks
143
144 // vector of the different block sizes
147
148 // vector with the number of blocks of each type
151
152 // total number of blocks
153 NumberOfBlocks = std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), 0);
154 };
155
156 //////////////////////////////////////////////////////////////////////////
157 /// \brief Group the blocks by type (full or leftover), as determined by the size of each block.
159 {
160
161 std::vector<std::vector<std::pair<Long_t, Long_t>> *> TypesOfBlockIntervals = {
164
165 std::vector<std::size_t> IndexOfDifferentBlocks(NumberOfDifferentBlocks.size());
168
169 for (size_t i = 0; i < TypesOfBlockIntervals.size(); ++i) {
170 size_t start = IndexOfDifferentBlocks[i];
171 size_t end = IndexOfDifferentBlocks[i + 1];
172
173 TypesOfBlockIntervals[i]->insert(TypesOfBlockIntervals[i]->begin(), BlockIntervals.begin() + start,
174 BlockIntervals.begin() + end);
175 }
176 }
177
178 //////////////////////////////////////////////////////////////////////////
179 /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry.
181 {
182
183 ChunksIntervals.resize(Chunks);
184 for (size_t i = 0; i < FullChunks; i++) {
185
187 size_t end_FullBlock = FullBlocksPerFullChunk * (i + 1);
188
191
194 ChunksIntervals[i].insert(ChunksIntervals[i].end(),
197 }
198
199 for (size_t i = 0; i < LeftoverChunks; i++) {
200
201 size_t j = i + FullChunks;
203 size_t end_FullBlock = FullBlocksPerLeftoverChunk * (i + 1);
204
207
208 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
211 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
214 }
215 }
216
217 //////////////////////////////////////////////////////////////////////////
218 /// \brief Fills a vector with the size of every chunk from the dataset.
220 {
221
222 for (size_t i = 0; i < Chunks; i++) {
223 std::size_t chunkSize = 0;
224 for (size_t j = 0; j < ChunksIntervals[i].size(); j++) {
225 std::size_t start = ChunksIntervals[i][j].first;
226 std::size_t end = ChunksIntervals[i][j].second;
227
228 std::size_t intervalSize = end - start;
230 }
231
232 ChunksSizes.insert(ChunksSizes.end(), chunkSize);
233 }
234 }
235};
236} // namespace ROOT::Experimental::Internal::ML
237
238#endif // ROOT_INTERNAL_ML_RCHUNKCONSTRUCTOR
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
const_iterator begin() const
The logic for constructing chunks from a dataset.
void CreateChunksIntervals()
Creates chunks from the dataset consisting of blocks with the begin and end entry.
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInFullChunks
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInLeftoverChunks
std::vector< std::vector< std::pair< Long_t, Long_t > > > ChunksIntervals
void SizeOfChunks()
Fills a vector with the size of every chunk from the dataset.
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInFullChunks
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInLeftoverChunks
std::vector< std::pair< Long_t, Long_t > > BlockIntervals
void DistributeBlockIntervals()
Group the blocks by type (full or leftover), as determined by the size of each block.
RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)