Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RChunkConstructor.hxx
Go to the documentation of this file.
1// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
2
3/*************************************************************************
4 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef TMVA_RCHUNKCONSTRUCTOR
12#define TMVA_RCHUNKCONSTRUCTOR
13
14#include <vector>
15
16#include "ROOT/RDataFrame.hxx"
17#include "ROOT/RDF/Utils.hxx"
18#include "ROOT/RVec.hxx"
19
20#include "ROOT/RLogger.hxx"
21
22namespace TMVA {
23namespace Experimental {
24namespace Internal {
25
26// clang-format off
27/**
28\class TMVA::Experimental::Internal::RChunkConstructor
29\ingroup tmva
30\brief The logic for constructing chunks from a dataset.
31
32This struct handles the logic for splitting a dataset into smaller subsets
33known as chunks, which are constructed from blocks.
34
35A chunk is the largest portion of the dataset loaded into memory at once,
36and each chunk is further divided into batches for machine learning training.
37
38The dataset is split into disjoint chunks based on a user-defined chunk size.
39There are two types of chunks:
40 - Full chunks: contain exactly the number of entries specified by the chunk size.
41 - Leftover chunk: contains any remaining entries that don't make up a full chunk.
42
43Each chunk is constructed from blocks based on a user-defined block size.
44There are two types of blocks:
45 - Full blocks: contain exactly the number of entries specified by the block size.
46 - Leftover block: contains any remaining entries that don't make up a full block.
47
48The blocks are defined by their start and end entries, which correspond to positions within the dataset’s total number of entries.
49*/
50
52 // clang-format on
53 std::size_t fNumEntries{};
54 std::size_t fChunkSize{};
55 std::size_t fBlockSize{};
56
57 // size of full and leftover chunks
58 std::size_t SizeOfFullChunk;
60
61 // size of full and leftover blocks in a full and leftover chunk
66
67 // number of full, leftover and total chunks
68 std::size_t FullChunks;
69 std::size_t LeftoverChunks;
70 std::size_t Chunks;
71
72 // number of full, leftover and total blocks in a full chunk
75 std::size_t BlockPerFullChunk;
76
77 // number of full, leftover and total blocks in the leftover chunk
81
82 // total number of full and leftover blocks in the full chunks
85
86 // total number of full and leftover blocks in the leftover chunks
89
90 // vector of the different block sizes
91 std::vector<std::size_t> SizeOfBlocks;
92
93 // vector with the number of blocks of each different type
94 std::vector<std::size_t> NumberOfDifferentBlocks;
95
96 // total number of blocks
97 std::size_t NumberOfBlocks;
98
99 // pair of start and end entries in the different block types
100 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
101
102 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks;
103 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInFullChunks;
104
105 std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInLeftoverChunks;
106 std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInLeftoverChunks;
107
108 std::vector<std::vector<std::pair<Long_t, Long_t>>> ChunksIntervals;
109
110 std::vector<std::size_t> ChunksSizes;
111
112 RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
113 : fNumEntries(numEntries), fChunkSize(chunkSize), fBlockSize(blockSize)
114 {
115 // size of full and leftover chunks
118
119 // size of full and leftover blocks in a full and leftover chunk
120 SizeOfFullBlockInFullChunk = blockSize;
124
125 // number of full, leftover and total chunks
126 FullChunks = numEntries / SizeOfFullChunk;
127 LeftoverChunks = SizeOfLeftoverChunk == 0 ? 0 : 1;
129
130 // number of full, leftover and total blocks in a full chunk
134
135 // number of full, leftover and total blocks in the leftover chunk
139
140 // total number of full and leftover blocks in the full chunks
143
144 // total number of full and leftover blocks in the leftover chunks
147
148 // vector of the different block sizes
151
152 // vector with the number of blocks of each different type
155
156 // total number of blocks
157 NumberOfBlocks = std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), 0);
158 };
159
160 //////////////////////////////////////////////////////////////////////////
161 /// \brief Group the blocks by type (full or leftover), as determined by the size of each block.
163 {
164
165 std::vector<std::vector<std::pair<Long_t, Long_t>> *> TypesOfBlockIntervals = {
168
169 std::vector<std::size_t> IndexOfDifferentBlocks(NumberOfDifferentBlocks.size());
172
173 for (size_t i = 0; i < TypesOfBlockIntervals.size(); ++i) {
174 size_t start = IndexOfDifferentBlocks[i];
175 size_t end = IndexOfDifferentBlocks[i + 1];
176
177 TypesOfBlockIntervals[i]->insert(TypesOfBlockIntervals[i]->begin(), BlockIntervals.begin() + start,
178 BlockIntervals.begin() + end);
179 }
180 }
181
182 //////////////////////////////////////////////////////////////////////////
183 /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry.
185 {
186
187 ChunksIntervals.resize(Chunks);
188 for (size_t i = 0; i < FullChunks; i++) {
189
191 size_t end_FullBlock = FullBlocksPerFullChunk * (i + 1);
192
195
198 ChunksIntervals[i].insert(ChunksIntervals[i].end(),
201 }
202
203 for (size_t i = 0; i < LeftoverChunks; i++) {
204
205 size_t j = i + FullChunks;
207 size_t end_FullBlock = FullBlocksPerLeftoverChunk * (i + 1);
208
211
212 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
215 ChunksIntervals[j].insert(ChunksIntervals[j].end(),
218 }
219 }
220
221 //////////////////////////////////////////////////////////////////////////
222 /// \brief Fills a vector with the size of every chunk from the dataset.
224 {
225
226 for (size_t i = 0; i < Chunks; i++) {
227 std::size_t chunkSize = 0;
228 for (size_t j = 0; j < ChunksIntervals[i].size(); j++) {
229 std::size_t start = ChunksIntervals[i][j].first;
230 std::size_t end = ChunksIntervals[i][j].second;
231
232 std::size_t intervalSize = end - start;
234 }
235
236 ChunksSizes.insert(ChunksSizes.end(), chunkSize);
237 }
238 }
239};
240} // namespace Internal
241} // namespace Experimental
242} // namespace TMVA
243
244#endif // TMVA_RCHUNKCONSTRUCTOR
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
const_iterator begin() const
create variable transformations
void CreateChunksIntervals()
Creates chunks from the dataset consisting of blocks with the begin and end entry.
std::vector< std::vector< std::pair< Long_t, Long_t > > > ChunksIntervals
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInLeftoverChunks
RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
std::vector< std::pair< Long_t, Long_t > > BlockIntervals
void DistributeBlockIntervals()
Group the blocks based on the block type (full or leftover) based on the size of the block.
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInLeftoverChunks
std::vector< std::pair< Long_t, Long_t > > LeftoverBlockIntervalsInFullChunks
std::vector< std::pair< Long_t, Long_t > > FullBlockIntervalsInFullChunks
void SizeOfChunks()
Fills a vector with the size of every chunk from the dataset.