Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RBatchLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef TMVA_RBATCHLOADER
16#define TMVA_RBATCHLOADER
17
18#include <vector>
19#include <memory>
20#include <numeric>
21
22// Imports for threading
23#include <queue>
24#include <mutex>
25#include <condition_variable>
26
28#include "TMVA/Tools.h"
29
31
32/**
33\class ROOT::TMVA::Experimental::Internal::RBatchLoader
34\ingroup tmva
35\brief Building and loading the batches from loaded chunks in RChunkLoader
36
37The chunks that have been loaded into memory (see RChunkLoader) are split into fixed-size batches for ML training,
38and the batches are pushed onto a queue. Training and validation chunks are handled separately, each with its own queue.
39*/
40
42private:
43 std::size_t fBatchSize;
44 std::size_t fNumColumns;
45
46 bool fIsActive = false;
47
48 std::mutex fBatchLock;
49 std::condition_variable fBatchCondition;
50
51 // queues of flattened tensors (rows * cols)
52 std::queue<std::unique_ptr<RFlat2DMatrix>> fTrainingBatchQueue;
53 std::queue<std::unique_ptr<RFlat2DMatrix>> fValidationBatchQueue;
54
55 // number of training and validation batches in the queue
58
59 // current batch that is loaded into memory
60 std::unique_ptr<RFlat2DMatrix> fCurrentBatch;
61
62 // primary and secondary leftover batches used to create batches from a chunk
63 std::unique_ptr<RFlat2DMatrix> fPrimaryLeftoverTrainingBatch;
64 std::unique_ptr<RFlat2DMatrix> fSecondaryLeftoverTrainingBatch;
65
66 std::unique_ptr<RFlat2DMatrix> fPrimaryLeftoverValidationBatch;
67 std::unique_ptr<RFlat2DMatrix> fSecondaryLeftoverValidationBatch;
68
69public:
70 RBatchLoader(std::size_t batchSize, std::size_t numColumns) : fBatchSize(batchSize), fNumColumns(numColumns)
71 {
72
73 fPrimaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
74 fSecondaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
75
76 fPrimaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
77 fSecondaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
78
81 }
82
83public:
84 void Activate()
85 {
86 {
87 std::lock_guard<std::mutex> lock(fBatchLock);
88 fIsActive = true;
89 }
90 fBatchCondition.notify_all();
91 }
92
93 /// \brief DeActivate the batchloader. This means that no more batches are created.
94 /// Batches can still be returned if they are already loaded
96 {
97 {
98 std::lock_guard<std::mutex> lock(fBatchLock);
99 fIsActive = false;
100 }
101 fBatchCondition.notify_all();
102 }
103
104 /// \brief Return a batch of data as a unique pointer.
105 /// After the batch has been processed, it should be destroyed.
106 /// \param[in] chunkTensor RTensor with the data from the chunk
107 /// \param[in] idxs Index of batch in the chunk
108 /// \return Training batch
109 std::unique_ptr<RFlat2DMatrix> CreateBatch(RFlat2DMatrix &chunTensor, std::size_t idxs)
110 {
111 auto batch = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
112 std::copy(chunTensor.GetData() + (idxs * fBatchSize * fNumColumns),
113 chunTensor.GetData() + ((idxs + 1) * fBatchSize * fNumColumns), batch->GetData());
114
115 return batch;
116 }
117
118 /// \brief Loading the training batch from the queue
119 /// \return Training batch
121 {
122
123 if (fTrainingBatchQueue.empty()) {
124 fCurrentBatch = std::make_unique<RFlat2DMatrix>();
125 return *fCurrentBatch;
126 }
127
128 fCurrentBatch = std::move(fTrainingBatchQueue.front());
130
131 return *fCurrentBatch;
132 }
133
134 /// \brief Loading the validation batch from the queue
135 /// \return Validation batch
137 {
138
139 if (fValidationBatchQueue.empty()) {
140 fCurrentBatch = std::make_unique<RFlat2DMatrix>();
141 return *fCurrentBatch;
142 }
143
144 fCurrentBatch = std::move(fValidationBatchQueue.front());
146
147 return *fCurrentBatch;
148 }
149
150 /// \brief Creating the training batches from a chunk and add them to the queue.
151 /// \param[in] chunkTensor RFlat2DMatrix with the data from the chunk
152 /// \param[in] lastbatch Equal to 1 when the chunk is the last one; the leftover batch is then emitted (unless dropped) and the leftover buffers are reset
153 /// \param[in] leftoverBatchSize Size of the leftover batch in the training dataset
154 /// \param[in] dropRemainder Bool to drop the remainder batch or not
155 void
157 {
158 std::size_t ChunkSize = chunkTensor.GetRows();
159 std::size_t Batches = ChunkSize / fBatchSize;
160 std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
161
162 // create a vector of batches
163 std::vector<std::unique_ptr<RFlat2DMatrix>> batches;
164
165 // fill the full batches from the chunk into a vector
166 for (std::size_t i = 0; i < Batches; i++) {
167 // Fill a batch
168 batches.emplace_back(CreateBatch(chunkTensor, i));
169 }
170
171 // copy the remaining entries from the chunk into a leftover batch
173 std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
175 LeftoverBatch.GetData());
176
177 // calculate how many empty slots are left in fPrimaryLeftoverTrainingBatch
178 std::size_t PrimaryLeftoverSize = fPrimaryLeftoverTrainingBatch->GetRows();
180
181 // copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
184 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
186
187 // copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch and add it to the batch vector
189 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
190 std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
191 fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
192 batches.emplace_back(std::move(copy));
193
194 // reset fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
196 fSecondaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
197 }
198 }
199
200 // copy LeftoverBatch to both fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
201 else if (emptySlots < LeftoverBatchSize) {
202 // copy the first part of LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
204 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
206
207 // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverTrainingBatch
209 std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
212
213 // add fPrimaryLeftoverTrainingBatch to the batch vector
214 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
215 std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
216 fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
217 batches.emplace_back(std::move(copy));
218
219 // exchange fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
221
222 // reset fSecondaryLeftoverTrainingBatch
223 fSecondaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
224 }
225
226 // copy the content of fPrimaryLeftoverTrainingBatch to the leftover batch from the chunk
227 if (lastbatch == 1) {
228
229 if (dropRemainder == false && leftoverBatchSize > 0) {
230 auto copy = std::make_unique<RFlat2DMatrix>(leftoverBatchSize, fNumColumns);
231 std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
232 fPrimaryLeftoverTrainingBatch->GetData() + (leftoverBatchSize * fNumColumns), copy->GetData());
233 batches.emplace_back(std::move(copy));
234 }
235
236 fPrimaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
237 fSecondaryLeftoverTrainingBatch = std::make_unique<RFlat2DMatrix>();
238 }
239
240 // append the batches from the batch vector from the chunk to the training batch queue
241 for (std::size_t i = 0; i < batches.size(); i++) {
242 fTrainingBatchQueue.push(std::move(batches[i]));
243 }
244 }
245
246 /// \brief Creating the validation batches from a chunk and adding them to the queue
247 /// \param[in] chunkTensor RFlat2DMatrix with the data from the chunk
248 /// \param[in] lastbatch Equal to 1 when the chunk is the last one; the leftover batch is then emitted (unless dropped) and the leftover buffers are reset
249 /// \param[in] leftoverBatchSize Size of the leftover batch in the validation dataset
250 /// \param[in] dropRemainder Bool to drop the remainder batch or not
252 bool dropRemainder)
253 {
254 std::size_t ChunkSize = chunkTensor.GetRows();
255 std::size_t Batches = ChunkSize / fBatchSize;
256 std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
257
258 std::vector<std::unique_ptr<RFlat2DMatrix>> batches;
259
260 for (std::size_t i = 0; i < Batches; i++) {
261 // Fill a batch
262 batches.emplace_back(CreateBatch(chunkTensor, i));
263 }
264
266 std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
268 LeftoverBatch.GetData());
269
272
275 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
277
279 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
280 std::copy(fPrimaryLeftoverValidationBatch->GetData(),
281 fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
282 batches.emplace_back(std::move(copy));
284 fSecondaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
285 }
286 }
287
288 else if (emptySlots < LeftoverBatchSize) {
290 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
293 std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
296 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
297 std::copy(fPrimaryLeftoverValidationBatch->GetData(),
298 fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
299 batches.emplace_back(std::move(copy));
301 fSecondaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
302 }
303
304 if (lastbatch == 1) {
305
306 if (dropRemainder == false && leftoverBatchSize > 0) {
307 auto copy = std::make_unique<RFlat2DMatrix>(leftoverBatchSize, fNumColumns);
308 std::copy(fPrimaryLeftoverValidationBatch->GetData(),
309 fPrimaryLeftoverValidationBatch->GetData() + (leftoverBatchSize * fNumColumns), copy->GetData());
310 batches.emplace_back(std::move(copy));
311 }
312 fPrimaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
313 fSecondaryLeftoverValidationBatch = std::make_unique<RFlat2DMatrix>();
314 }
315
316 for (std::size_t i = 0; i < batches.size(); i++) {
317 fValidationBatchQueue.push(std::move(batches[i]));
318 }
319 }
320
321 std::size_t GetNumTrainingBatchQueue() { return fTrainingBatchQueue.size(); }
322 std::size_t GetNumValidationBatchQueue() { return fValidationBatchQueue.size(); }
323};
324
325} // namespace TMVA::Experimental::Internal
326
327#endif // TMVA_RBATCHLOADER
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
These classes encapsulate the necessary data for the computations.
std::queue< std::unique_ptr< RFlat2DMatrix > > fTrainingBatchQueue
std::unique_ptr< RFlat2DMatrix > CreateBatch(RFlat2DMatrix &chunTensor, std::size_t idxs)
Return a batch of data as a unique pointer.
RBatchLoader(std::size_t batchSize, std::size_t numColumns)
RFlat2DMatrix GetTrainBatch()
Loading the training batch from the queue.
std::unique_ptr< RFlat2DMatrix > fPrimaryLeftoverValidationBatch
void CreateTrainingBatches(RFlat2DMatrix &chunkTensor, int lastbatch, std::size_t leftoverBatchSize, bool dropRemainder)
Creating the training batches from a chunk and add them to the queue.
void CreateValidationBatches(RFlat2DMatrix &chunkTensor, std::size_t lastbatch, std::size_t leftoverBatchSize, bool dropRemainder)
Creating the validation batches from a chunk and adding them to the queue.
std::unique_ptr< RFlat2DMatrix > fPrimaryLeftoverTrainingBatch
std::unique_ptr< RFlat2DMatrix > fCurrentBatch
std::unique_ptr< RFlat2DMatrix > fSecondaryLeftoverTrainingBatch
std::queue< std::unique_ptr< RFlat2DMatrix > > fValidationBatchQueue
RFlat2DMatrix GetValidationBatch()
Loading the validation batch from the queue.
std::unique_ptr< RFlat2DMatrix > fSecondaryLeftoverValidationBatch
void DeActivate()
DeActivate the batchloader.
Wrapper around ROOT::RVec<float> representing a 2D matrix.