Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RBatchLoader.hxx
Go to the documentation of this file.
1// Author: Dante Niewenhuis, VU Amsterdam 07/2023
2// Author: Kristupas Pranckietis, Vilnius University 05/2024
3// Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
4// Author: Vincenzo Eduardo Padulano, CERN 10/2024
5// Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
6
7/*************************************************************************
8 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#ifndef TMVA_RBATCHLOADER
16#define TMVA_RBATCHLOADER
17
18#include <vector>
19#include <memory>
20#include <numeric>
21
22// Imports for threading
23#include <queue>
24#include <mutex>
25#include <condition_variable>
26
28#include "TMVA/Tools.h"
29
31
32/**
33\class ROOT::TMVA::Experimental::Internal::RBatchLoader
34\ingroup tmva
35\brief Building and loading the batches from loaded chunks in RChunkLoader
36
37This class splits the chunks that are loaded into memory (see RChunkLoader) into the batches used in ML training
38and loads those batches into a queue. This is done for both the training and validation chunks separately.
39*/
40
42private:
43 std::size_t fBatchSize;
44 // needed for calculating the total number of batch columns when vector columns are present
45 std::vector<std::string> fCols;
46 std::vector<std::size_t> fVecSizes;
47 std::size_t fSumVecSizes;
48 std::size_t fNumColumns;
49 std::size_t fNumEntries;
51
52 std::size_t fNumFullBatches;
54 std::size_t fNumBatches;
55 std::size_t fLeftoverBatchSize;
56
57 bool fIsActive = false;
58
59 std::mutex fBatchLock;
60 std::condition_variable fBatchCondition;
61
62 // queue of flattened tensors (rows * cols)
63 std::queue<std::unique_ptr<RFlat2DMatrix>> fBatchQueue;
64
65 // current batch that is loaded into memory
66 std::unique_ptr<RFlat2DMatrix> fCurrentBatch;
67
68 // primary and secondary leftover batches used to create batches from a chunk
69 std::unique_ptr<RFlat2DMatrix> fPrimaryLeftoverBatch;
70 std::unique_ptr<RFlat2DMatrix> fSecondaryLeftoverBatch;
71
72public:
73 RBatchLoader(std::size_t batchSize, const std::vector<std::string> &cols, const std::vector<std::size_t> &vecSizes = {},
74 std::size_t numEntries = 0, bool dropRemainder = false)
75 : fBatchSize(batchSize),
76 fCols(cols),
78 fNumEntries(numEntries),
80 {
81
82 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
83 fNumColumns = fCols.size() + fSumVecSizes - fVecSizes.size();
84
85 if (fBatchSize == 0) {
87 }
88
91
93
94 if (fDropRemainder) {
96 }
97
98 else {
100 }
101
102 fPrimaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
103 fSecondaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
104
105 }
106
107public:
108 void Activate()
109 {
110 // Set the flag while holding the lock, release the lock, then wake every thread waiting on the condition.
111 std::unique_lock<std::mutex> guard(fBatchLock);
112 fIsActive = true;
113 guard.unlock();
114 fBatchCondition.notify_all();
115 }
116
117 /// \brief Deactivate the batch loader, so that no more batches are created.
118 /// Batches that are already loaded can still be returned.
120 {
121 {
122 std::lock_guard<std::mutex> lock(fBatchLock);
123 fIsActive = false;
124 }
125 fBatchCondition.notify_all();
126 }
127
128 /// \brief Copy one full batch out of a chunk and return it as a unique pointer.
129 /// After the batch has been processed, it should be destroyed.
130 /// \param[in] chunkTensor Tensor with the data from the chunk; must hold at least (idxs + 1) * fBatchSize * fNumColumns values (not checked here)
131 /// \param[in] idxs Index of batch in the chunk
132 /// \return Batch constructed as RFlat2DMatrix(fBatchSize, fNumColumns) and filled from the chunk
133 std::unique_ptr<RFlat2DMatrix> CreateBatch(RFlat2DMatrix &chunkTensor, std::size_t idxs)
134 {
135 const std::size_t batchLen = fBatchSize * fNumColumns; // number of values copied into one batch
136 auto batch = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
137 const auto first = chunkTensor.GetData() + (idxs * batchLen); // start of the idxs-th batch in the flat chunk data
138 std::copy(first, first + batchLen, batch->GetData());
139 return batch;
140 }
141
142 /// \brief Load the next batch from the queue
143 /// \return The batch at the front of the queue, or a default-constructed matrix if the queue is empty
145 {
146
147 if (fBatchQueue.empty()) {
148 fCurrentBatch = std::make_unique<RFlat2DMatrix>();
149 return *fCurrentBatch;
150 }
151
152 fCurrentBatch = std::move(fBatchQueue.front());
153 fBatchQueue.pop();
154
155 return *fCurrentBatch;
156 }
157
158 /// \brief Creating the batches from a chunk and add them to the queue.
159 /// \param[in] chunkTensor Tensor with the data from the chunk
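160 /// \param[in] lastbatch Pass 1 for the last chunk: the rows still held in the leftover buffer are then emitted as a final batch (unless the remainder is dropped) and the leftover buffers are reset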
161 void
163 {
164 std::size_t ChunkSize = chunkTensor.GetRows();
165 std::size_t NumCols = chunkTensor.GetCols();
166 std::size_t Batches = ChunkSize / fBatchSize;
167 std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
168
169 // create a vector of batches
170 std::vector<std::unique_ptr<RFlat2DMatrix>> batches;
171
172 // fill the full batches from the chunk into a vector
173 for (std::size_t i = 0; i < Batches; i++) {
174 // Fill a batch
175 batches.emplace_back(CreateBatch(chunkTensor, i));
176 }
177
178 // copy the remaining entries from the chunk into a leftover batch
180 std::copy(chunkTensor.GetData() + (Batches * fBatchSize * NumCols),
182 LeftoverBatch.GetData());
183
184 // calculate how many empty slots are left in fPrimaryLeftoverBatch
185 std::size_t PrimaryLeftoverSize = fPrimaryLeftoverBatch->GetRows();
187
188 // copy LeftoverBatch to end of fPrimaryLeftoverBatch
191 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
193
194 // copy LeftoverBatch to end of fPrimaryLeftoverBatch and add it to the batch vector
196 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
197 std::copy(fPrimaryLeftoverBatch->GetData(),
198 fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
199 batches.emplace_back(std::move(copy));
200
201 // reset fPrimaryLeftoverBatch and fSecondaryLeftoverBatch
203 fSecondaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
204 }
205 }
206
207 // copy LeftoverBatch to both fPrimaryLeftoverBatch and fSecondaryLeftoverBatch
208 else if (emptySlots < LeftoverBatchSize) {
209 // copy the first part of LeftoverBatch to the end of fPrimaryLeftoverBatch
211 std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * NumCols),
213
214 // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverBatch
216 std::copy(LeftoverBatch.GetData() + (emptySlots * NumCols),
218 fSecondaryLeftoverBatch->GetData());
219
220 // add fPrimaryLeftoverBatch to the batch vector
221 auto copy = std::make_unique<RFlat2DMatrix>(fBatchSize, fNumColumns);
222 std::copy(fPrimaryLeftoverBatch->GetData(),
223 fPrimaryLeftoverBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
224 batches.emplace_back(std::move(copy));
225
226 // exchange fPrimaryLeftoverBatch and fSecondaryLeftoverBatch
228
229 // reset fSecondaryLeftoverBatch
230 fSecondaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
231 }
232
233 // copy the content of fPrimaryLeftoverBatch to the leftover batch from the chunk
234 if (lastbatch == 1) {
235
236 if (fDropRemainder == false && fLeftoverBatchSize > 0) {
237 auto copy = std::make_unique<RFlat2DMatrix>(fLeftoverBatchSize, fNumColumns);
238 std::copy(fPrimaryLeftoverBatch->GetData(),
239 fPrimaryLeftoverBatch->GetData() + (fLeftoverBatchSize * fNumColumns), copy->GetData());
240 batches.emplace_back(std::move(copy));
241 }
242
243 fPrimaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
244 fSecondaryLeftoverBatch = std::make_unique<RFlat2DMatrix>();
245 }
246
247 // append the batches created from this chunk to the batch queue
248 for (std::size_t i = 0; i < batches.size(); i++) {
249 fBatchQueue.push(std::move(batches[i]));
250 }
251 }
252
253 std::size_t GetNumBatches() const { return fNumBatches; } ///< Current value of fNumBatches
254 std::size_t GetNumEntries() const { return fNumEntries; } ///< Value of fNumEntries (initialised from the constructor's numEntries)
255 std::size_t GetNumRemainderRows() const { return fLeftoverBatchSize; } ///< Value of fLeftoverBatchSize, the row count used for the final partial batch
256 std::size_t GetNumBatchQueue() const { return fBatchQueue.size(); } ///< Batches currently in the queue. NOTE(review): read without holding fBatchLock - confirm callers synchronize
257};
258
259} // namespace TMVA::Experimental::Internal
260
261#endif // TMVA_RBATCHLOADER
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
These classes encapsulate the necessary data for the computations.
std::unique_ptr< RFlat2DMatrix > fSecondaryLeftoverBatch
std::unique_ptr< RFlat2DMatrix > CreateBatch(RFlat2DMatrix &chunTensor, std::size_t idxs)
Return a batch of data as a unique pointer.
void CreateBatches(RFlat2DMatrix &chunkTensor, std::size_t lastbatch)
Creating the batches from a chunk and add them to the queue.
std::unique_ptr< RFlat2DMatrix > fCurrentBatch
std::queue< std::unique_ptr< RFlat2DMatrix > > fBatchQueue
RFlat2DMatrix GetBatch()
Loading the batch from the queue.
std::unique_ptr< RFlat2DMatrix > fPrimaryLeftoverBatch
void DeActivate()
DeActivate the batchloader.
RBatchLoader(std::size_t batchSize, const std::vector< std::string > &cols, const std::vector< std::size_t > &vecSizes={}, std::size_t numEntries=0, bool dropRemainder=false)
Wrapper around ROOT::RVec<float> representing a 2D matrix.