Logo ROOT  
Reference Guide
 
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
Loading...
Searching...
No Matches
RDataSource.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDATASOURCE
12#define ROOT_RDATASOURCE
13
15#include <string_view>
16#include "RtypesCore.h" // ULong64_t
17#include "TString.h"
18
19#include <algorithm> // std::transform
20#include <cassert>
21#include <optional>
22#include <set>
23#include <string>
24#include <typeinfo>
25#include <unordered_map>
26#include <variant>
27#include <vector>
28#include <functional>
29
30// Need to fwd-declare TTreeReader for CreateColumnReader
31class TTreeReader;
32namespace ROOT::Detail::RDF {
33class RLoopManager;
34}
35
36namespace ROOT {
37namespace RDF {
38class RDataSource;
39class RSampleInfo;
40namespace Experimental {
41class RSample;
42}
43}
44}
45
46/// Print a RDataSource at the prompt
47namespace cling {
48std::string printValue(ROOT::RDF::RDataSource *ds);
49} // namespace cling
50
51namespace ROOT {
52
53namespace Internal {
54namespace TDS {
55
56/// Mother class of TTypedPointerHolder. The instances
57/// of this class can be put in a container. Upon destruction,
58/// the correct deletion of the pointer is performed in the
59/// derived class.
61protected:
62 void *fPointer{nullptr};
63
64public:
65 TPointerHolder(void *ptr) : fPointer(ptr) {}
66 void *GetPointer() { return fPointer; }
67 void *GetPointerAddr() { return &fPointer; }
69 virtual ~TPointerHolder(){};
70};
71
72/// Class to wrap a pointer and delete the memory associated to it
73/// correctly
74template <typename T>
76public:
77 TTypedPointerHolder(T *ptr) : TPointerHolder((void *)ptr) {}
78
80 {
81 const auto typedPtr = static_cast<T *>(fPointer);
82 return new TTypedPointerHolder(new T(*typedPtr));
83 }
84
85 ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); }
86};
87
88} // ns TDS
89
90namespace RDF {
91std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec);
92const std::vector<std::string> &GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds);
93const std::vector<std::string> &GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds);
98 const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &sampleMap);
101std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
102CreateColumnReader(ROOT::RDF::RDataSource &ds, unsigned int slot, std::string_view col, const std::type_info &tid,
104} // namespace RDF
105
106} // ns Internal
107
108namespace RDF {
109
110// clang-format off
111/**
112\class ROOT::RDF::RDataSource
113\ingroup dataframe
114\brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
115
116A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
117methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data formats.
118RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
119for selected columns and to advance the readers to the desired data entry.
120
121The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
122
123 - SetNSlots() : inform RDataSource of the desired level of parallelism
124 - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
125 - Initialize() : inform RDataSource that an event-loop is about to start
126 - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
127 - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
128 - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
129 - FinalizeSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
130 - Finalize() : inform RDataSource that an event-loop finished
131
132RDataSource implementations must support running multiple event-loops one after another (never concurrently) on the same dataset.
133 - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
134 - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
135 - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
136 - \b Initialize() and \b Finalize() are called once per event-loop, right before starting and right after finishing.
137 - \b InitSlot(), \b SetEntry(), and \b FinalizeSlot() can be called concurrently from multiple threads, multiple times per event-loop.
138
139 Advanced users that plan to implement a custom RDataSource can check out existing implementations, e.g. RCsvDS or RNTupleDS.
140 See the inheritance diagram below for the full list of existing concrete implementations.
141*/
143 // clang-format on
144protected:
145 using Record_t = std::vector<void *>;
146 friend std::string cling::printValue(::ROOT::RDF::RDataSource *);
147
148 virtual std::string AsString() { return "generic data source"; };
149
150 unsigned int fNSlots{};
151
152 std::optional<std::pair<ULong64_t, ULong64_t>> fGlobalEntryRange{};
153
154 friend std::string ROOT::Internal::RDF::GetTypeNameWithOpts(const RDataSource &, std::string_view, bool);
155 virtual std::string GetTypeNameWithOpts(std::string_view colName, bool) const { return GetTypeName(colName); }
156
157 friend const std::vector<std::string> &ROOT::Internal::RDF::GetTopLevelFieldNames(const ROOT::RDF::RDataSource &);
158 virtual const std::vector<std::string> &GetTopLevelFieldNames() const { return GetColumnNames(); }
159
160 friend const std::vector<std::string> &
162 virtual const std::vector<std::string> &GetColumnNamesNoDuplicates() const { return GetColumnNames(); }
163
164 friend void ROOT::Internal::RDF::CallInitializeWithOpts(ROOT::RDF::RDataSource &, const std::set<std::string> &);
165 virtual void InitializeWithOpts(const std::set<std::string> &) { Initialize(); }
166
168 virtual std::string DescribeDataset() { return "Dataframe from datasource " + GetLabel(); }
169
172 const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &);
174 CreateSampleInfo(const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &) const;
175
177 virtual void RunFinalChecks(bool) const {}
178
181
182 friend std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
184 const std::type_info &, TTreeReader *);
185 /**
186 * \brief Creates a column reader for the requested column
187 *
188 * In the general case, this is just a redirect to the right GetColumnReaders overload. The signature notably also
189 * has a TTreeReader * parameter. This is currently necessary to still allow the TTree-based MT scheduling via
190 * TTreeProcessorMT. We use the TTreeProcessorMT::Process method to launch the same kernel across all threads. In
191 * each thread task, TTreeProcessorMT creates a thread-local instance of a TTreeReader which is going to read the
192 * range of events assigned to that task. That TTreeReader instance is what is passed to this method whenever a
193 * column reader needs to be created in a thread task. In the future this method might be removed by either allowing
194 * to request a handle to the thread-local TTreeReader instance programmatically from the TTreeProcessorMT, or
195 * refactoring the TTreeProcessorMT scheduling into RTTreeDS altogether.
196 */
197 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
198 CreateColumnReader(unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *)
199 {
200 return GetColumnReaders(slot, col, tid);
201 }
202
203public:
204 RDataSource() = default;
205 // Rule of five
206 RDataSource(const RDataSource &) = delete;
210 virtual ~RDataSource() = default;
211
212 // clang-format off
213 /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
214 /// Slots numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
215 /// pass different slot values when calling methods concurrently.
216 // clang-format on
217 virtual void SetNSlots(unsigned int nSlots)
218 {
219 assert(fNSlots == 0);
220 assert(nSlots > 0);
221 fNSlots = nSlots;
222 };
223
224 /// \brief Returns the number of files from which the dataset is constructed
225 virtual std::size_t GetNFiles() const { return 0; }
226
227 // clang-format off
228 /// \brief Returns a reference to the collection of the dataset's column names
229 // clang-format on
230 virtual const std::vector<std::string> &GetColumnNames() const = 0;
231
232 /// \brief Checks if the dataset has a certain column
233 /// \param[in] colName The name of the column
234 virtual bool HasColumn(std::string_view colName) const = 0;
235
236 // clang-format off
237 /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
238 /// \param[in] colName The name of the column
239 // clang-format on
240 virtual std::string GetTypeName(std::string_view colName) const = 0;
241
242 // clang-format off
243 /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
244 /// \tparam T The type of the data stored in the column
245 /// \param[in] columnName The name of the column
246 ///
247 /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
248 /// the "right" memory region.
249 // clang-format on
250 template <typename T>
251 std::vector<T **> GetColumnReaders(std::string_view columnName)
252 {
254 std::vector<T **> typedVec(typeErasedVec.size());
255 std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
256 [](void *p) { return static_cast<T **>(p); });
257 return typedVec;
258 }
259
260 /// If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
261 /// \param[in] slot The data processing slot that needs to be considered
262 /// \param[in] name The name of the column for which a column reader needs to be returned
263 /// \param[in] tid A type_info
264 /// At least one of the two must return a non-empty/non-null value.
265 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
266 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &)
267 {
268 return {};
269 }
270
271 // clang-format off
272 /// \brief Return ranges of entries to distribute to tasks.
273 /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
274 /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
275 /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
276 /// The same entry range should not be returned more than once.
277 /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop.
278 // clang-format on
279 virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
280
281 // clang-format off
282 /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
283 /// \param[in] slot The data processing slot that needs to be considered
284 /// \param[in] entry The entry which needs to be pointed to by the reader pointers
285 /// Slots are adopted to accommodate parallel data processing.
286 /// Different workers will loop over different ranges and
287 /// will be labelled by different "slot" values.
288 /// Returns *true* if the entry has to be processed, *false* otherwise.
289 // clang-format on
290 virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
291
292 // clang-format off
293 /// \brief Convenience method called before starting an event-loop.
294 /// This method might be called multiple times over the lifetime of a RDataSource, since
295 /// users can run multiple event-loops with the same RDataFrame.
296 /// Ideally, `Initialize` should set the state of the RDataSource so that multiple identical event-loops
297 /// will produce identical results.
298 // clang-format on
299 virtual void Initialize() {}
300
301 // clang-format off
302 /// \brief Convenience method called at the start of the data processing associated to a slot.
303 /// \param[in] slot The data processing slot which needs to be initialized
304 /// \param[in] firstEntry The first entry of the range that the task will process.
305 /// This method might be called multiple times per thread per event-loop.
306 // clang-format on
307 virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
308
309 // clang-format off
310 /// \brief Convenience method called at the end of the data processing associated to a slot.
311 /// \param[in] slot The data processing slot which needs to be finalized
312 /// This method might be called multiple times per thread per event-loop.
313 // clang-format on
314 virtual void FinalizeSlot(unsigned int /*slot*/) {}
315
316 // clang-format off
317 /// \brief Convenience method called after concluding an event-loop.
318 /// See Initialize for more details.
319 // clang-format on
320 virtual void Finalize() {}
321
322 /// \brief Return a string representation of the datasource type.
323 /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
324 /// the datasource in the visualization of the computation graph.
325 /// Concrete datasources can override the default implementation.
326 virtual std::string GetLabel() { return "Custom Datasource"; }
327
328 /// \brief Restrict processing to a [begin, end) range of entries.
329 /// \param entryRange The range of entries to process.
330 virtual void SetGlobalEntryRange(std::pair<ULong64_t, ULong64_t> entryRange)
331 {
332 fGlobalEntryRange = std::move(entryRange);
333 };
334
335protected:
336 /// type-erased vector of pointers to pointers to column values - one per slot
337 virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
338};
339
340} // ns RDF
341
342} // ns ROOT
343
344/// Print a RDataSource at the prompt
345namespace cling {
346inline std::string printValue(ROOT::RDF::RDataSource *ds)
347{
348 return ds->AsString();
349}
350} // namespace cling
351
352#endif // ROOT_RDATASOURCE
unsigned long long ULong64_t
Definition RtypesCore.h:70
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
winID h TVirtualViewer3D TVirtualGLPainter p
char name[80]
Definition TGX11.cxx:110
The head node of a RDF computation graph.
Mother class of TTypedPointerHolder.
virtual TPointerHolder * GetDeepCopy()=0
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
RDataSource(RDataSource &&)=delete
RDataSource(const RDataSource &)=delete
RDataSource & operator=(const RDataSource &)=delete
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual void Finalize()
Convenience method called after concluding an event-loop.
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > CreateColumnReader(unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *)
Creates a column reader for the requested column.
virtual std::string DescribeDataset()
virtual const std::vector< std::string > & GetColumnNamesNoDuplicates() const
virtual void FinalizeSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot.
virtual ~RDataSource()=default
virtual std::string AsString()
friend ROOT::RDF::RSampleInfo ROOT::Internal::RDF::CreateSampleInfo(const ROOT::RDF::RDataSource &, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &)
virtual bool SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
virtual std::string GetTypeNameWithOpts(std::string_view colName, bool) const
virtual void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
std::vector< void * > Record_t
virtual std::string GetLabel()
Return a string representation of the datasource type.
virtual std::size_t GetNFiles() const
Returns the number of files from which the dataset is constructed.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
virtual const std::vector< std::string > & GetTopLevelFieldNames() const
virtual void InitializeWithOpts(const std::set< std::string > &)
friend void ROOT::Internal::RDF::ProcessMT(RDataSource &, ROOT::Detail::RDF::RLoopManager &)
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
RDataSource & operator=(RDataSource &&)=delete
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &)
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
virtual void SetGlobalEntryRange(std::pair< ULong64_t, ULong64_t > entryRange)
Restrict processing to a [begin, end) range of entries.
virtual void Initialize()
Convenience method called before starting an event-loop.
std::optional< std::pair< ULong64_t, ULong64_t > > fGlobalEntryRange
virtual void RunFinalChecks(bool) const
This type represents a sample identifier, to be used in conjunction with RDataFrame features such as ...
const_iterator begin() const
const_iterator end() const
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
void RunFinalChecks(const ROOT::RDF::RDataSource &ds, bool nodesLeftNotRun)
Definition RDFUtils.cxx:581
void CallInitializeWithOpts(ROOT::RDF::RDataSource &ds, const std::set< std::string > &suppressErrorsForMissingColumns)
Definition RDFUtils.cxx:563
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:553
std::string DescribeDataset(ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:569
ROOT::RDF::RSampleInfo CreateSampleInfo(const ROOT::RDF::RDataSource &ds, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &sampleMap)
Definition RDFUtils.cxx:574
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > CreateColumnReader(ROOT::RDF::RDataSource &ds, unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *treeReader)
Definition RDFUtils.cxx:592
void ProcessMT(ROOT::RDF::RDataSource &ds, ROOT::Detail::RDF::RLoopManager &lm)
Definition RDFUtils.cxx:586
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:548
const std::vector< std::string > & GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:558
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...