Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDataSource.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDATASOURCE
12#define ROOT_RDATASOURCE
13
15#include <string_view>
16#include "RtypesCore.h" // ULong64_t
17#include "TString.h"
18
19#include <algorithm> // std::transform
20#include <cassert>
21#include <optional>
22#include <set>
23#include <string>
24#include <typeinfo>
25#include <unordered_map>
26#include <variant>
27#include <vector>
28#include <functional>
29
30// Need to fwd-declare TTreeReader for CreateColumnReader
31class TTreeReader;
32namespace ROOT::Detail::RDF {
33class RLoopManager;
34}
35
36namespace ROOT {
37namespace RDF {
38class RDataSource;
39class RSampleInfo;
40namespace Experimental {
41class RSample;
42}
43}
44}
45
46/// Print a RDataSource at the prompt
47namespace cling {
48std::string printValue(ROOT::RDF::RDataSource *ds);
49} // namespace cling
50
51namespace ROOT {
52
53namespace Internal {
54namespace RDF {
55
56/// Mother class of TTypedPointerHolder. The instances
57/// of this class can be put in a container. Upon destruction,
58/// the correct deletion of the pointer is performed in the
59/// derived class.
61protected:
62 void *fPointer{nullptr};
63
64public:
65 TPointerHolder(void *ptr) : fPointer(ptr) {}
66 void *GetPointer() { return fPointer; }
67 void *GetPointerAddr() { return &fPointer; }
69 virtual ~TPointerHolder(){};
70};
71
72/// Class to wrap a pointer and delete the memory associated to it
73/// correctly
74template <typename T>
76public:
77 TTypedPointerHolder(T *ptr) : TPointerHolder((void *)ptr) {}
78
80 {
81 const auto typedPtr = static_cast<T *>(fPointer);
82 return new TTypedPointerHolder(new T(*typedPtr));
83 }
84
85 ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); }
86};
87
88std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec);
89const std::vector<std::string> &GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds);
90const std::vector<std::string> &GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds);
95 const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &sampleMap);
98std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
99CreateColumnReader(ROOT::RDF::RDataSource &ds, unsigned int slot, std::string_view col, const std::type_info &tid,
101} // namespace RDF
102
103} // ns Internal
104
105namespace RDF {
106
107// clang-format off
108/**
109\class ROOT::RDF::RDataSource
110\ingroup dataframe
111\brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
112
113A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
114methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data format.
115RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
116for selected columns and to advance the readers to the desired data entry.
117
118The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
119
120 - SetNSlots() : inform RDataSource of the desired level of parallelism
121 - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
122 - Initialize() : inform RDataSource that an event-loop is about to start
123 - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
124 - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
125 - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
126 - FinalizeSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
127 - Finalize() : inform RDataSource that an event-loop finished
128
129RDataSource implementations must support running multiple event-loops one after another (never concurrently) on the same dataset.
130 - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
131 - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
132 - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
133 - \b Initialize() and \b Finalize() are called once per event-loop, right before starting and right after finishing.
134 - \b InitSlot(), \b SetEntry(), and \b FinalizeSlot() can be called concurrently from multiple threads, multiple times per event-loop.
135
136 Advanced users that plan to implement a custom RDataSource can check out existing implementations, e.g. RCsvDS or RNTupleDS.
137 See the inheritance diagram below for the full list of existing concrete implementations.
138*/
140 // clang-format on
141protected:
142 using Record_t = std::vector<void *>;
143 friend std::string cling::printValue(::ROOT::RDF::RDataSource *);
144
145 virtual std::string AsString() { return "generic data source"; };
146
147 unsigned int fNSlots{};
148
149 std::optional<std::pair<ULong64_t, ULong64_t>> fGlobalEntryRange{};
150
151 friend std::string ROOT::Internal::RDF::GetTypeNameWithOpts(const RDataSource &, std::string_view, bool);
152 virtual std::string GetTypeNameWithOpts(std::string_view colName, bool) const { return GetTypeName(colName); }
153
154 friend const std::vector<std::string> &ROOT::Internal::RDF::GetTopLevelFieldNames(const ROOT::RDF::RDataSource &);
155 virtual const std::vector<std::string> &GetTopLevelFieldNames() const { return GetColumnNames(); }
156
157 friend const std::vector<std::string> &
159 virtual const std::vector<std::string> &GetColumnNamesNoDuplicates() const { return GetColumnNames(); }
160
161 friend void ROOT::Internal::RDF::CallInitializeWithOpts(ROOT::RDF::RDataSource &, const std::set<std::string> &);
162 virtual void InitializeWithOpts(const std::set<std::string> &) { Initialize(); }
163
165 virtual std::string DescribeDataset() { return "Dataframe from datasource " + GetLabel(); }
166
169 const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &);
171 CreateSampleInfo(unsigned int, const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &) const;
172
174 virtual void RunFinalChecks(bool) const {}
175
178
179 friend std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
181 const std::type_info &, TTreeReader *);
182 /**
183 * \brief Creates a column reader for the requested column
184 *
185 * In the general case, this is just a redirect to the right GetColumnReaders overload. The signature notably also
186 * has a TTreeReader * parameter. This is currently necessary to still allow the TTree-based MT scheduling via
187 * TTreeProcessorMT. We use the TTreeProcessorMT::Process method to launch the same kernel across all threads. In
188 * each thread task, TTreeProcessorMT creates a thread-local instance of a TTreeReader which is going to read the
189 * range of events assigned to that task. That TTreeReader instance is what is passed to this method whenever a
190 * column reader needs to be created in a thread task. In the future this method might be removed by either allowing
191 * to request a handle to the thread-local TTreeReader instance programmatically from the TTreeProcessorMT, or
192 * refactoring the TTreeProcessorMT scheduling into RTTreeDS altogether.
193 */
194 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
195 CreateColumnReader(unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *)
196 {
197 return GetColumnReaders(slot, col, tid);
198 }
199
200public:
201 RDataSource() = default;
202 // Rule of five
203 RDataSource(const RDataSource &) = delete;
207 virtual ~RDataSource() = default;
208
209 // clang-format off
210 /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
211 /// Slot numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
212 /// pass different slot values when calling methods concurrently.
213 // clang-format on
214 virtual void SetNSlots(unsigned int nSlots)
215 {
216 assert(fNSlots == 0);
217 assert(nSlots > 0);
218 fNSlots = nSlots;
219 };
220
221 /// \brief Returns the number of files from which the dataset is constructed
222 virtual std::size_t GetNFiles() const { return 0; }
223
224 // clang-format off
225 /// \brief Returns a reference to the collection of the dataset's column names
226 // clang-format on
227 virtual const std::vector<std::string> &GetColumnNames() const = 0;
228
229 /// \brief Checks if the dataset has a certain column
230 /// \param[in] colName The name of the column
231 virtual bool HasColumn(std::string_view colName) const = 0;
232
233 // clang-format off
234 /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
235 /// \param[in] colName The name of the column
236 // clang-format on
237 virtual std::string GetTypeName(std::string_view colName) const = 0;
238
239 // clang-format off
240 /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
241 /// \tparam T The type of the data stored in the column
242 /// \param[in] columnName The name of the column
243 ///
244 /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
245 /// the "right" memory region.
246 // clang-format on
247 template <typename T>
248 std::vector<T **> GetColumnReaders(std::string_view columnName)
249 {
251 std::vector<T **> typedVec(typeErasedVec.size());
252 std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
253 [](void *p) { return static_cast<T **>(p); });
254 return typedVec;
255 }
256
257 /// If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
258 /// \param[in] slot The data processing slot that needs to be considered
259 /// \param[in] name The name of the column for which a column reader needs to be returned
260 /// \param[in] tid A type_info
261 /// At least one of the two must return a non-empty/non-null value.
262 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
263 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &)
264 {
265 return {};
266 }
267
268 // clang-format off
269 /// \brief Return ranges of entries to distribute to tasks.
270 /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
271 /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
272 /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
273 /// The same entry range should not be returned more than once.
274 /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop.
275 // clang-format on
276 virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
277
278 // clang-format off
279 /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
280 /// \param[in] slot The data processing slot that needs to be considered
281 /// \param[in] entry The entry which needs to be pointed to by the reader pointers
282 /// Slots are adopted to accommodate parallel data processing.
283 /// Different workers will loop over different ranges and
284 /// will be labelled by different "slot" values.
285 /// Returns *true* if the entry has to be processed, *false* otherwise.
286 // clang-format on
287 virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
288
289 // clang-format off
290 /// \brief Convenience method called before starting an event-loop.
291 /// This method might be called multiple times over the lifetime of a RDataSource, since
292 /// users can run multiple event-loops with the same RDataFrame.
293 /// Ideally, `Initialize` should set the state of the RDataSource so that multiple identical event-loops
294 /// will produce identical results.
295 // clang-format on
296 virtual void Initialize() {}
297
298 // clang-format off
299 /// \brief Convenience method called at the start of the data processing associated to a slot.
300 /// \param[in] slot The data processing slot which needs to be initialized
301 /// \param[in] firstEntry The first entry of the range that the task will process.
302 /// This method might be called multiple times per thread per event-loop.
303 // clang-format on
304 virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
305
306 // clang-format off
307 /// \brief Convenience method called at the end of the data processing associated to a slot.
308 /// \param[in] slot The data processing slot which needs to be finalized
309 /// This method might be called multiple times per thread per event-loop.
310 // clang-format on
311 virtual void FinalizeSlot(unsigned int /*slot*/) {}
312
313 // clang-format off
314 /// \brief Convenience method called after concluding an event-loop.
315 /// See Initialize for more details.
316 // clang-format on
317 virtual void Finalize() {}
318
319 /// \brief Return a string representation of the datasource type.
320 /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
321 /// the datasource in the visualization of the computation graph.
322 /// Concrete datasources can override the default implementation.
323 virtual std::string GetLabel() { return "Custom Datasource"; }
324
325 /// \brief Restrict processing to a [begin, end) range of entries.
326 /// \param entryRange The range of entries to process.
327 virtual void SetGlobalEntryRange(std::pair<ULong64_t, ULong64_t> entryRange)
328 {
329 fGlobalEntryRange = std::move(entryRange);
330 };
331
332protected:
333 /// type-erased vector of pointers to pointers to column values - one per slot
334 virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
335};
336
337} // ns RDF
338
339} // ns ROOT
340
341/// Print a RDataSource at the prompt
342namespace cling {
343inline std::string printValue(ROOT::RDF::RDataSource *ds)
344{
345 return ds->AsString();
346}
347} // namespace cling
348
349#endif // ROOT_RDATASOURCE
Basic types used by ROOT and required by TInterpreter.
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
winID h TVirtualViewer3D TVirtualGLPainter p
char name[80]
Definition TGX11.cxx:110
The head node of a RDF computation graph.
Mother class of TTypedPointerHolder.
virtual TPointerHolder * GetDeepCopy()=0
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
RDataSource(RDataSource &&)=delete
RDataSource(const RDataSource &)=delete
RDataSource & operator=(const RDataSource &)=delete
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual void Finalize()
Convenience method called after concluding an event-loop.
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > CreateColumnReader(unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *)
Creates a column reader for the requested column.
virtual std::string DescribeDataset()
virtual const std::vector< std::string > & GetColumnNamesNoDuplicates() const
friend ROOT::RDF::RSampleInfo ROOT::Internal::RDF::CreateSampleInfo(const ROOT::RDF::RDataSource &, unsigned int, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &)
virtual void FinalizeSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot.
virtual ~RDataSource()=default
virtual std::string AsString()
virtual bool SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
virtual std::string GetTypeNameWithOpts(std::string_view colName, bool) const
virtual void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
std::vector< void * > Record_t
virtual std::string GetLabel()
Return a string representation of the datasource type.
virtual std::size_t GetNFiles() const
Returns the number of files from which the dataset is constructed.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
virtual const std::vector< std::string > & GetTopLevelFieldNames() const
virtual void InitializeWithOpts(const std::set< std::string > &)
friend void ROOT::Internal::RDF::ProcessMT(RDataSource &, ROOT::Detail::RDF::RLoopManager &)
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
RDataSource & operator=(RDataSource &&)=delete
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &)
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
virtual void SetGlobalEntryRange(std::pair< ULong64_t, ULong64_t > entryRange)
Restrict processing to a [begin, end) range of entries.
virtual void Initialize()
Convenience method called before starting an event-loop.
std::optional< std::pair< ULong64_t, ULong64_t > > fGlobalEntryRange
virtual void RunFinalChecks(bool) const
This type represents a sample identifier, to be used in conjunction with RDataFrame features such as ...
const_iterator begin() const
const_iterator end() const
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
void RunFinalChecks(const ROOT::RDF::RDataSource &ds, bool nodesLeftNotRun)
Definition RDFUtils.cxx:660
ROOT::RDF::RSampleInfo CreateSampleInfo(const ROOT::RDF::RDataSource &ds, unsigned int slot, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &sampleMap)
Definition RDFUtils.cxx:653
void CallInitializeWithOpts(ROOT::RDF::RDataSource &ds, const std::set< std::string > &suppressErrorsForMissingColumns)
Definition RDFUtils.cxx:642
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:632
std::string DescribeDataset(ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:648
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > CreateColumnReader(ROOT::RDF::RDataSource &ds, unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *treeReader)
Definition RDFUtils.cxx:671
void ProcessMT(ROOT::RDF::RDataSource &ds, ROOT::Detail::RDF::RLoopManager &lm)
Definition RDFUtils.cxx:665
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:627
const std::vector< std::string > & GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:637
Namespace for new ROOT classes and functions.