Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDataSource.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDATASOURCE
12#define ROOT_RDATASOURCE
13
15#include "ROOT/RStringView.hxx"
16#include "RtypesCore.h" // ULong64_t
17#include "TString.h"
18
19#include <algorithm> // std::transform
20#include <string>
21#include <typeinfo>
22#include <vector>
23
24namespace ROOT {
25namespace RDF {
26class RDataSource;
27}
28}
29
30/// Print a RDataSource at the prompt
31namespace cling {
32std::string printValue(ROOT::RDF::RDataSource *ds);
33} // namespace cling
34
35namespace ROOT {
36
37namespace Internal {
38namespace TDS {
39
40/// Mother class of TTypedPointerHolder. The instances
41/// of this class can be put in a container. Upon destruction,
42/// the correct deletion of the pointer is performed in the
43/// derived class: this base only stores the type-erased pointer and its own destructor does not free it.
45protected:
46 void *fPointer{nullptr}; ///< Type-erased pointer; null by default
47
48public:
49 TPointerHolder(void *ptr) : fPointer(ptr) {} // stores ptr as-is, the pointee is not copied
50 void *GetPointer() { return fPointer; } // the stored pointer itself
51 void *GetPointerAddr() { return &fPointer; } // address of the fPointer member (a void** handed out as void*)
53 virtual ~TPointerHolder(){}; // empty body: fPointer is not deleted here
54};
55
56/// Class to wrap a pointer and delete the memory associated to it
57/// correctly: the held void pointer is cast back to `T *` before being copied or deleted.
58template <typename T>
59class TTypedPointerHolder final : public TPointerHolder {
60public:
62
64 {
65 const auto typedPtr = static_cast<T *>(fPointer); // recover the static type of the pointee
66 return new TTypedPointerHolder(new T(*typedPtr)); // copy-constructs the pointee (fPointer is dereferenced unchecked) and wraps it in a new heap-allocated holder
67 }
68
69 ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); } // delete through T* so that T's destructor runs
70};
71
72} // ns TDS
73} // ns Internal
74
75namespace RDF {
76
77// clang-format off
78/**
79\class ROOT::RDF::RDataSource
80\ingroup dataframe
81\brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
82
83A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
84methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data format.
85RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
86for selected columns and to advance the readers to the desired data entry.
87
88The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
89
90 - SetNSlots() : inform RDataSource of the desired level of parallelism
91 - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
92 - Initialise() : inform RDataSource that an event-loop is about to start
93 - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
94 - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
95 - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
96 - FinaliseSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
97 - Finalise() : inform RDataSource that an event-loop finished
98
99RDataSource implementations must support running multiple event-loops consecutively (although sequentially) on the same dataset.
100 - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
101 - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
102 - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
103 - \b Initialise() and \b Finalise() are called once per event-loop, right before starting and right after finishing.
104 - \b InitSlot(), \b SetEntry(), and \b FinaliseSlot() can be called concurrently from multiple threads, multiple times per event-loop.
105*/
107 // clang-format on
108protected:
109 using Record_t = std::vector<void *>; // type-erased collection of pointers, returned by GetColumnReadersImpl()
110 friend std::string cling::printValue(::ROOT::RDF::RDataSource *); // lets cling::printValue call the protected AsString()
111
112 virtual std::string AsString() { return "generic data source"; }; // default textual description; virtual, so derived classes can override it
113
114public:
115 virtual ~RDataSource() = default; // virtual: deleting through an RDataSource pointer runs the derived destructor
116
117 // clang-format off
118 /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
119 /// Slot numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
120 /// pass different slot values when calling methods concurrently. Pure virtual: every concrete data source must implement it.
121 // clang-format on
122 virtual void SetNSlots(unsigned int nSlots) = 0;
123
124 // clang-format off
125 /// \brief Returns a reference to the collection of the dataset's column names. The collection is returned by const reference, so it must outlive the call. Pure virtual.
126 // clang-format on
127 virtual const std::vector<std::string> &GetColumnNames() const = 0;
128
129 /// \brief Checks if the dataset has a certain column. Pure virtual.
130 /// \param[in] colName The name of the column to look for
131 virtual bool HasColumn(std::string_view colName) const = 0;
132
133 // clang-format off
134 /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`. Pure virtual.
135 /// \param[in] colName The name of the column whose type name is requested
136 // clang-format on
137 virtual std::string GetTypeName(std::string_view colName) const = 0;
138
139 // clang-format off
140 /// Called at most once per column by RDF. Return vector of pointers to pointers to column values - one per slot.
141 /// \tparam T The type of the data stored in the column
142 /// \param[in] columnName The name of the column
143 ///
144 /// These pointers are veritable cursors: it's a responsibility of the RDataSource implementation that they point to
145 /// the "right" memory region. This function only forwards to GetColumnReadersImpl() and casts each returned `void *` to `T **` without any type check. NOTE(review): the class description states that GetColumnReaders() can be called several times with the same arguments; confirm which of the two statements is current.
146 // clang-format on
147 template <typename T>
148 std::vector<T **> GetColumnReaders(std::string_view columnName)
149 {
150 auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T)); // typeid(T) tells the implementation which type was requested
151 std::vector<T **> typedVec(typeErasedVec.size()); // pre-sized so std::transform can write through begin()
152 std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
153 [](void *p) { return static_cast<T **>(p); });
154 return typedVec;
155 }
156
157 /// If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
158 /// \param[in] slot The data processing slot that needs to be considered
159 /// \param[in] name The name of the column for which a column reader needs to be returned
160 /// \param[in] tid A type_info (presumably identifying the requested column type, as `typeid(T)` does in the templated overload; confirm)
161 /// At least one of the two must return a non-empty/non-null value. The default implementation ignores its arguments and returns a null pointer.
162 virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
163 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &)
164 {
165 return {}; // value-initialised, i.e. null, unique_ptr
166 }
167
168 // clang-format off
169 /// \brief Return ranges of entries to distribute to tasks.
170 /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
171 /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries. NOTE(review): the example implies that the second element of each pair is exclusive; confirm.
172 /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
173 /// The same entry range should not be returned more than once.
174 /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop. Pure virtual.
175 // clang-format on
176 virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
177
178 // clang-format off
179 /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot. Pure virtual.
180 /// \param[in] slot The data processing slot that needs to be considered
181 /// \param[in] entry The entry which needs to be pointed to by the reader pointers
182 /// Slots are adopted to accommodate parallel data processing.
183 /// Different workers will loop over different ranges and
184 /// will be labelled by different "slot" values.
185 /// \return *true* if the entry has to be processed, *false* otherwise.
186 // clang-format on
187 virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
188
189 // clang-format off
190 /// \brief Convenience method called before starting an event-loop.
191 /// This method might be called multiple times over the lifetime of a RDataSource, since
192 /// users can run multiple event-loops with the same RDataFrame.
193 /// Ideally, `Initialise` should set the state of the RDataSource so that multiple identical event-loops
194 /// will produce identical results. The default implementation does nothing.
195 // clang-format on
196 virtual void Initialise() {}
197
198 // clang-format off
199 /// \brief Convenience method called at the start of the data processing associated to a slot.
200 /// \param[in] slot The data processing slot which needs to be initialised
201 /// \param[in] firstEntry The first entry of the range that the task will process.
202 /// This method might be called multiple times per thread per event-loop. The default implementation ignores both arguments and does nothing.
203 // clang-format on
204 virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
205
206 // clang-format off
207 /// \brief Convenience method called at the end of the data processing associated to a slot.
208 /// \param[in] slot The data processing slot which needs to be finalised
209 /// This method might be called multiple times per thread per event-loop. The default implementation ignores its argument and does nothing.
210 // clang-format on
211 virtual void FinaliseSlot(unsigned int /*slot*/) {}
212
213 // clang-format off
214 /// \brief Convenience method called after concluding an event-loop.
215 /// See Initialise for more details. The default implementation does nothing.
216 // clang-format on
217 virtual void Finalise() {}
218
219 /// \brief Return a string representation of the datasource type.
220 /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
221 /// the datasource in the visualization of the computation graph.
222 /// Concrete datasources can override the default implementation, which returns "Custom Datasource".
223 virtual std::string GetLabel() { return "Custom Datasource"; }
224
225protected:
226 /// Type-erased vector of pointers to pointers to column values - one per slot. Pure virtual; the templated GetColumnReaders() calls it with the column name and `typeid(T)` and casts every element to `T **`.
227 virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
228};
229
230} // ns RDF
231
232} // ns ROOT
233
234/// Print a RDataSource at the prompt: returns the text produced by the data source's virtual AsString().
235namespace cling {
236inline std::string printValue(ROOT::RDF::RDataSource *ds)
237{
238 return ds->AsString(); // ds is dereferenced without a null check
239}
240} // namespace cling
241
242#endif // ROOT_RDATASOURCE
unsigned long long ULong64_t
Definition RtypesCore.h:74
char name[80]
Definition TGX11.cxx:110
typedef void((*Func_t)())
Mother class of TTypedPointerHolder.
virtual TPointerHolder * GetDeepCopy()=0
Class to wrap a pointer and delete the memory associated to it correctly.
virtual TPointerHolder * GetDeepCopy()
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual void Finalise()
Convenience method called after concluding an event-loop.
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
virtual ~RDataSource()=default
virtual std::string AsString()
virtual bool SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
virtual void FinaliseSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot.
std::vector< void * > Record_t
virtual std::string GetLabel()
Return a string representation of the datasource type.
virtual void SetNSlots(unsigned int nSlots)=0
Inform RDataSource of the number of processing slots (i.e.
virtual void Initialise()
Convenience method called before starting an event-loop.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g.
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &)
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...