Logo ROOT  
Reference Guide
RDataSource.hxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_RDATASOURCE
12 #define ROOT_RDATASOURCE
13 
15 #include "ROOT/RStringView.hxx"
16 #include "RtypesCore.h" // ULong64_t
17 #include "TString.h"
18 
19 #include <algorithm> // std::transform
20 #include <string>
21 #include <typeinfo>
22 #include <vector>
23 
24 namespace ROOT {
25 namespace RDF {
26 class RDataSource; // forward declaration: lets cling::printValue be declared before the class is defined
27 } // namespace RDF
28 } // namespace ROOT
29 
30 /// Print a RDataSource at the prompt (declaration only; the inline definition is at the end of this header)
31 namespace cling {
32 std::string printValue(ROOT::RDF::RDataSource *ds);
33 } // namespace cling
34 
35 namespace ROOT {
36 
37 namespace Internal {
38 namespace TDS {
39 
40 /// Mother class of TTypedPointerHolder. The instances
41 /// of this class can be put in a container. Upon destruction,
42 /// the correct deletion of the pointer is performed in the
43 /// derived class.
45 protected:
46  void *fPointer{nullptr};
47 
48 public:
49  TPointerHolder(void *ptr) : fPointer(ptr) {}
50  void *GetPointer() { return fPointer; }
51  void *GetPointerAddr() { return &fPointer; }
52  virtual TPointerHolder *GetDeepCopy() = 0;
53  virtual ~TPointerHolder(){};
54 };
55 
56 /// Class to wrap a pointer and delete the memory associated to it
57 /// correctly
58 template <typename T>
59 class TTypedPointerHolder final : public TPointerHolder {
60 public:
62 
64  {
65  const auto typedPtr = static_cast<T *>(fPointer);
66  return new TTypedPointerHolder(new T(*typedPtr));
67  }
68 
69  ~TTypedPointerHolder() { delete static_cast<T *>(fPointer); }
70 };
71 
72 } // ns TDS
73 } // ns Internal
74 
75 namespace RDF {
76 
77 // clang-format off
78 /**
79 \class ROOT::RDF::RDataSource
80 \ingroup dataframe
81 \brief RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
82 
83 A concrete RDataSource implementation (i.e. a class that inherits from RDataSource and implements all of its pure
84 methods) provides an adaptor that RDataFrame can leverage to read any kind of tabular data format.
85 RDataFrame calls into RDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
86 for selected columns and to advance the readers to the desired data entry.
87 
88 The sequence of calls that RDataFrame (or any other client of a RDataSource) performs is the following:
89 
90  - SetNSlots() : inform RDataSource of the desired level of parallelism
91  - GetColumnReaders() : retrieve from RDataSource per-thread readers for the desired columns
92  - Initialise() : inform RDataSource that an event-loop is about to start
93  - GetEntryRanges() : retrieve from RDataSource a set of ranges of entries that can be processed concurrently
94  - InitSlot() : inform RDataSource that a certain thread is about to start working on a certain range of entries
95  - SetEntry() : inform RDataSource that a certain thread is about to start working on a certain entry
96  - FinaliseSlot() : inform RDataSource that a certain thread finished working on a certain range of entries
97  - Finalise() : inform RDataSource that an event-loop finished
98 
99 RDataSource implementations must support running multiple event-loops on the same dataset, one after the other (never concurrently).
100  - \b SetNSlots() is called once per RDataSource object, typically when it is associated to a RDataFrame.
101  - \b GetColumnReaders() can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
102  - \b GetEntryRanges() will be called several times, including during an event loop, as additional ranges are needed. It will not be called concurrently.
103  - \b Initialise() and \b Finalise() are called once per event-loop, right before starting and right after finishing.
104  - \b InitSlot(), \b SetEntry(), and \b FinaliseSlot() can be called concurrently from multiple threads, multiple times per event-loop.
105 */
106 class RDataSource {
107  // clang-format on
108 protected:
109  using Record_t = std::vector<void *>;
110  friend std::string cling::printValue(::ROOT::RDF::RDataSource *);
111  /// \brief Text returned by cling::printValue() when a RDataSource is printed at the prompt.
112  virtual std::string AsString() { return "generic data source"; };
113 
114 public:
115  /// Tag type used to indicate that newer versions of RDataSource interfaces should be invoked
116  static struct RV2Interface{} V2;
117 
118  virtual ~RDataSource() = default;
119 
120  // clang-format off
121  /// \brief Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
122  /// Slot numbers are used to simplify parallel execution: RDataFrame guarantees that different threads will always
123  /// pass different slot values when calling methods concurrently.
124  // clang-format on
125  virtual void SetNSlots(unsigned int nSlots) = 0;
126 
127  // clang-format off
128  /// \brief Returns a reference to the collection of the dataset's column names
129  // clang-format on
130  virtual const std::vector<std::string> &GetColumnNames() const = 0;
131 
132  /// \brief Checks if the dataset has a certain column
133  /// \param[in] colName The name of the column
134  virtual bool HasColumn(std::string_view colName) const = 0;
135 
136  // clang-format off
137  /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
138  /// \param[in] colName The name of the column
139  // clang-format on
140  virtual std::string GetTypeName(std::string_view colName) const = 0;
141 
142  // clang-format off
143  /// Called at most once per column by RDF. Returns a vector of pointers to pointers to column values - one per slot.
144  /// \tparam T The type of the data stored in the column
145  /// \param[in] columnName The name of the column
146  ///
147  /// These pointers are veritable cursors: it is the responsibility of the RDataSource implementation to make sure
148  /// that they point to the "right" memory region.
149  // clang-format on
150  template <typename T>
151  std::vector<T **> GetColumnReaders(std::string_view columnName)
152  {
153  auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T)); // type-erased (void *) entries from the concrete data source
154  std::vector<T **> typedVec(typeErasedVec.size());
155  std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
156  [](void *p) { return static_cast<T **>(p); }); // cast each entry back to the typed T **
157  return typedVec;
158  }
159 
160  /// If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
161  /// \param[in] slot The data processing slot that needs to be considered
162  /// \param[in] name The name of the column for which a column reader needs to be returned
163  /// \param[in] tid A type_info
164  /// At least one of the two GetColumnReaders overloads must return a non-empty/non-null value.
165  virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
166  GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &)
167  {
168  return {}; // default: null reader, i.e. this overload is not provided
169  }
170 
171  // clang-format off
172  /// \brief Return ranges of entries to distribute to tasks.
173  /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the
174  /// intervals must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
175  /// This function will be invoked repeatedly by RDataFrame as it needs additional entries to process.
176  /// The same entry range should not be returned more than once.
177  /// Returning an empty collection of ranges signals to RDataFrame that the processing can stop.
178  // clang-format on
179  virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
180 
181  // clang-format off
182  /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
183  /// \param[in] slot The data processing slot that needs to be considered
184  /// \param[in] entry The entry which needs to be pointed to by the reader pointers
185  /// Slots are adopted to accommodate parallel data processing.
186  /// Different workers will loop over different ranges and
187  /// will be labelled by different "slot" values.
188  /// Returns *true* if the entry has to be processed, *false* otherwise.
189  // clang-format on
190  virtual bool SetEntry(unsigned int slot, ULong64_t entry) = 0;
191 
192  // clang-format off
193  /// \brief Convenience method called before starting an event-loop.
194  /// This method might be called multiple times over the lifetime of a RDataSource, since
195  /// users can run multiple event-loops with the same RDataFrame.
196  /// Ideally, `Initialise` should set the state of the RDataSource so that multiple identical event-loops
197  /// will produce identical results.
198  // clang-format on
199  virtual void Initialise() {}
200 
201  // clang-format off
202  /// \brief Convenience method called at the start of the data processing associated to a slot.
203  /// \param[in] slot The data processing slot which needs to be initialised
204  /// \param[in] firstEntry The first entry of the range that the task will process.
205  /// This method might be called multiple times per thread per event-loop.
206  // clang-format on
207  virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
208 
209  // clang-format off
210  /// \brief Convenience method called at the end of the data processing associated to a slot.
211  /// \param[in] slot The data processing slot which needs to be finalised
212  /// This method might be called multiple times per thread per event-loop.
213  // clang-format on
214  virtual void FinaliseSlot(unsigned int /*slot*/) {}
215 
216  // clang-format off
217  /// \brief Convenience method called after concluding an event-loop.
218  /// See Initialise for more details.
219  // clang-format on
220  virtual void Finalise() {}
221 
222  /// \brief Return a string representation of the datasource type.
223  /// The returned string will be used by ROOT::RDF::SaveGraph() to represent
224  /// the datasource in the visualization of the computation graph.
225  /// Concrete datasources can override the default implementation.
226  virtual std::string GetLabel() { return "Custom Datasource"; }
227 
228 protected:
229  /// type-erased vector of pointers to pointers to column values - one per slot
230  virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
231 };
232 
233 } // ns RDF
234 
235 } // ns ROOT
236 
237 /// Print a RDataSource at the prompt: the text shown is whatever the data source's AsString() returns.
238 namespace cling {
239 inline std::string printValue(ROOT::RDF::RDataSource *ds)
240 {
241  return ds->AsString(); // virtual call, so concrete data sources can supply their own text; ds is not null-checked
242 }
243 } // namespace cling
244 
245 #endif // ROOT_RDATASOURCE
ROOT::RDF::RDataSource::GetColumnNames
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
ROOT::RDF::RDataSource::~RDataSource
virtual ~RDataSource()=default
ROOT::Internal::TDS::TPointerHolder::fPointer
void * fPointer
Definition: RDataSource.hxx:46
string_view
basic_string_view< char > string_view
Definition: libcpp_string_view.h:785
ROOT::RDF::RDataSource::V2
static struct ROOT::RDF::RDataSource::RV2Interface V2
extract_docstrings.ds
ds
Definition: extract_docstrings.py:40
ROOT::RDF::RDataSource::GetColumnReadersImpl
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
ROOT::Internal::TDS::TTypedPointerHolder::~TTypedPointerHolder
~TTypedPointerHolder()
Definition: RDataSource.hxx:69
TString.h
ROOT::RDF::RDataSource::GetLabel
virtual std::string GetLabel()
Return a string representation of the datasource type.
Definition: RDataSource.hxx:226
ROOT::RDF::RDataSource
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:106
ROOT::RDF::RDataSource::GetTypeName
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g.
ROOT::Internal::TDS::TTypedPointerHolder::TTypedPointerHolder
TTypedPointerHolder(T *ptr)
Definition: RDataSource.hxx:61
ROOT::Internal::TDS::TPointerHolder::GetPointerAddr
void * GetPointerAddr()
Definition: RDataSource.hxx:51
ROOT::Internal::TDS::TPointerHolder::GetPointer
void * GetPointer()
Definition: RDataSource.hxx:50
ROOT::RDF::RDataSource::SetEntry
virtual bool SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
ROOT::RDF::RDataSource::Initialise
virtual void Initialise()
Convenience method called before starting an event-loop.
Definition: RDataSource.hxx:199
ROOT::RDF::RDataSource::InitSlot
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
Definition: RDataSource.hxx:207
ROOT::RDF::RDataSource::GetEntryRanges
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
RStringView.hxx
ROOT::Internal::TDS::TTypedPointerHolder::GetDeepCopy
virtual TPointerHolder * GetDeepCopy()
Definition: RDataSource.hxx:63
ROOT::Internal::TDS::TTypedPointerHolder
Class to wrap a pointer and delete the memory associated to it correctly.
Definition: RDataSource.hxx:59
ROOT::Internal::TDS::TPointerHolder::GetDeepCopy
virtual TPointerHolder * GetDeepCopy()=0
void
typedef void((*Func_t)())
ROOT::RDF::RDataSource::GetColumnReaders
virtual std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &)
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition: RDataSource.hxx:166
ULong64_t
unsigned long long ULong64_t
Definition: RtypesCore.h:74
ROOT::RDF::RDataSource::Record_t
std::vector< void * > Record_t
Definition: RDataSource.hxx:109
RtypesCore.h
ROOT::Internal::TDS::TPointerHolder::~TPointerHolder
virtual ~TPointerHolder()
Definition: RDataSource.hxx:53
ROOT::RDF::RDataSource::AsString
virtual std::string AsString()
Definition: RDataSource.hxx:112
ROOT::RDF::RDataSource::SetNSlots
virtual void SetNSlots(unsigned int nSlots)=0
Inform RDataSource of the number of processing slots (i.e.
name
char name[80]
Definition: TGX11.cxx:110
ROOT::Internal::TDS::TPointerHolder::TPointerHolder
TPointerHolder(void *ptr)
Definition: RDataSource.hxx:49
ROOT::Math::Chebyshev::T
double T(double x)
Definition: ChebyshevPol.h:34
RColumnReaderBase.hxx
ROOT::RDF::RDataSource::RV2Interface
Tag type used to indicate that newer versions of RDataSource interfaces should be invoked.
Definition: RDataSource.hxx:116
ROOT::RDF::RDataSource::FinaliseSlot
virtual void FinaliseSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot.
Definition: RDataSource.hxx:214
ROOT::Internal::TDS::TPointerHolder
Mother class of TTypedPointerHolder.
Definition: RDataSource.hxx:44
ROOT::RDF::RDataSource::Finalise
virtual void Finalise()
Convenience method called after concluding an event-loop.
Definition: RDataSource.hxx:220
ROOT
VSD Structures.
Definition: StringConv.hxx:21
ROOT::RDF::RDataSource::HasColumn
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
ROOT::RDF::RDataSource::GetColumnReaders
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by RDF.
Definition: RDataSource.hxx:151