Logo ROOT   6.12/07
Reference Guide
TDataSource.hxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 09/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2016, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #ifndef ROOT_TDATASOURCE
12 #define ROOT_TDATASOURCE
13 
14 #include "RStringView.h"
15 #include "RtypesCore.h" // ULong64_t
16 #include <algorithm> // std::transform
17 #include <vector>
18 #include <typeinfo>
19 
20 namespace ROOT {
21 namespace Experimental {
22 namespace TDF {
23 
24 /**
25 \class ROOT::Experimental::TDF::TDataSource
26 \ingroup dataframe
27 \brief TDataSource defines an API that TDataFrame can use to read arbitrary data formats.
28 
29 A concrete TDataSource implementation (i.e. a class that inherits from TDataSource and implements all of its pure
30 methods) provides an adaptor that TDataFrame can leverage to read any kind of tabular data formats.
31 TDataFrame calls into TDataSource to retrieve information about the data, retrieve (thread-local) readers or "cursors"
32 for selected columns and to advance the readers to the desired data entry.
33 
34 The sequence of calls that TDataFrame (or any other client of a TDataSource) performs is the following:
35 
36 1) SetNSlots: inform TDataSource of the desired level of parallelism
37 2) GetColumnReaders: retrieve from TDataSource per-thread readers for the desired columns
38 3) Initialise: inform TDataSource that an event-loop is about to start
39 4) GetEntryRanges: retrieve from TDataSource a set of ranges of entries that can be processed concurrently
40 5) InitSlot: inform TDataSource that a certain thread is about to start working on a certain range of entries
41 6) SetEntry: inform TDataSource that a certain thread is about to start working on a certain entry
42 7) FinaliseSlot: inform TDataSource that a certain thread finished working on a certain range of entries
43 8) Finalise: inform TDataSource that an event-loop finished
44 
45 TDataSource implementations must support running multiple event-loops consecutively (although sequentially) on the same dataset.
46 Method 1 is called once per TDataSource object, typically when it is associated to a TDataFrame.
47 Method 2 can be called several times, potentially with the same arguments, also in-between event-loops, but not during an event-loop.
48 Methods 3,8 are called once per event-loop, right before starting and right after finishing.
49 Methods 5,6,7 can be called concurrently from multiple threads, multiple times per event-loop.
50 */
51 class TDataSource {
52 public:
53  virtual ~TDataSource() = default;
54 
55  /// \brief Inform TDataSource of the number of processing slots (i.e. worker threads) used by the associated TDataFrame.
56  /// Slots numbers are used to simplify parallel execution: TDataFrame guarantees that different threads will always
57  /// pass different slot values when calling methods concurrently.
58  virtual void SetNSlots(unsigned int nSlots) = 0;
59 
60  /// \brief Returns a reference to the collection of the dataset's column names
61  virtual const std::vector<std::string> &GetColumnNames() const = 0;
62 
63  /// \brief Checks if the dataset has a certain column
64  /// \param[in] columnName The name of the column
65  virtual bool HasColumn(std::string_view) const = 0;
66 
67  /// \brief Type of a column as a string, e.g. `GetTypeName("x") == "double"`. Required for jitting e.g. `df.Filter("x>0")`.
68  /// \param[in] columnName The name of the column
69  virtual std::string GetTypeName(std::string_view) const = 0;
70 
71  /// Called at most once per column by TDF. Return vector of pointers to pointers to column values - one per slot.
72  /// \tparam T The type of the data stored in the column
73  /// \param[in] columnName The name of the column
74  ///
75  /// These pointers are veritable cursors: it's a responsibility of the TDataSource implementation that they point to the
76  /// "right" memory region.
77  template <typename T>
78  std::vector<T **> GetColumnReaders(std::string_view columnName)
79  {
80  auto typeErasedVec = GetColumnReadersImpl(columnName, typeid(T));
81  std::vector<T **> typedVec(typeErasedVec.size());
82  std::transform(typeErasedVec.begin(), typeErasedVec.end(), typedVec.begin(),
83  [](void *p) { return static_cast<T **>(p); });
84  return typedVec;
85  }
86 
87  /// \brief Return ranges of entries to distribute to tasks.
88  /// They are required to be contiguous intervals with no entries skipped. Supposing a dataset with nEntries, the intervals
89  /// must start at 0 and end at nEntries, e.g. [0-5],[5-10] for 10 entries.
90  virtual std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() = 0;
91 
92  /// \brief Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
93  /// \param[in] slot The data processing slot that needs to be considered
94  /// \param[in] entry The entry which needs to be pointed to by the reader pointers
95  /// Slots are adopted to accommodate parallel data processing. Different workers will loop over different ranges and will
96  /// be labelled by different "slot" values.
97  virtual void SetEntry(unsigned int slot, ULong64_t entry) = 0;
98 
99  /// \brief Convenience method called before starting an event-loop.
100  /// This method might be called multiple times over the lifetime of a TDataSource, since
101  /// users can run multiple event-loops with the same TDataFrame.
102  /// Ideally, `Initialise` should set the state of the TDataSource so that multiple identical event-loops
103  /// will produce identical results.
104  virtual void Initialise() {}
105 
106  /// \brief Convenience method called at the start of the data processing associated to a slot.
107  /// \param[in] slot The data processing slot wihch needs to be initialised
108  /// \param[in] firstEntry The first entry of the range that the task will process.
109  /// This method might be called multiple times per thread per event-loop.
110  virtual void InitSlot(unsigned int /*slot*/, ULong64_t /*firstEntry*/) {}
111 
112  /// \brief Convenience method called at the end of the data processing associated to a slot.
113  /// \param[in] slot The data processing slot wihch needs to be finalised
114  /// This method might be called multiple times per thread per event-loop.
115  virtual void FinaliseSlot(unsigned int /*slot*/) {}
116 
117  /// \brief Convenience method called after concluding an event-loop.
118  /// See Initialise for more details.
119  virtual void Finalise() {}
120 
121 protected:
122  /// type-erased vector of pointers to pointers to column values - one per slot
123  virtual std::vector<void *>
124  GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
125 };
126 
127 } // ns TDF
128 } // ns Experimental
129 } // ns ROOT
130 
131 #endif // ROOT_TDATASOURCE
virtual void InitSlot(unsigned int, ULong64_t)
Convenience method called at the start of the data processing associated to a slot.
basic_string_view< char > string_view
Definition: RStringView.h:35
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
virtual std::vector< void * > GetColumnReadersImpl(std::string_view name, const std::type_info &)=0
type-erased vector of pointers to pointers to column values - one per slot
double T(double x)
Definition: ChebyshevPol.h:34
virtual void SetNSlots(unsigned int nSlots)=0
Inform TDataSource of the number of processing slots (i.e. worker threads) used by the associated TDataFrame.
virtual void Initialise()
Convenience method called before starting an event-loop.
virtual std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()=0
Return ranges of entries to distribute to tasks.
TDataSource defines an API that TDataFrame can use to read arbitrary data formats.
Definition: TDataSource.hxx:51
virtual void Finalise()
Convenience method called after concluding an event-loop.
virtual std::string GetTypeName(std::string_view) const =0
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
unsigned long long ULong64_t
Definition: RtypesCore.h:70
virtual void SetEntry(unsigned int slot, ULong64_t entry)=0
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot...
std::vector< T ** > GetColumnReaders(std::string_view columnName)
Called at most once per column by TDF.
Definition: TDataSource.hxx:78
virtual void FinaliseSlot(unsigned int)
Convenience method called at the end of the data processing associated to a slot. ...
virtual bool HasColumn(std::string_view) const =0
Checks if the dataset has a certain column.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
char name[80]
Definition: TGX11.cxx:109