Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RVecDS.hxx
Go to the documentation of this file.
1// Author: Stefan Wunsch CERN 04/2019
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/RDataFrame.hxx>
12#include <ROOT/RDataSource.hxx>
13#include <ROOT/RVec.hxx>
14#include <ROOT/TSeq.hxx>
15
16#include <algorithm>
17#include <functional>
18#include <map>
19#include <memory>
20#include <string>
21#include <tuple>
22#include <typeinfo>
23#include <utility>
24#include <vector>
25
26#ifndef ROOT_RVECDS
27#define ROOT_RVECDS
28
29namespace ROOT {
30
31namespace Internal {
32
33namespace RDF {
34
42
43////////////////////////////////////////////////////////////////////////////////////////////////
44/// \brief A RDataSource implementation which takes a collection of RVecs, which
45/// are able to adopt data from Numpy arrays
46///
47/// This component allows creating a data source on a set of columns with data
48/// coming from RVecs. Because RVecs can adopt externally provided data, e.g.,
49/// Numpy arrays, arbitrary data can be read from memory.
50/// In addition, the data source has to keep a reference to the Python-owned data
51/// so that the lifetime of the data is tied to the data source.
52template <typename... ColumnTypes>
// Collection of raw pointers to type-erased pointer holders.
54 using PointerHolderPtrs_t = std::vector<ROOT::Internal::RDF::TPointerHolder *>;
55
// One RVec per column type; initialised from the `.second` of the constructor's pairs.
56 std::tuple<ROOT::RVec<ColumnTypes>...> fColumns;
// Column names; initialised from the `.first` of the constructor's pairs.
57 std::vector<std::string> fColNames;
// Maps a column name to the type name obtained via TypeID2TypeName in the constructor.
58 std::unordered_map<std::string, std::string> fColTypesMap;
59 // The role of fPointerHoldersModels is to be initialised with the pack
60 // of arguments in the constructor signature at construction time.
61 // Once the number of slots is known, the fPointerHolders are initialised
62 // according to the models.
// NOTE(review): the declaration of fPointerHoldersModels (original line 63) is missing from
// this listing; the cross-reference section gives it as `PointerHolderPtrs_t fPointerHoldersModels`.
// Pointer holders indexed as fPointerHolders[column][slot]; filled in SetNSlots.
64 std::vector<PointerHolderPtrs_t> fPointerHolders;
// [first, second) entry ranges; handed out (and emptied) by GetEntryRanges.
65 std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
// Callback supplied at construction; invoked to release the data associated to this data source.
66 std::function<void()> fDeleteRVecs;
67
68 Record_t GetColumnReadersImpl(std::string_view, const std::type_info &) { return {}; }
69
70 size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); }
71 template <std::size_t... S>
72 void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence<S...>)
73 {
74 std::initializer_list<int> expander{
75 (*static_cast<ColumnTypes *>(fPointerHolders[S][slot]->GetPointer()) = std::get<S>(fColumns)[entry], 0)...};
76 (void)expander; // avoid unused variable warnings
77 }
78
79 template <std::size_t... S>
80 void ColLengthChecker(std::index_sequence<S...>)
81 {
82 if (sizeof...(S) < 2)
83 return;
84
85 const std::vector<size_t> colLengths{std::get<S>(fColumns).size()...};
86 const auto expectedLen = colLengths[0];
87 std::string err;
88 for (auto i : TSeqI(1, colLengths.size())) {
89 if (expectedLen != colLengths[i]) {
90 err += "Column \"" + fColNames[i] + "\" and column \"" + fColNames[0] +
91 "\" have different lengths: " + std::to_string(expectedLen) + " and " +
92 std::to_string(colLengths[i]);
93 }
94 }
95 if (!err.empty()) {
96 throw std::runtime_error(err);
97 }
98 }
99
100protected:
/// Short textual description of this data source.
std::string AsString()
{
   return "Numpy data source";
}
102
103public:
104 RVecDS(std::function<void()> deleteRVecs, std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colsNameVals)
105 : fColumns(colsNameVals.second...),
106 fColNames{colsNameVals.first...},
107 fColTypesMap({{colsNameVals.first, ROOT::Internal::RDF::TypeID2TypeName(typeid(ColumnTypes))}...}),
108 fPointerHoldersModels({new ROOT::Internal::RDF::TTypedPointerHolder<ColumnTypes>(new ColumnTypes())...}),
109 fDeleteRVecs(deleteRVecs)
110 {
111 }
112
113 // Rule of five
114 RVecDS(const RVecDS &) = delete;
115 RVecDS &operator=(const RVecDS &) = delete;
116 RVecDS(RVecDS &&) = delete;
117 RVecDS &operator=(RVecDS &&) = delete;
// Teardown code following the deleted copy/move members.
// NOTE(review): the declaration line of this function (original line 118) is missing
// from this listing; presumably this is the body of the destructor — confirm.
119 {
// Free every per-slot pointer holder of every column.
120 for (auto &&ptrHolderv : fPointerHolders) {
121 for (auto &&ptrHolder : ptrHolderv) {
122 delete ptrHolder;
123 }
124 }
125 // Release the data associated to this data source
// NOTE(review): fDeleteRVecs is invoked without checking that it holds a callable —
// confirm that callers always supply a non-empty std::function.
126 fDeleteRVecs();
127 }
128
/// Create the column reader for column `colName` in processing slot `slot`.
/// \param slot    index of the processing slot; selects the pointer holder wrapped by the reader
/// \param colName name of the requested column
/// \param id      type information of the type requested by the caller
/// \return a RVecDSColumnReader built on fPointerHolders[index][slot], where index is the
///         position of `colName` in fColNames
/// \throws std::runtime_error if the column is unknown, if the stored type name differs
///         from `idName`, or if the name is not found in fColNames
129 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
130 GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final
131 {
// The map is keyed by std::string, so materialise the string_view once.
132 auto colNameStr = std::string(colName);
133
134 auto it = fColTypesMap.find(colNameStr);
135 if (fColTypesMap.end() == it) {
136 std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
137 throw std::runtime_error(err);
138 }
139
140 const auto &colIdName = it->second;
// NOTE(review): `idName` is used below but its definition (original line 141) is missing
// from this listing; presumably it is the type name derived from `id` — confirm.
142 if (colIdName != idName) {
143 std::string err = "Column " + colNameStr + " has type " + colIdName +
144 " while the id specified is associated to type " + idName;
145 throw std::runtime_error(err);
146 }
147
// The position of the name in fColNames is the first index into fPointerHolders.
148 if (auto colNameIt = std::find(fColNames.begin(), fColNames.end(), colNameStr); colNameIt != fColNames.end()) {
149 const auto index = std::distance(fColNames.begin(), colNameIt);
150 return std::make_unique<ROOT::Internal::RDF::RVecDSColumnReader>(fPointerHolders[index][slot]);
151 }
152
// Only reached when the name is present in fColTypesMap but absent from fColNames.
153 throw std::runtime_error("Could not find column name \"" + colNameStr + "\" in available column names.");
154 }
155
156 const std::vector<std::string> &GetColumnNames() const { return fColNames; }
157
158 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()
159 {
160 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
161 return entryRanges;
162 }
163
164 std::string GetTypeName(std::string_view colName) const
165 {
166 const auto key = std::string(colName);
167 return fColTypesMap.at(key);
168 }
169
170 bool HasColumn(std::string_view colName) const
171 {
172 const auto key = std::string(colName);
173 const auto endIt = fColTypesMap.end();
174 return endIt != fColTypesMap.find(key);
175 }
176
177 bool SetEntry(unsigned int slot, ULong64_t entry)
178 {
179 SetEntryHelper(slot, entry, std::index_sequence_for<ColumnTypes...>());
180 return true;
181 }
182
183 void SetNSlots(unsigned int nSlots) final
184 {
185 fNSlots = nSlots;
186 const auto nCols = fColNames.size();
187 fPointerHolders.resize(nCols); // now we need to fill it with the slots, all of the same type
188 auto colIndex = 0U;
189 for (auto &&ptrHolderv : fPointerHolders) {
190 for (auto slot : ROOT::TSeqI(fNSlots)) {
191 auto ptrHolder = fPointerHoldersModels[colIndex]->GetDeepCopy();
192 ptrHolderv.emplace_back(ptrHolder);
193 (void)slot;
194 }
195 colIndex++;
196 }
197 for (auto &&ptrHolder : fPointerHoldersModels)
198 delete ptrHolder;
199 }
200
// Body of `void Initialize()` according to the cross-reference section of this page
// (Definition RVecDS.hxx:201); the signature line itself is missing from this listing.
// Checks the column lengths, then splits the entries into fNSlots contiguous
// [first, second) ranges stored in fEntryRanges.
202 {
// Throws if the columns do not all have the same length.
203 ColLengthChecker(std::index_sequence_for<ColumnTypes...>());
204 const auto nEntries = GetEntriesNumber();
// NOTE(review): divides by fNSlots — assumes SetNSlots was called with a non-zero value before; confirm.
205 const auto nEntriesInRange = nEntries / fNSlots; // integer division. Should the ranges be smaller?
// `reminder` holds the remainder of the division, i.e. the entries left over after the even split.
206 auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
207 fEntryRanges.resize(fNSlots);
208 auto init = 0ULL;
209 auto end = 0ULL;
210 for (auto &&range : fEntryRanges) {
211 end = init + nEntriesInRange;
212 if (0 != reminder) { // Distribute the remainder among the first chunks, one extra entry each
213 reminder--;
214 end += 1;
215 }
216 range.first = init;
217 range.second = end;
// The next range starts where this one ends.
218 init = end;
219 }
220 }
221
/// Label identifying the type of this data source.
std::string GetLabel()
{
   return "RVecDS";
}
223};
224
225// Factory to create datasource able to read Numpy arrays through RVecs.
226// \param pyRVecs Pointer to PyObject holding RVecs.
227// The RVecs itself hold a reference to the associated Numpy arrays so that
228// the data cannot go out of scope as long as the datasource survives.
229template <typename... ColumnTypes>
230std::unique_ptr<RDataFrame>
231MakeRVecDataFrame(std::function<void()> deleteRVecs,
232 std::pair<std::string, ROOT::RVec<ColumnTypes>> const &...colNameProxyPairs)
233{
234 return std::make_unique<RDataFrame>(std::make_unique<RVecDS<ColumnTypes...>>(deleteRVecs, colNameProxyPairs...));
235}
236
237} // namespace RDF
238} // namespace Internal
239} // namespace ROOT
240
241#endif // ROOT_RVECDS
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
RVecDSColumnReader(TPointerHolder *ptrHolder)
Definition RVecDS.hxx:40
void * GetImpl(Long64_t) final
Definition RVecDS.hxx:37
A RDataSource implementation which takes a collection of RVecs, which are able to adopt data from Num...
Definition RVecDS.hxx:53
RVecDS(RVecDS &&)=delete
std::vector< PointerHolderPtrs_t > fPointerHolders
Definition RVecDS.hxx:64
void Initialize()
Convenience method called before starting an event-loop.
Definition RVecDS.hxx:201
RVecDS & operator=(const RVecDS &)=delete
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition RVecDS.hxx:164
std::unordered_map< std::string, std::string > fColTypesMap
Definition RVecDS.hxx:58
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition RVecDS.hxx:156
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RVecDS.hxx:177
std::function< void()> fDeleteRVecs
Definition RVecDS.hxx:66
std::tuple< ROOT::RVec< ColumnTypes >... > fColumns
Definition RVecDS.hxx:56
std::string GetLabel()
Return a string representation of the datasource type.
Definition RVecDS.hxx:222
RVecDS & operator=(RVecDS &&)=delete
std::vector< std::string > fColNames
Definition RVecDS.hxx:57
RVecDS(const RVecDS &)=delete
void SetEntryHelper(unsigned int slot, ULong64_t entry, std::index_sequence< S... >)
Definition RVecDS.hxx:72
PointerHolderPtrs_t fPointerHoldersModels
Definition RVecDS.hxx:63
void ColLengthChecker(std::index_sequence< S... >)
Definition RVecDS.hxx:80
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition RVecDS.hxx:130
Record_t GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition RVecDS.hxx:68
RVecDS(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colsNameVals)
Definition RVecDS.hxx:104
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition RVecDS.hxx:170
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition RVecDS.hxx:158
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition RVecDS.hxx:183
std::vector< ROOT::Internal::RDF::TPointerHolder * > PointerHolderPtrs_t
Definition RVecDS.hxx:54
Mother class of TTypedPointerHolder.
Class to wrap a pointer and delete the memory associated to it correctly.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1526
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:178
std::unique_ptr< RDataFrame > MakeRVecDataFrame(std::function< void()> deleteRVecs, std::pair< std::string, ROOT::RVec< ColumnTypes > > const &...colNameProxyPairs)
Definition RVecDS.hxx:231
TSeq< int > TSeqI
Definition TSeq.hxx:203