Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
19#include <ROOT/RFieldValue.hxx>
20#include <ROOT/RNTuple.hxx>
22#include <ROOT/RNTupleDS.hxx>
23#include <ROOT/RNTupleUtil.hxx>
24#include <ROOT/RPageStorage.hxx>
25#include <ROOT/RStringView.hxx>
26
27#include <TError.h>
28
29#include <string>
30#include <vector>
31#include <typeinfo>
32#include <utility>
33
34// clang-format off
35/**
36* \class ROOT::Experimental::RNTupleDS
37* \ingroup dataframe
38* \brief The RDataSource implementation for RNTuple. It lets RDataFrame read RNTuple data.
39*
40* An RDataFrame that reads RNTuple data can be constructed using MakeNTupleDataFrame().
41*
42* For each column containing an array or a collection, a corresponding column `R_rdf_sizeof_colname` is available
43* to access `colname.size()` without reading and deserializing the collection values.
44*
45**/
46// clang-format on
47
48namespace ROOT {
49namespace Experimental {
50namespace Internal {
51
52/// An artificial field that transforms an RNTuple column that contains the offset of collections into
53/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
54/// `R_rdf_sizeof_jets` for a collection named `jets`.
55///
56/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
57/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
58/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
60protected:
61 std::unique_ptr<ROOT::Experimental::Detail::RFieldBase> CloneImpl(std::string_view /* newName */) const final
62 {
63 return std::make_unique<RRDFCardinalityField>();
64 }
65
66public:
67 static std::string TypeName() { return "std::size_t"; }
69 : ROOT::Experimental::Detail::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */) {}
73
74 // Field is only used for reading
75 void GenerateColumnsImpl() final { assert(false && "Cardinality fields must only be used for reading"); }
76
78 {
79 RColumnModel model(EColumnType::kIndex, true /* isSorted*/);
80 fColumns.emplace_back(std::unique_ptr<ROOT::Experimental::Detail::RColumn>(
81 ROOT::Experimental::Detail::RColumn::Create<ClusterSize_t, EColumnType::kIndex>(model, 0)));
82 fPrincipalColumn = fColumns[0].get();
83 }
84
86 {
87 return ROOT::Experimental::Detail::RFieldValue(this, static_cast<std::size_t *>(where));
88 }
90 {
91 return ROOT::Experimental::Detail::RFieldValue(true /* captureFlag */, this, where);
92 }
93 size_t GetValueSize() const final { return sizeof(std::size_t); }
94
95 /// Get the number of elements of the collection identified by globalIndex
96 void
98 {
99 RClusterIndex collectionStart;
101 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
102 *value->Get<std::size_t>() = size;
103 }
104
105 /// Get the number of elements of the collection identified by clusterIndex
108 {
109 RClusterIndex collectionStart;
111 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
112 *value->Get<std::size_t>() = size;
113 }
114};
115
116/// Every RDF column is represented by exactly one RNTuple field
121
122 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
123 RFieldValue fValue; ///< The memory location used to read from fField
124 Long64_t fLastEntry; ///< Last entry number that was read
125
126public:
127 RNTupleColumnReader(std::unique_ptr<RFieldBase> f)
128 : fField(std::move(f)), fValue(fField->GenerateValue()), fLastEntry(-1)
129 {
130 }
131 ~RNTupleColumnReader() { fField->DestroyValue(fValue); }
132
133 /// Column readers are created as prototype and then cloned for every slot
134 std::unique_ptr<RNTupleColumnReader> Clone()
135 {
136 return std::make_unique<RNTupleColumnReader>(fField->Clone(fField->GetName()));
137 }
138
139 /// Connect the field and its subfields to the page source
140 void Connect(RPageSource &source)
141 {
142 fField->ConnectPageSource(source);
143 for (auto &f : *fField)
144 f.ConnectPageSource(source);
145 }
146
147 void *GetImpl(Long64_t entry) final
148 {
149 if (entry != fLastEntry) {
150 fField->Read(entry, &fValue);
151 fLastEntry = entry;
152 }
153 return fValue.GetRawPtr();
154 }
155};
156
157} // namespace Internal
158
/// Defaulted destructor, defined in this source file.
/// NOTE(review): presumably kept out of line because the column reader prototypes are held through
/// unique_ptr<Internal::RNTupleColumnReader>, a class that is only defined in this file -- confirm against the header.
159RNTupleDS::~RNTupleDS() = default;
160
161void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
162 std::vector<DescriptorId_t> skeinIDs)
163{
164 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
165 // using the following types and with a top-level field named "event" of type Event:
166 //
167 // struct Event {
168 // int id;
169 // std::vector<Track> tracks;
170 // };
171 // struct Track {
172 // std::vector<Hit> hits;
173 // };
174 // struct Hit {
175 // float x;
176 // float y;
177 // };
178 //
179 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
180 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
181 // tree of sub fields and expose the following RDF columns:
182 //
183 // "event" [Event]
184 // "event.id" [int]
185 // "event.tracks" [RVec<Track>]
186 // "R_rdf_sizeof_event.tracks" [unsigned int]
187 // "event.tracks.hits" [RVec<RVec<Hit>>]
188 // "R_rdf_sizeof_event.tracks.hits" [RVec<unsigned int>]
189 // "event.tracks.hits.x" [RVec<RVec<float>>]
190 // "R_rdf_sizeof_event.tracks.hits.x" [RVec<unsigned int>]
191 // "event.tracks.hits.y" [RVec<RVec<float>>]
192 // "R_rdf_sizeof_event.tracks.hits.y" [RVec<unsigned int>]
193
194 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
195 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
196 // Inner fields of collections are provided as projected collections of only that inner field,
197 // E.g. we provide a projected collection RVec<RVec<float>> for "event.tracks.hits.x" in the example
198 // above.
199
200 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
201 // skeinIDs would already contain the fieldID of "event.tracks"
202 skeinIDs.emplace_back(fieldId);
203
204 if (fieldDesc.GetTypeName().empty()) {
205 // Anonymous collection with one or several sub fields
206 auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
207 cardinalityField->SetOnDiskId(fieldId);
208 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
209 fColumnTypes.emplace_back(cardinalityField->GetType());
210 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
211 std::move(cardinalityField));
212 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
213
214 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
215 AddField(desc, std::string(colName) + "." + f.GetFieldName(), f.GetId(), skeinIDs);
216 }
217 } else {
218 // ROOT::RVec with exactly one sub field
219 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
220 AddField(desc, colName, f.GetId(), skeinIDs);
221 }
222 // Note that at the end of the recursion, we handled the inner sub collections as well as the
223 // collection as whole, so we are done.
224 return;
225 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
226 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
227 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
228 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
229 AddField(desc, innerName, f.GetId(), skeinIDs);
230 }
231 }
232
233 // The fieldID could be the root field or the class of fieldId might not be loaded.
234 // In these cases, only the inner fields are exposed as RDF columns.
235 auto fieldOrException = Detail::RFieldBase::Create("", fieldDesc.GetTypeName());
236 if (!fieldOrException)
237 return;
238 auto valueField = fieldOrException.Unwrap();
239 valueField->SetOnDiskId(fieldId);
240 std::unique_ptr<Detail::RFieldBase> cardinalityField;
241 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
242 if (!skeinIDs.empty()) {
243 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
244 cardinalityField->SetOnDiskId(skeinIDs.back());
245 }
246
247 for (auto i = skeinIDs.rbegin(); i != skeinIDs.rend(); ++i) {
248 valueField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(valueField));
249 valueField->SetOnDiskId(*i);
250 // Skip the inner-most collection level to construct the cardinality column
251 if (i != skeinIDs.rbegin()) {
252 cardinalityField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(cardinalityField));
253 cardinalityField->SetOnDiskId(*i);
254 }
255 }
256
257 if (cardinalityField) {
258 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
259 fColumnTypes.emplace_back(cardinalityField->GetType());
260 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
261 std::move(cardinalityField));
262 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
263 }
264
265 skeinIDs.emplace_back(fieldId);
266 fColumnNames.emplace_back(colName);
267 fColumnTypes.emplace_back(valueField->GetType());
268 auto valColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(std::move(valueField));
269 fColumnReaderPrototypes.emplace_back(std::move(valColReader));
270}
271
272RNTupleDS::RNTupleDS(std::unique_ptr<Detail::RPageSource> pageSource)
273{
274 pageSource->Attach();
275 auto descriptorGuard = pageSource->GetSharedDescriptorGuard();
276 fSources.emplace_back(std::move(pageSource));
277
278 AddField(descriptorGuard.GetRef(), "", descriptorGuard->GetFieldZeroId(), std::vector<DescriptorId_t>());
279}
280
281RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */)
282{
283 // This datasource uses the GetColumnReaders2 API instead (better name in the works)
284 return {};
285}
286
287std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
288RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
289{
290 // at this point we can assume that `name` will be found in fColumnNames, RDF is in charge validation
291 // TODO(jblomer): check incoming type
292 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
293 auto clone = fColumnReaderPrototypes[index]->Clone();
294 clone->Connect(*fSources[slot]);
295 return clone;
296}
297
299{
300 return true;
301}
302
303std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
304{
305 // TODO(jblomer): use cluster boundaries for the entry ranges
306 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
308 return ranges;
309
310 auto nEntries = fSources[0]->GetNEntries();
311 const auto chunkSize = nEntries / fNSlots;
312 const auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
313 auto start = 0UL;
314 auto end = 0UL;
315 for (auto i : ROOT::TSeqU(fNSlots)) {
316 start = end;
317 end += chunkSize;
318 ranges.emplace_back(start, end);
319 (void)i;
320 }
321 ranges.back().second += reminder;
322 fHasSeenAllRanges = true;
323 return ranges;
324}
325
326std::string RNTupleDS::GetTypeName(std::string_view colName) const
327{
328 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
329 return fColumnTypes[index];
330}
331
332bool RNTupleDS::HasColumn(std::string_view colName) const
333{
334 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
335}
336
// NOTE(review): the line carrying this function's signature is missing from this listing. The body re-arms
// GetEntryRanges() by clearing the flag that it sets after handing out the ranges; presumably this is
// RNTupleDS::Initialize() or RNTupleDS::Finalize() -- confirm against the original file.
338{
339 fHasSeenAllRanges = false;
340}
341
343
344void RNTupleDS::SetNSlots(unsigned int nSlots)
345{
346 R__ASSERT(fNSlots == 0);
347 R__ASSERT(nSlots > 0);
348 fNSlots = nSlots;
349
350 for (unsigned int i = 1; i < fNSlots; ++i) {
351 fSources.emplace_back(fSources[0]->Clone());
352 assert(i == (fSources.size() - 1));
353 fSources[i]->Attach();
354 }
355}
356} // namespace Experimental
357} // namespace ROOT
358
359ROOT::RDataFrame ROOT::RDF::Experimental::FromRNTuple(std::string_view ntupleName, std::string_view fileName)
360{
361 auto pageSource = ROOT::Experimental::Detail::RPageSource::Create(ntupleName, fileName);
362 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(std::move(pageSource)));
363 return rdf;
364}
365
// NOTE(review): the line carrying this function's signature is missing from this listing. The body builds
// an RDataFrame on top of an RNTupleDS whose page source comes from `ntuple`; presumably this is the
// FromRNTuple() overload taking an RNTuple pointer -- confirm against the original file.
367{
368 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(ntuple->MakePageSource()));
369 return rdf;
370}
371
372ROOT::RDataFrame ROOT::Experimental::MakeNTupleDataFrame(std::string_view ntupleName, std::string_view fileName)
373{
374 return ROOT::RDF::Experimental::FromRNTuple(ntupleName, fileName);
375}
376
377ROOT::RDataFrame ROOT::Experimental::MakeNTupleDataFrame(RNTuple *ntuple)
378{
380}
#define f(i)
Definition RSha256.hxx:104
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
#define R__ASSERT(e)
Definition TError.h:117
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
char name[80]
Definition TGX11.cxx:110
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition RColumn.hxx:264
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:193
std::vector< std::unique_ptr< RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition RField.hxx:122
RColumn * fPrincipalColumn
Points into fColumns.
Definition RField.hxx:120
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Every RDF column is represented by exactly one RNTuple field.
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Long64_t fLastEntry
Last entry number that was read.
std::unique_ptr< RNTupleColumnReader > Clone()
Column readers are created as prototype and then cloned for every slot.
RNTupleColumnReader(std::unique_ptr< RFieldBase > f)
RFieldValue fValue
The memory location used to read from fField.
void Connect(RPageSource &source)
Connect the field and its subfields to the page source.
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition RNTupleDS.cxx:59
ROOT::Experimental::Detail::RFieldValue CaptureValue(void *where) final
Creates a value from a memory location with an already constructed object.
Definition RNTupleDS.cxx:89
void GenerateColumnsImpl(const RNTupleDescriptor &) final
Creates the backing columns corresponding to the field type for reading.
Definition RNTupleDS.cxx:77
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition RNTupleDS.cxx:75
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by globalIndex.
Definition RNTupleDS.cxx:97
ROOT::Experimental::Detail::RFieldValue GenerateValue(void *where) final
Generates a tree value in a given location of size at least GetValueSize().
Definition RNTupleDS.cxx:85
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition RNTupleDS.cxx:93
void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, ROOT::Experimental::Detail::RFieldValue *value) final
Get the number of elements of the collection identified by clusterIndex.
std::unique_ptr< ROOT::Experimental::Detail::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition RNTupleDS.cxx:61
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of a column in a tree.
A field translates read and write calls from/to underlying columns to/from tree values.
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< DescriptorId_t > skeinIDs)
Provides the RDF column "colName" given the field identified by fieldID.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RNTupleColumnReader > > fColumnReaderPrototypes
We prepare a column reader prototype for every column.
Definition RNTupleDS.hxx:53
std::vector< std::unique_ptr< ROOT::Experimental::Detail::RPageSource > > fSources
Clones of the first source, one for each slot.
Definition RNTupleDS.hxx:48
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void Initialize() final
Convenience method called before starting an event-loop.
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:54
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
RNTupleDS(std::unique_ptr< ROOT::Experimental::Detail::RPageSource > pageSource)
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:55
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:496
std::unique_ptr< Detail::RPageSource > MakePageSource(const RNTupleReadOptions &options=RNTupleReadOptions())
Create a page source from the RNTuple object.
Definition RNTuple.cxx:414
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName)
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Wrap the 32bit integer in a struct in order to avoid template specialization clash with std::uint32_t...