Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
19#include <ROOT/RNTuple.hxx>
21#include <ROOT/RNTupleDS.hxx>
22#include <ROOT/RNTupleUtil.hxx>
23#include <ROOT/RPageStorage.hxx>
24#include <ROOT/RStringView.hxx>
25
26#include <TError.h>
27
28#include <string>
29#include <vector>
30#include <typeinfo>
31#include <utility>
32
33// clang-format off
34/**
35* \class ROOT::Experimental::RNTupleDS
36* \ingroup dataframe
37* \brief The RDataSource implementation for RNTuple. It lets RDataFrame read RNTuple data.
38*
39* An RDataFrame that reads RNTuple data can be constructed using FromRNTuple().
40*
41* For each column containing an array or a collection, a corresponding column `#colname` is available to access
42* `colname.size()` without reading and deserializing the collection values.
43*
44**/
45// clang-format on
46
47namespace ROOT {
48namespace Experimental {
49namespace Internal {
50
51/// An artificial field that transforms an RNTuple column that contains the offset of collections into
52/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
53/// `R_rdf_sizeof_jets` for a collection named `jets`.
54///
55/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
56/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
57/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
59protected:
60 std::unique_ptr<ROOT::Experimental::Detail::RFieldBase> CloneImpl(std::string_view /* newName */) const final
61 {
62 return std::make_unique<RRDFCardinalityField>();
63 }
64 void GenerateValue(void *where) const final { *static_cast<std::size_t *>(where) = 0; }
65
66public:
/// Name of the value type reported by this field.
static std::string TypeName()
{
   return std::string("std::size_t");
}
69 : ROOT::Experimental::Detail::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */) {}
73
75 {
76 static RColumnRepresentations representations(
78 {});
79 return representations;
80 }
81 // Field is only used for reading
82 void GenerateColumnsImpl() final { assert(false && "Cardinality fields must only be used for reading"); }
83 void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
84 {
85 auto onDiskTypes = EnsureCompatibleColumnTypes(desc);
86 fColumns.emplace_back(
87 ROOT::Experimental::Detail::RColumn::Create<ClusterSize_t>(RColumnModel(onDiskTypes[0]), 0));
88 }
89
90 size_t GetValueSize() const final { return sizeof(std::size_t); }
91 size_t GetAlignment() const final { return alignof(std::size_t); }
92
93 /// Get the number of elements of the collection identified by globalIndex
94 void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, void *to) final
95 {
96 RClusterIndex collectionStart;
98 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
99 *static_cast<std::size_t *>(to) = size;
100 }
101
102 /// Get the number of elements of the collection identified by clusterIndex
103 void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, void *to) final
104 {
105 RClusterIndex collectionStart;
107 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
108 *static_cast<std::size_t *>(to) = size;
109 }
110};
111
112/// Every RDF column is represented by exactly one RNTuple field
116
117 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
118 RFieldBase::RValue fValue; ///< The memory location used to read from fField
119 Long64_t fLastEntry; ///< Last entry number that was read
120
121public:
122 RNTupleColumnReader(std::unique_ptr<RFieldBase> f)
123 : fField(std::move(f)), fValue(fField->GenerateValue()), fLastEntry(-1)
124 {
125 }
127
128 /// Column readers are created as prototype and then cloned for every slot
129 std::unique_ptr<RNTupleColumnReader> Clone()
130 {
131 return std::make_unique<RNTupleColumnReader>(fField->Clone(fField->GetName()));
132 }
133
134 /// Connect the field and its subfields to the page source
135 void Connect(RPageSource &source)
136 {
137 fField->ConnectPageSource(source);
138 for (auto &f : *fField)
139 f.ConnectPageSource(source);
140 }
141
142 void *GetImpl(Long64_t entry) final
143 {
144 if (entry != fLastEntry) {
145 fValue.Read(entry);
146 fLastEntry = entry;
147 }
148 return fValue.GetRawPtr();
149 }
150};
151
152} // namespace Internal
153
// Out-of-line defaulted destructor.
// NOTE(review): presumably defined in this translation unit because the column reader prototypes owned by
// the data source are of a type that is only defined in this file -- confirm against RNTupleDS.hxx.
154RNTupleDS::~RNTupleDS() = default;
155
156void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
157 std::vector<DescriptorId_t> skeinIDs)
158{
159 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
160 // using the following types and with a top-level field named "event" of type Event:
161 //
162 // struct Event {
163 // int id;
164 // std::vector<Track> tracks;
165 // };
166 // struct Track {
167 // std::vector<Hit> hits;
168 // };
169 // struct Hit {
170 // float x;
171 // float y;
172 // };
173 //
174 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
175 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
176 // tree of sub fields and expose the following RDF columns:
177 //
178 // "event" [Event]
179 // "event.id" [int]
180 // "event.tracks" [RVec<Track>]
181 // "R_rdf_sizeof_event.tracks" [unsigned int]
182 // "event.tracks.hits" [RVec<RVec<Hit>>]
183 // "R_rdf_sizeof_event.tracks.hits" [RVec<unsigned int>]
184 // "event.tracks.hits.x" [RVec<RVec<float>>]
185 // "R_rdf_sizeof_event.tracks.hits.x" [RVec<unsigned int>]
186 // "event.tracks.hits.y" [RVec<RVec<float>>]
187 // "R_rdf_sizeof_event.tracks.hits.y" [RVec<unsigned int>]
188
189 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
190 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
191 // Inner fields of collections are provided as projected collections of only that inner field,
192 // E.g. we provide a projected collection RVec<RVec<float>> for "event.tracks.hits.x" in the example
193 // above.
194
195 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
196 // skeinIDs would already contain the fieldID of "event.tracks"
197 skeinIDs.emplace_back(fieldId);
198
199 if (fieldDesc.GetTypeName().empty()) {
200 // Anonymous collection with one or several sub fields
201 auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
202 cardinalityField->SetOnDiskId(fieldId);
203 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
204 fColumnTypes.emplace_back(cardinalityField->GetType());
205 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
206 std::move(cardinalityField));
207 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
208
209 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
210 AddField(desc, std::string(colName) + "." + f.GetFieldName(), f.GetId(), skeinIDs);
211 }
212 } else {
213 // ROOT::RVec with exactly one sub field
214 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
215 AddField(desc, colName, f.GetId(), skeinIDs);
216 }
217 // Note that at the end of the recursion, we handled the inner sub collections as well as the
218 // collection as whole, so we are done.
219 return;
220 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
221 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
222 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
223 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
224 AddField(desc, innerName, f.GetId(), skeinIDs);
225 }
226 }
227
228 // The fieldID could be the root field or the class of fieldId might not be loaded.
229 // In these cases, only the inner fields are exposed as RDF columns.
230 auto fieldOrException = Detail::RFieldBase::Create(fieldDesc.GetFieldName(), fieldDesc.GetTypeName());
231 if (!fieldOrException)
232 return;
233 auto valueField = fieldOrException.Unwrap();
234 valueField->SetOnDiskId(fieldId);
235 for (auto &f : *valueField) {
236 f.SetOnDiskId(desc.FindFieldId(f.GetName(), f.GetParent()->GetOnDiskId()));
237 }
238 std::unique_ptr<Detail::RFieldBase> cardinalityField;
239 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
240 if (!skeinIDs.empty()) {
241 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
242 cardinalityField->SetOnDiskId(skeinIDs.back());
243 }
244
245 for (auto i = skeinIDs.rbegin(); i != skeinIDs.rend(); ++i) {
246 valueField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(valueField));
247 valueField->SetOnDiskId(*i);
248 // Skip the inner-most collection level to construct the cardinality column
249 if (i != skeinIDs.rbegin()) {
250 cardinalityField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(cardinalityField));
251 cardinalityField->SetOnDiskId(*i);
252 }
253 }
254
255 if (cardinalityField) {
256 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
257 fColumnTypes.emplace_back(cardinalityField->GetType());
258 auto cardColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(
259 std::move(cardinalityField));
260 fColumnReaderPrototypes.emplace_back(std::move(cardColReader));
261 }
262
263 skeinIDs.emplace_back(fieldId);
264 fColumnNames.emplace_back(colName);
265 fColumnTypes.emplace_back(valueField->GetType());
266 auto valColReader = std::make_unique<ROOT::Experimental::Internal::RNTupleColumnReader>(std::move(valueField));
267 fColumnReaderPrototypes.emplace_back(std::move(valColReader));
268}
269
270RNTupleDS::RNTupleDS(std::unique_ptr<Detail::RPageSource> pageSource)
271{
272 pageSource->Attach();
273 auto descriptorGuard = pageSource->GetSharedDescriptorGuard();
274 fSources.emplace_back(std::move(pageSource));
275
276 AddField(descriptorGuard.GetRef(), "", descriptorGuard->GetFieldZeroId(), std::vector<DescriptorId_t>());
277}
278
279RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */)
280{
281 // This datasource uses the GetColumnReaders2 API instead (better name in the works)
282 return {};
283}
284
285std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
286RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
287{
288 // at this point we can assume that `name` will be found in fColumnNames, RDF is in charge validation
289 // TODO(jblomer): check incoming type
290 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
292 clone->Connect(*fSources[slot]);
293 return clone;
294}
295
297{
298 return true;
299}
300
301std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
302{
303 // TODO(jblomer): use cluster boundaries for the entry ranges
304 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
306 return ranges;
307
308 auto nEntries = fSources[0]->GetNEntries();
309 const auto chunkSize = nEntries / fNSlots;
310 const auto reminder = 1U == fNSlots ? 0 : nEntries % fNSlots;
311 auto start = 0UL;
312 auto end = 0UL;
313 for (auto i : ROOT::TSeqU(fNSlots)) {
314 start = end;
315 end += chunkSize;
316 ranges.emplace_back(start, end);
317 (void)i;
318 }
319 ranges.back().second += reminder;
320 fHasSeenAllRanges = true;
321 return ranges;
322}
323
324std::string RNTupleDS::GetTypeName(std::string_view colName) const
325{
326 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
327 return fColumnTypes[index];
328}
329
330bool RNTupleDS::HasColumn(std::string_view colName) const
331{
332 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
333}
334
336{
337 fHasSeenAllRanges = false;
338}
339
341
342void RNTupleDS::SetNSlots(unsigned int nSlots)
343{
344 R__ASSERT(fNSlots == 0);
345 R__ASSERT(nSlots > 0);
346 fNSlots = nSlots;
347
348 for (unsigned int i = 1; i < fNSlots; ++i) {
349 fSources.emplace_back(fSources[0]->Clone());
350 assert(i == (fSources.size() - 1));
351 fSources[i]->Attach();
352 }
353}
354} // namespace Experimental
355} // namespace ROOT
356
357ROOT::RDataFrame ROOT::RDF::Experimental::FromRNTuple(std::string_view ntupleName, std::string_view fileName)
358{
359 auto pageSource = ROOT::Experimental::Detail::RPageSource::Create(ntupleName, fileName);
360 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(std::move(pageSource)));
361 return rdf;
362}
363
365{
366 ROOT::RDataFrame rdf(std::make_unique<ROOT::Experimental::RNTupleDS>(ntuple->MakePageSource()));
367 return rdf;
368}
#define f(i)
Definition RSha256.hxx:104
TObject * clone(const char *newname) const override
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
#define R__ASSERT(e)
Definition TError.h:118
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
char name[80]
Definition TGX11.cxx:110
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition RColumn.hxx:278
Some fields have multiple possible column representations, e.g.
Definition RField.hxx:118
Points to an object with RNTuple I/O support and keeps a pointer to the corresponding field.
Definition RField.hxx:140
void Read(NTupleSize_t globalIndex)
Definition RField.hxx:184
A field translates read and write calls from/to underlying columns to/from tree values.
Definition RField.hxx:83
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:351
RFieldBase(std::string_view name, std::string_view type, ENTupleStructure structure, bool isSimple, std::size_t nRepetitions=0)
The constructor creates the underlying column objects and connects them to either a sink or a source.
Definition RField.cxx:320
std::vector< std::unique_ptr< RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition RField.hxx:346
const ColumnRepresentation_t & EnsureCompatibleColumnTypes(const RNTupleDescriptor &desc) const
Returns the on-disk column types found in the provided descriptor for fOnDiskId.
Definition RField.cxx:649
RColumn * fPrincipalColumn
Points into fColumns.
Definition RField.hxx:344
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
Every RDF column is represented by exactly one RNTuple field.
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Long64_t fLastEntry
Last entry number that was read.
std::unique_ptr< RNTupleColumnReader > Clone()
Column readers are created as prototype and then cloned for every slot.
RNTupleColumnReader(std::unique_ptr< RFieldBase > f)
RFieldBase::RValue fValue
The memory location used to read from fField.
void Connect(RPageSource &source)
Connect the field and its subfields to the page source.
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition RNTupleDS.cxx:58
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition RNTupleDS.cxx:82
const RColumnRepresentations & GetColumnRepresentations() const final
Implementations in derived classes should return a static RColumnRepresentations object.
Definition RNTupleDS.cxx:74
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition RNTupleDS.cxx:90
size_t GetAlignment() const final
As a rule of thumb, the alignment is equal to the size of the type.
Definition RNTupleDS.cxx:91
std::unique_ptr< ROOT::Experimental::Detail::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition RNTupleDS.cxx:60
void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
Creates the backing columns corresponding to the field type for reading.
Definition RNTupleDS.cxx:83
void GenerateValue(void *where) const final
Constructs value in a given location of size at least GetValueSize(). Called by the base class' Gener...
Definition RNTupleDS.cxx:64
void ReadInClusterImpl(const ROOT::Experimental::RClusterIndex &clusterIndex, void *to) final
Get the number of elements of the collection identified by clusterIndex.
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, void *to) final
Get the number of elements of the collection identified by globalIndex.
Definition RNTupleDS.cxx:94
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of an RNTuple column.
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< DescriptorId_t > skeinIDs)
Provides the RDF column "colName" given the field identified by fieldID.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RNTupleColumnReader > > fColumnReaderPrototypes
We prepare a column reader prototype for every column.
Definition RNTupleDS.hxx:53
std::vector< std::unique_ptr< ROOT::Experimental::Detail::RPageSource > > fSources
Clones of the first source, one for each slot.
Definition RNTupleDS.hxx:48
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void Initialize() final
Convenience method called before starting an event-loop.
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:54
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
RNTupleDS(std::unique_ptr< ROOT::Experimental::Detail::RPageSource > pageSource)
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:55
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
DescriptorId_t FindFieldId(std::string_view fieldName, DescriptorId_t parentId) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:512
std::unique_ptr< Detail::RPageSource > MakePageSource(const RNTupleReadOptions &options=RNTupleReadOptions())
Create a page source from the RNTuple object.
Definition RNTuple.cxx:383
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
virtual TObject * Clone(const char *newname="") const
Make a clone of an object using the Streamer facility.
Definition TObject.cxx:223
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName)
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Wrap the integer in a struct in order to avoid template specialization clash with std::uint32_t.