Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleDS.cxx
Go to the documentation of this file.
1/// \file RNTupleDS.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \author Enrico Guiraud <enrico.guiraud@cern.ch>
5/// \date 2018-10-04
6/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
7/// is welcome!
8
9/*************************************************************************
10 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. *
11 * All rights reserved. *
12 * *
13 * For the licensing terms see $ROOTSYS/LICENSE. *
14 * For the list of contributors see $ROOTSYS/README/CREDITS. *
15 *************************************************************************/
16
18#include <ROOT/RField.hxx>
21#include <ROOT/RNTupleDS.hxx>
22#include <ROOT/RNTupleUtil.hxx>
23#include <ROOT/RPageStorage.hxx>
24#include <string_view>
25
26#include <TError.h>
27
28#include <cassert>
29#include <memory>
30#include <string>
31#include <vector>
32#include <typeinfo>
33#include <utility>
34
35// clang-format off
36/**
37* \class ROOT::Experimental::RNTupleDS
38* \ingroup dataframe
39* \brief The RDataSource implementation for RNTuple. It lets RDataFrame read RNTuple data.
40*
41* An RDataFrame that reads RNTuple data can be constructed using FromRNTuple().
42*
43* For each column containing an array or a collection, a corresponding column `#colname` is available to access
44* `colname.size()` without reading and deserializing the collection values.
45*
46**/
47// clang-format on
48
49namespace ROOT {
50namespace Experimental {
51namespace Internal {
52
53/// An artificial field that transforms an RNTuple column that contains the offset of collections into
54/// collection sizes. It is used to provide the "number of" RDF columns for collections, e.g.
55/// `R_rdf_sizeof_jets` for a collection named `jets`.
56///
57/// This field owns the collection offset field but instead of exposing the collection offsets it exposes
58/// the collection sizes (offset(N+1) - offset(N)). For the time being, we offer this functionality only in RDataFrame.
59/// TODO(jblomer): consider providing a general set of useful virtual fields as part of RNTuple.
61protected:
62 std::unique_ptr<ROOT::Experimental::RFieldBase> CloneImpl(std::string_view /* newName */) const final
63 {
64 return std::make_unique<RRDFCardinalityField>();
65 }
66 void ConstructValue(void *where) const final { *static_cast<std::size_t *>(where) = 0; }
67
68public:
69 static std::string TypeName() { return "std::size_t"; }
71 : ROOT::Experimental::RFieldBase("", TypeName(), ENTupleStructure::kLeaf, false /* isSimple */)
72 {
73 }
77
79 {
80 static RColumnRepresentations representations(
82 {});
83 return representations;
84 }
85 // Field is only used for reading
86 void GenerateColumnsImpl() final { assert(false && "Cardinality fields must only be used for reading"); }
87 void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
88 {
89 auto onDiskTypes = EnsureCompatibleColumnTypes(desc);
90 fColumns.emplace_back(
91 ROOT::Experimental::Internal::RColumn::Create<ClusterSize_t>(RColumnModel(onDiskTypes[0]), 0));
92 }
93
94 size_t GetValueSize() const final { return sizeof(std::size_t); }
95 size_t GetAlignment() const final { return alignof(std::size_t); }
96
97 /// Get the number of elements of the collection identified by globalIndex
98 void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, void *to) final
99 {
100 RClusterIndex collectionStart;
102 fPrincipalColumn->GetCollectionInfo(globalIndex, &collectionStart, &size);
103 *static_cast<std::size_t *>(to) = size;
104 }
105
106 /// Get the number of elements of the collection identified by clusterIndex
107 void ReadInClusterImpl(ROOT::Experimental::RClusterIndex clusterIndex, void *to) final
108 {
109 RClusterIndex collectionStart;
111 fPrincipalColumn->GetCollectionInfo(clusterIndex, &collectionStart, &size);
112 *static_cast<std::size_t *>(to) = size;
113 }
114};
115
116/**
117 * @brief An artificial field that provides the size of a fixed-size array
118 *
119 * This is the implementation of `R_rdf_sizeof_column` in case `column` contains
120 * fixed-size arrays on disk.
121 */
123private:
124 std::size_t fArrayLength;
125
126 std::unique_ptr<ROOT::Experimental::RFieldBase> CloneImpl(std::string_view) const final
127 {
128 return std::make_unique<RArraySizeField>(fArrayLength);
129 }
130 void GenerateColumnsImpl() final { assert(false && "RArraySizeField fields must only be used for reading"); }
132 void ReadGlobalImpl(NTupleSize_t /*globalIndex*/, void *to) final { *static_cast<std::size_t *>(to) = fArrayLength; }
133 void ReadInClusterImpl(RClusterIndex /*clusterIndex*/, void *to) final
134 {
135 *static_cast<std::size_t *>(to) = fArrayLength;
136 }
137
138public:
139 RArraySizeField(std::size_t arrayLength)
140 : ROOT::Experimental::RFieldBase("", "std::size_t", ENTupleStructure::kLeaf, false /* isSimple */),
141 fArrayLength(arrayLength)
142 {
143 }
144 RArraySizeField(const RArraySizeField &other) = delete;
148 ~RArraySizeField() final = default;
149
150 void ConstructValue(void *where) const final { *static_cast<std::size_t *>(where) = 0; }
151 std::size_t GetValueSize() const final { return sizeof(std::size_t); }
152 std::size_t GetAlignment() const final { return alignof(std::size_t); }
153};
154
155/// Every RDF column is represented by exactly one RNTuple field
159
160 RNTupleDS *fDataSource; ///< The data source that owns this column reader
161 RFieldBase *fProtoField; ///< The prototype field from which fField is cloned
162 std::unique_ptr<RFieldBase> fField; ///< The field backing the RDF column
163 std::unique_ptr<RFieldBase::RValue> fValue; ///< The memory location used to read from fField
164 std::shared_ptr<void> fValuePtr; ///< Used to reuse the object created by fValue when reconnecting sources
165 Long64_t fLastEntry = -1; ///< Last entry number that was read
166 /// For chains, the logical entry and the physical entry in any particular file can be different.
167 /// The entry offset stores the logical entry number (sum of all previous physical entries) when a file of the corresponding
168 /// data source was opened.
170
171public:
172 RNTupleColumnReader(RNTupleDS *ds, RFieldBase *protoField) : fDataSource(ds), fProtoField(protoField) {}
174
175 /// Connect the field and its subfields to the page source
176 void Connect(RPageSource &source, Long64_t entryOffset)
177 {
178 assert(fLastEntry == -1);
179 fEntryOffset = entryOffset;
180
181 // Create a new, real field from the prototype and set its field ID in the context of the given page source
183 {
184 auto descGuard = source.GetSharedDescriptorGuard();
185 // Set the on-disk field IDs for the field and the subfield
186 fField->SetOnDiskId(
187 descGuard->FindFieldId(fDataSource->fFieldId2QualifiedName.at(fProtoField->GetOnDiskId())));
188 auto iProto = fProtoField->cbegin();
189 auto iReal = fField->begin();
190 for (; iReal != fField->end(); ++iProto, ++iReal) {
191 iReal->SetOnDiskId(descGuard->FindFieldId(fDataSource->fFieldId2QualifiedName.at(iProto->GetOnDiskId())));
192 }
193 }
194
196
197 if (fValuePtr) {
198 // When the reader reconnects to a new file, the fValuePtr is already set
199 fValue = std::make_unique<RFieldBase::RValue>(fField->BindValue(fValuePtr));
200 fValuePtr = nullptr;
201 } else {
202 // For the first file, create a new object for this field (reader)
203 fValue = std::make_unique<RFieldBase::RValue>(fField->CreateValue());
204 }
205 }
206
207 void Disconnect(bool keepValue)
208 {
209 if (fValue && keepValue) {
210 fValuePtr = fValue->GetPtr<void>();
211 }
212 fValue = nullptr;
213 fField = nullptr;
214 fLastEntry = -1;
215 }
216
217 void *GetImpl(Long64_t entry) final
218 {
219 if (entry != fLastEntry) {
220 fValue->Read(entry - fEntryOffset);
221 fLastEntry = entry;
222 }
223 return fValue->GetPtr<void>().get();
224 }
225};
226
227} // namespace Internal
228
229RNTupleDS::~RNTupleDS() = default;
230
231void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
232 std::vector<RNTupleDS::RFieldInfo> fieldInfos)
233{
234 // As an example for the mapping of RNTuple fields to RDF columns, let's consider an RNTuple
235 // using the following types and with a top-level field named "event" of type Event:
236 //
237 // struct Event {
238 // int id;
239 // std::vector<Track> tracks;
240 // };
241 // struct Track {
242 // std::vector<Hit> hits;
243 // };
244 // struct Hit {
245 // float x;
246 // float y;
247 // };
248 //
249 // AddField() will be called from the constructor with the RNTuple root field (ENTupleStructure::kRecord).
250 // From there, we recurse into the "event" sub field (also ENTupleStructure::kRecord) and further down the
251 // tree of sub fields and expose the following RDF columns:
252 //
253 // "event" [Event]
254 // "event.id" [int]
255 // "event.tracks" [RVec<Track>]
256 // "R_rdf_sizeof_event.tracks" [unsigned int]
257 // "event.tracks.hits" [RVec<RVec<Hit>>]
258 // "R_rdf_sizeof_event.tracks.hits" [RVec<unsigned int>]
259 // "event.tracks.hits.x" [RVec<RVec<float>>]
260 // "R_rdf_sizeof_event.tracks.hits.x" [RVec<unsigned int>]
261 // "event.tracks.hits.y" [RVec<RVec<float>>]
262 // "R_rdf_sizeof_event.tracks.hits.y" [RVec<unsigned int>]
263
264 const auto &fieldDesc = desc.GetFieldDescriptor(fieldId);
265 const auto &nRepetitions = fieldDesc.GetNRepetitions();
266 if ((fieldDesc.GetStructure() == ENTupleStructure::kCollection) || (nRepetitions > 0)) {
267 // The field is a collection or a fixed-size array.
268 // We open a new collection scope with fieldID being the inner most collection. E.g. for "event.tracks.hits",
269 // fieldInfos would already contain the fieldID of "event.tracks"
270 fieldInfos.emplace_back(fieldId, nRepetitions);
271 }
272
273 if (fieldDesc.GetStructure() == ENTupleStructure::kCollection) {
274 // Inner fields of collections are provided as projected collections of only that inner field,
275 // E.g. we provide a projected collection RVec<RVec<float>> for "event.tracks.hits.x" in the example
276 // above.
277
278 if (fieldDesc.GetTypeName().empty()) {
279 // Anonymous collection with one or several sub fields
280 auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
281 cardinalityField->SetOnDiskId(fieldId);
282 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
283 fColumnTypes.emplace_back(cardinalityField->GetTypeName());
284 fProtoFields.emplace_back(std::move(cardinalityField));
285
286 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
287 AddField(desc, std::string(colName) + "." + f.GetFieldName(), f.GetId(), fieldInfos);
288 }
289 } else {
290 // ROOT::RVec with exactly one sub field
291 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
292 AddField(desc, colName, f.GetId(), fieldInfos);
293 }
294 // Note that at the end of the recursion, we handled the inner sub collections as well as the
295 // collection as whole, so we are done.
296 return;
297
298 } else if (nRepetitions > 0) {
299 // Fixed-size array, same logic as ROOT::RVec.
300 const auto &f = *desc.GetFieldIterable(fieldDesc.GetId()).begin();
301 AddField(desc, colName, f.GetId(), fieldInfos);
302 return;
303 } else if (fieldDesc.GetStructure() == ENTupleStructure::kRecord) {
304 // Inner fields of records are provided as individual RDF columns, e.g. "event.id"
305 for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
306 auto innerName = colName.empty() ? f.GetFieldName() : (std::string(colName) + "." + f.GetFieldName());
307 AddField(desc, innerName, f.GetId(), fieldInfos);
308 }
309 }
310
311 // The fieldID could be the root field or the class of fieldId might not be loaded.
312 // In these cases, only the inner fields are exposed as RDF columns.
313 auto fieldOrException = RFieldBase::Create(fieldDesc.GetFieldName(), fieldDesc.GetTypeName());
314 if (!fieldOrException)
315 return;
316 auto valueField = fieldOrException.Unwrap();
317 valueField->SetOnDiskId(fieldId);
318 for (auto &f : *valueField) {
319 f.SetOnDiskId(desc.FindFieldId(f.GetFieldName(), f.GetParent()->GetOnDiskId()));
320 }
321 std::unique_ptr<RFieldBase> cardinalityField;
322 // Collections get the additional "number of" RDF column (e.g. "R_rdf_sizeof_tracks")
323 if (!fieldInfos.empty()) {
324 const auto &info = fieldInfos.back();
325 if (info.fNRepetitions > 0) {
326 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RArraySizeField>(info.fNRepetitions);
327 } else {
328 cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
329 }
330 cardinalityField->SetOnDiskId(info.fFieldId);
331 }
332
333 for (auto i = fieldInfos.rbegin(); i != fieldInfos.rend(); ++i) {
334 const auto &fieldInfo = *i;
335
336 if (fieldInfo.fNRepetitions > 0) {
337 // Fixed-size array, read it as ROOT::RVec in memory
338 valueField =
339 std::make_unique<ROOT::Experimental::RArrayAsRVecField>("", std::move(valueField), fieldInfo.fNRepetitions);
340 } else {
341 // Actual ROOT::RVec
342 valueField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(valueField));
343 }
344
345 valueField->SetOnDiskId(fieldInfo.fFieldId);
346
347 // Skip the inner-most collection level to construct the cardinality column
348 // It's taken care of by the `if (!fieldInfos.empty())` scope above
349 if (i != fieldInfos.rbegin()) {
350 if (fieldInfo.fNRepetitions > 0) {
351 // This collection level refers to a fixed-size array
352 cardinalityField = std::make_unique<ROOT::Experimental::RArrayAsRVecField>("", std::move(cardinalityField),
353 fieldInfo.fNRepetitions);
354 } else {
355 // This collection level refers to an RVec
356 cardinalityField = std::make_unique<ROOT::Experimental::RRVecField>("", std::move(cardinalityField));
357 }
358
359 cardinalityField->SetOnDiskId(fieldInfo.fFieldId);
360 }
361 }
362
363 if (cardinalityField) {
364 fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
365 fColumnTypes.emplace_back(cardinalityField->GetTypeName());
366 fProtoFields.emplace_back(std::move(cardinalityField));
367 }
368
369 fieldInfos.emplace_back(fieldId, nRepetitions);
370 fColumnNames.emplace_back(colName);
371 fColumnTypes.emplace_back(valueField->GetTypeName());
372 fProtoFields.emplace_back(std::move(valueField));
373}
374
375RNTupleDS::RNTupleDS(std::unique_ptr<Internal::RPageSource> pageSource) : fPrincipalSource(std::move(pageSource))
376{
377 fPrincipalSource->Attach();
378 fPrincipalDescriptor = fPrincipalSource->GetSharedDescriptorGuard()->Clone();
379
380 AddField(*fPrincipalDescriptor, "", fPrincipalDescriptor->GetFieldZeroId(),
381 std::vector<ROOT::Experimental::RNTupleDS::RFieldInfo>());
382}
383
384RNTupleDS::RNTupleDS(std::string_view ntupleName, std::string_view fileName)
385 : RNTupleDS(ROOT::Experimental::Internal::RPageSource::Create(ntupleName, fileName))
386{
387}
388
390 : RNTupleDS(ROOT::Experimental::Internal::RPageSourceFile::CreateFromAnchor(*ntuple))
391{
392}
393
394RNTupleDS::RNTupleDS(std::string_view ntupleName, const std::vector<std::string> &fileNames)
395 : RNTupleDS(Internal::RPageSource::Create(ntupleName, fileNames[0]))
396{
397 fNTupleName = ntupleName;
398 fFileNames = fileNames;
399}
400
401RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */)
402{
403 // This datasource uses the newer GetColumnReaders() API
404 return {};
405}
406
407std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
408RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
409{
410 // At this point we can assume that `name` will be found in fColumnNames
411 // TODO(jblomer): check incoming type
412 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
413 auto field = fProtoFields[index].get();
414
415 // Map the field's and subfields' IDs to qualified names so that we can later connect the fields to
416 // other page sources from the chain
417 fFieldId2QualifiedName[field->GetOnDiskId()] = fPrincipalDescriptor->GetQualifiedFieldName(field->GetOnDiskId());
418 for (const auto &s : *field) {
419 fFieldId2QualifiedName[s.GetOnDiskId()] = fPrincipalDescriptor->GetQualifiedFieldName(s.GetOnDiskId());
420 }
421
422 auto reader = std::make_unique<Internal::RNTupleColumnReader>(this, field);
423 fActiveColumnReaders[slot].emplace_back(reader.get());
424
425 return reader;
426}
427
429{
430 // Old API, unused
431 return true;
432}
433
435{
436 assert(fNextRanges.empty());
437 auto nFiles = fFileNames.empty() ? 1 : fFileNames.size();
438 auto nRemainingFiles = nFiles - fNextFileIndex;
439 if (nRemainingFiles == 0)
440 return;
441
442 // Easy work scheduling: one file per slot. We skip empty files (files without entries).
443 if (nRemainingFiles >= fNSlots) {
444 while ((fNextRanges.size() < fNSlots) && (fNextFileIndex < nFiles)) {
445 REntryRangeDS range;
446
447 if (fPrincipalSource) {
448 // Avoid reopening the first file, which has been opened already to read the schema
449 assert(fNextFileIndex == 0);
450 std::swap(fPrincipalSource, range.fSource);
451 } else {
453 range.fSource->Attach();
454 }
456
457 auto nEntries = range.fSource->GetNEntries();
458 if (nEntries == 0)
459 continue;
460
461 range.fLastEntry = nEntries; // whole file per slot, i.e. entry range [0..nEntries - 1]
462 fNextRanges.emplace_back(std::move(range));
463 }
464 return;
465 }
466
467 // Work scheduling of the tail: multiple slots work on the same file.
468 // Every slot still has its own page source but these page sources may open the same file.
469 // Again, we need to skip empty files.
470 unsigned int nSlotsPerFile = fNSlots / nRemainingFiles;
471 for (std::size_t i = 0; (fNextRanges.size() < fNSlots) && (fNextFileIndex < nFiles); ++i) {
472 std::unique_ptr<Internal::RPageSource> source;
473 if (fPrincipalSource) {
474 // Avoid reopening the first file, which has been opened already to read the schema
475 assert(fNextFileIndex == 0);
476 std::swap(source, fPrincipalSource);
477 } else {
479 source->Attach();
480 }
482
483 auto nEntries = source->GetNEntries();
484 if (nEntries == 0)
485 continue;
486
487 // If last file: use all remaining slots
488 if (i == (nRemainingFiles - 1))
489 nSlotsPerFile = fNSlots - fNextRanges.size();
490
491 std::vector<std::pair<ULong64_t, ULong64_t>> rangesByCluster;
492 {
493 auto descriptorGuard = source->GetSharedDescriptorGuard();
494 auto clusterId = descriptorGuard->FindClusterId(0, 0);
495 while (clusterId != kInvalidDescriptorId) {
496 const auto &clusterDesc = descriptorGuard->GetClusterDescriptor(clusterId);
497 rangesByCluster.emplace_back(std::make_pair<ULong64_t, ULong64_t>(
498 clusterDesc.GetFirstEntryIndex(), clusterDesc.GetFirstEntryIndex() + clusterDesc.GetNEntries()));
499 clusterId = descriptorGuard->FindNextClusterId(clusterId);
500 }
501 }
502 const unsigned int nRangesByCluster = rangesByCluster.size();
503
504 // Distribute slots equidistantly over the entry range, aligned on cluster boundaries
505 const auto nClustersPerSlot = nRangesByCluster / nSlotsPerFile;
506 const auto remainder = nRangesByCluster % nSlotsPerFile;
507 std::size_t iRange = 0;
508 unsigned int iSlot = 0;
509 const unsigned int N = std::min(nSlotsPerFile, nRangesByCluster);
510 for (; iSlot < N; ++iSlot) {
511 auto start = rangesByCluster[iRange].first;
512 iRange += nClustersPerSlot + static_cast<int>(iSlot < remainder);
513 assert(iRange > 0);
514 auto end = rangesByCluster[iRange - 1].second;
515
516 REntryRangeDS range;
517 // The last range for this file just takes the already opened page source. All previous ranges clone.
518 if (iSlot == N - 1) {
519 range.fSource = std::move(source);
520 } else {
521 range.fSource = source->Clone();
522 range.fSource->Attach();
523 }
524 range.fSource->SetEntryRange({start, end - start});
525 range.fFirstEntry = start;
526 range.fLastEntry = end;
527 fNextRanges.emplace_back(std::move(range));
528 }
529 } // loop over tail of remaining files
530}
531
532std::vector<std::pair<ULong64_t, ULong64_t>> RNTupleDS::GetEntryRanges()
533{
534 std::vector<std::pair<ULong64_t, ULong64_t>> ranges;
535 if (fNextRanges.empty())
536 return ranges;
537 assert(fNextRanges.size() <= fNSlots);
538
539 // We need to distinguish between single threaded and multi-threaded runs.
540 // In single threaded mode, InitSlot is only called once and column readers have to be rewired
541 // to new page sources of the chain in GetEntryRanges. In multi-threaded mode, on the other hand,
542 // InitSlot is called for every returned range, thus rewiring the column readers takes place in
543 // InitSlot and FinalizeSlot.
544
545 if (fNSlots == 1) {
546 for (auto r : fActiveColumnReaders[0]) {
547 r->Disconnect(true /* keepValue */);
548 }
549 }
550
551 fCurrentRanges.clear();
552 std::swap(fCurrentRanges, fNextRanges);
554
555 // Create ranges for the RDF loop manager from the list of REntryRangeDS records.
556 // The entry ranges that are relative to the page source in REntryRangeDS are translated into absolute
557 // entry ranges, given the current state of the entry cursor.
558 // We remember the connection from first absolute entry index of a range to its REntryRangeDS record
559 // so that we can properly rewire the column reader in InitSlot
560 fFirstEntry2RangeIdx.clear();
561 ULong64_t nEntriesPerSource = 0;
562 for (std::size_t i = 0; i < fCurrentRanges.size(); ++i) {
563 // Several consecutive ranges may operate on the same file (each with their own page source clone).
564 // We can detect a change of file when the first entry number jumps back to 0.
565 if (fCurrentRanges[i].fFirstEntry == 0) {
566 // New source
567 fSeenEntries += nEntriesPerSource;
568 nEntriesPerSource = 0;
569 }
570 auto start = fCurrentRanges[i].fFirstEntry + fSeenEntries;
571 auto end = fCurrentRanges[i].fLastEntry + fSeenEntries;
572 nEntriesPerSource += end - start;
573
574 fFirstEntry2RangeIdx[start] = i;
575 ranges.emplace_back(start, end);
576 }
577 fSeenEntries += nEntriesPerSource;
578
579 if ((fNSlots == 1) && (fCurrentRanges[0].fSource)) {
580 for (auto r : fActiveColumnReaders[0]) {
581 r->Connect(*fCurrentRanges[0].fSource, ranges[0].first);
582 }
583 }
584
585 return ranges;
586}
587
588void RNTupleDS::InitSlot(unsigned int slot, ULong64_t firstEntry)
589{
590 if (fNSlots == 1)
591 return;
592
593 auto idxRange = fFirstEntry2RangeIdx.at(firstEntry);
594 for (auto r : fActiveColumnReaders[slot]) {
595 r->Connect(*fCurrentRanges[idxRange].fSource, firstEntry - fCurrentRanges[idxRange].fFirstEntry);
596 }
597}
598
599void RNTupleDS::FinalizeSlot(unsigned int slot)
600{
601 if (fNSlots == 1)
602 return;
603
604 for (auto r : fActiveColumnReaders[slot]) {
605 r->Disconnect(true /* keepValue */);
606 }
607}
608
609std::string RNTupleDS::GetTypeName(std::string_view colName) const
610{
611 const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), colName));
612 return fColumnTypes[index];
613}
614
615bool RNTupleDS::HasColumn(std::string_view colName) const
616{
617 return std::find(fColumnNames.begin(), fColumnNames.end(), colName) != fColumnNames.end();
618}
619
621{
622 fSeenEntries = 0;
623 fNextFileIndex = 0;
624 if (!fCurrentRanges.empty() && (fFileNames.size() <= fNSlots)) {
625 assert(fNextRanges.empty());
626 std::swap(fCurrentRanges, fNextRanges);
627 fNextFileIndex = std::max(fFileNames.size(), std::size_t(1));
628 } else {
630 }
631}
632
634{
635 for (unsigned int i = 0; i < fNSlots; ++i) {
636 for (auto r : fActiveColumnReaders[i]) {
637 r->Disconnect(false /* keepValue */);
638 }
639 }
640}
641
642void RNTupleDS::SetNSlots(unsigned int nSlots)
643{
644 assert(fNSlots == 0);
645 assert(nSlots > 0);
646 fNSlots = nSlots;
648}
649} // namespace Experimental
650} // namespace ROOT
651
652ROOT::RDataFrame ROOT::RDF::Experimental::FromRNTuple(std::string_view ntupleName, std::string_view fileName)
653{
654 return ROOT::RDataFrame(std::make_unique<ROOT::Experimental::RNTupleDS>(ntupleName, fileName));
655}
656
658ROOT::RDF::Experimental::FromRNTuple(std::string_view ntupleName, const std::vector<std::string> &fileNames)
659{
660 return ROOT::RDataFrame(std::make_unique<ROOT::Experimental::RNTupleDS>(ntupleName, fileNames));
661}
662
664{
665 return ROOT::RDataFrame(std::make_unique<ROOT::Experimental::RNTupleDS>(ntuple));
666}
#define f(i)
Definition RSha256.hxx:104
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
#define N
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
char name[80]
Definition TGX11.cxx:110
An artificial field that provides the size of a fixed-size array.
RArraySizeField & operator=(RArraySizeField &&other)=default
std::unique_ptr< ROOT::Experimental::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
void ReadGlobalImpl(NTupleSize_t, void *to) final
void ConstructValue(void *where) const final
Constructs value in a given location of size at least GetValueSize(). Called by the base class' Creat...
RArraySizeField(RArraySizeField &&other)=default
std::size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
RArraySizeField(const RArraySizeField &other)=delete
void ReadInClusterImpl(RClusterIndex, void *to) final
std::size_t GetAlignment() const final
As a rule of thumb, the alignment is equal to the size of the type.
void GenerateColumnsImpl(const ROOT::Experimental::RNTupleDescriptor &) final
Creates the backing columns corresponding to the field type for reading.
RArraySizeField & operator=(const RArraySizeField &other)=delete
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
void GetCollectionInfo(const NTupleSize_t globalIndex, RClusterIndex *collectionStart, ClusterSize_t *collectionSize)
For offset columns only, look at the two adjacent values that define a collection's coordinates.
Definition RColumn.hxx:282
Every RDF column is represented by exactly one RNTuple field.
std::unique_ptr< RFieldBase > fField
The field backing the RDF column.
Long64_t fEntryOffset
For chains, the logical entry and the physical entry in any particular file can be different.
std::unique_ptr< RFieldBase::RValue > fValue
The memory location used to read from fField.
RNTupleColumnReader(RNTupleDS *ds, RFieldBase *protoField)
Long64_t fLastEntry
Last entry number that was read.
RNTupleDS * fDataSource
The data source that owns this column reader.
void Connect(RPageSource &source, Long64_t entryOffset)
Connect the field and its subfields to the page source.
std::shared_ptr< void > fValuePtr
Used to reuse the object created by fValue when reconnecting sources.
RFieldBase * fProtoField
The prototype field from which fField is cloned.
Abstract interface to read data from an ntuple.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
const RSharedDescriptorGuard GetSharedDescriptorGuard() const
Takes the read lock for the descriptor.
An artificial field that transforms an RNTuple column that contains the offset of collections into co...
Definition RNTupleDS.cxx:60
std::unique_ptr< ROOT::Experimental::RFieldBase > CloneImpl(std::string_view) const final
Called by Clone(), which additionally copies the on-disk ID.
Definition RNTupleDS.cxx:62
void ReadInClusterImpl(ROOT::Experimental::RClusterIndex clusterIndex, void *to) final
Get the number of elements of the collection identified by clusterIndex.
RRDFCardinalityField(RRDFCardinalityField &&other)=default
void GenerateColumnsImpl() final
Creates the backing columns corresponding to the field type for writing.
Definition RNTupleDS.cxx:86
const RColumnRepresentations & GetColumnRepresentations() const final
Implementations in derived classes should return a static RColumnRepresentations object.
Definition RNTupleDS.cxx:78
RRDFCardinalityField & operator=(RRDFCardinalityField &&other)=default
size_t GetValueSize() const final
The number of bytes taken by a value of the appropriate type.
Definition RNTupleDS.cxx:94
size_t GetAlignment() const final
As a rule of thumb, the alignment is equal to the size of the type.
Definition RNTupleDS.cxx:95
void ConstructValue(void *where) const final
Constructs value in a given location of size at least GetValueSize(). Called by the base class' Creat...
Definition RNTupleDS.cxx:66
void GenerateColumnsImpl(const RNTupleDescriptor &desc) final
Creates the backing columns corresponding to the field type for reading.
Definition RNTupleDS.cxx:87
void ReadGlobalImpl(ROOT::Experimental::NTupleSize_t globalIndex, void *to) final
Get the number of elements of the collection identified by globalIndex.
Definition RNTupleDS.cxx:98
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
Holds the static meta-data of an RNTuple column.
Some fields have multiple possible column representations, e.g.
Definition RField.hxx:167
A field translates read and write calls from/to underlying columns to/from tree values.
Definition RField.hxx:95
std::string GetFieldName() const
Definition RField.hxx:665
const ColumnRepresentation_t & EnsureCompatibleColumnTypes(const RNTupleDescriptor &desc) const
Returns the on-disk column types found in the provided descriptor for fOnDiskId.
Definition RField.cxx:938
std::vector< std::unique_ptr< Internal::RColumn > > fColumns
The columns are connected either to a sink or to a source (not to both); they are owned by the field.
Definition RField.hxx:400
RConstSchemaIterator cbegin() const
Definition RField.hxx:706
std::unique_ptr< RFieldBase > Clone(std::string_view newName) const
Copies the field and its sub fields using a possibly new name and a new, unconnected set of columns.
Definition RField.cxx:791
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias, bool fContinueOnError=false)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:566
Internal::RColumn * fPrincipalColumn
Points into fColumns.
Definition RField.hxx:398
DescriptorId_t GetOnDiskId() const
Definition RField.hxx:682
The RDataSource implementation for RNTuple.
Definition RNTupleDS.hxx:43
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void Initialize() final
Convenience method called before starting an event-loop.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< RFieldInfo > fieldInfos)
Provides the RDF column "colName" given the field identified by fieldId.
std::size_t fNextFileIndex
Index into fFileNames to the next file to process.
Definition RNTupleDS.hxx:66
std::vector< std::string > fColumnNames
Definition RNTupleDS.hxx:77
std::unordered_map< ULong64_t, std::size_t > fFirstEntry2RangeIdx
Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index in fCurrentRanges.
Definition RNTupleDS.hxx:90
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
std::unordered_map< ROOT::Experimental::DescriptorId_t, std::string > fFieldId2QualifiedName
Connects the IDs of active proto fields and their subfields to their fully qualified name (a.b.c.d).
Definition RNTupleDS.hxx:76
void FinalizeSlot(unsigned int slot) final
Convenience method called at the end of the data processing associated to a slot.
std::vector< std::vector< Internal::RNTupleColumnReader * > > fActiveColumnReaders
List of column readers returned by GetColumnReaders() organized by slot.
Definition RNTupleDS.hxx:81
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::unique_ptr< Internal::RPageSource > fPrincipalSource
The first source is used to extract the schema and build the prototype fields.
Definition RNTupleDS.hxx:59
std::vector< REntryRangeDS > fCurrentRanges
Basis for the ranges returned by the last GetEntryRanges() call.
Definition RNTupleDS.hxx:85
std::unique_ptr< RNTupleDescriptor > fPrincipalDescriptor
A clone of the first page source's descriptor.
Definition RNTupleDS.hxx:61
std::vector< REntryRangeDS > fNextRanges
Basis for the ranges populated by the PrepareNextRanges() call.
Definition RNTupleDS.hxx:86
std::vector< std::unique_ptr< ROOT::Experimental::RFieldBase > > fProtoFields
We prepare a prototype field for every column.
Definition RNTupleDS.hxx:72
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void PrepareNextRanges()
Populates fNextRanges with the next set of entry ranges.
std::vector< std::string > fFileNames
Definition RNTupleDS.hxx:65
std::string fNTupleName
The data source may be constructed with an ntuple name and a list of files.
Definition RNTupleDS.hxx:64
std::vector< std::string > fColumnTypes
Definition RNTupleDS.hxx:78
RNTupleDS(std::unique_ptr< ROOT::Experimental::Internal::RPageSource > pageSource)
ULong64_t fSeenEntries
The number of entries so far returned by GetEntryRanges()
Definition RNTupleDS.hxx:84
The on-storage meta-data of an ntuple.
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
DescriptorId_t FindFieldId(std::string_view fieldName, DescriptorId_t parentId) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:61
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, ...
void CallConnectPageSourceOnField(RFieldBase &, RPageSource &)
Definition RField.cxx:364
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName)
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Wrap the integer in a struct in order to avoid template specialization clash with std::uint64_t.
The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
Definition RNTupleDS.hxx:49
std::unique_ptr< ROOT::Experimental::Internal::RPageSource > fSource
Definition RNTupleDS.hxx:50
ULong64_t fLastEntry
End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry.
Definition RNTupleDS.hxx:53
ULong64_t fFirstEntry
First entry index in fSource.
Definition RNTupleDS.hxx:51