24#include <condition_variable>
32#include <unordered_map>
45ROOT::RDataFrame
FromRNTuple(std::string_view ntupleName,
const std::vector<std::string> &fileNames,
46 const std::pair<ULong64_t, ULong64_t> &range);
74std::vector<std::pair<std::uint64_t, std::uint64_t>>
89 std::unique_ptr<ROOT::Internal::RPageSource>
fSource;
118 std::vector<std::unique_ptr<ROOT::Internal::RPageSource>>
fStagingArea;
197 std::vector<RFieldInfo> fieldInfos,
bool convertToRVec =
true);
210 explicit RNTupleDS(std::unique_ptr<ROOT::Internal::RPageSource> pageSource);
215 const std::vector<std::string> &fileNames,
216 const std::pair<ULong64_t, ULong64_t> &range);
222 explicit RNTupleDS(std::string_view ntupleName,
const std::vector<std::string> &fileNames,
223 const std::pair<ULong64_t, ULong64_t> &range);
226 RNTupleDS(std::string_view ntupleName, std::string_view fileName);
227 RNTupleDS(std::string_view ntupleName,
const std::vector<std::string> &fileNames);
235 void SetNSlots(
unsigned int nSlots) final;
239 bool HasColumn(std::string_view colName)
const final;
240 std::string
GetTypeName(std::string_view colName)
const final;
241 std::vector<std::pair<ULong64_t, ULong64_t>>
GetEntryRanges() final;
242 std::
string GetLabel() final {
return "RNTupleDS"; }
265RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName);
266RDataFrame FromRNTuple(std::string_view ntupleName,
const std::vector<std::string> &fileNames);
unsigned long long ULong64_t
Portable unsigned long integer (8 bytes).
Base class for non-leaf nodes of the computational graph.
Pure virtual base class for all column reader types.
Every RDF column is represented by exactly one RNTuple field.
Abstract interface to read data from an ntuple.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
The public interface to the RDataFrame federation of classes.
bool fHasNextSources
Is true when the staging thread has populated the next batch of files to fStagingArea.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
std::vector< REntryRangeDS > fNextRanges
Basis for the ranges populated by the PrepareNextRanges() call.
std::unordered_map< ULong64_t, std::size_t > fFirstEntry2RangeIdx
Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index...
bool fStagingThreadShouldTerminate
Is true when the I/O thread should quit.
RNTupleDS(RNTupleDS &&)=delete
std::vector< std::vector< ROOT::Internal::RDF::RNTupleColumnReader * > > fActiveColumnReaders
List of column readers returned by GetColumnReaders() organized by slot.
std::vector< std::unique_ptr< ROOT::Internal::RPageSource > > fStagingArea
The staging area is relevant for chains of files, i.e.
std::vector< std::pair< ULong64_t, ULong64_t > > fOriginalRanges
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
std::vector< std::size_t > fSlotsToRangeIdxs
One element per slot, corresponding to the current range index for that slot, as filled by InitSlot.
std::vector< std::unique_ptr< ROOT::RFieldBase > > fProtoFields
We prepare a prototype field for every column.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
ROOT::RFieldBase * GetFieldWithTypeChecks(std::string_view fieldName, const std::type_info &tid)
RNTupleDS & operator=(const RNTupleDS &)=delete
std::vector< std::string > fFileNames
bool fIsReadyForStaging
Is true when the staging thread should start working.
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
ROOT::RNTupleDescriptor fPrincipalDescriptor
A clone of the first page source's descriptor.
ULong64_t fSeenEntriesNoGlobalRange
The number of entries seen so far in GetEntryRanges().
std::vector< REntryRangeDS > fCurrentRanges
Basis for the ranges returned by the last GetEntryRanges() call.
std::vector< std::string > fTopLevelFieldNames
RNTupleDS(std::unique_ptr< ROOT::Internal::RPageSource > pageSource)
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::size_t fNextFileIndex
Index into fFileNames to the next file to process.
const std::vector< std::string > & GetTopLevelFieldNames() const final
ROOT::RDF::RSampleInfo CreateSampleInfo(unsigned int, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &) const final
std::unordered_map< ROOT::DescriptorId_t, std::string > fFieldId2QualifiedName
Connects the IDs of active proto fields and their subfields to their fully qualified name (a....
std::string fNTupleName
The data source may be constructed with an ntuple name and a list of files.
void PrepareNextRanges()
Populates fNextRanges with the next set of entry ranges.
void StageNextSources()
Provides the RDF column "colName" given the field identified by fieldID.
std::condition_variable fCvStaging
Signal for the state information of fIsReadyForStaging and fHasNextSources.
RNTupleDS(const RNTupleDS &)=delete
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetLabel() final
Return a string representation of the datasource type.
RNTupleDS & operator=(RNTupleDS &&)=delete
std::thread fThreadStaging
The background thread that runs StageNextSources().
std::mutex fMutexStaging
Protects the shared state between the main thread and the I/O thread.
std::unordered_map< std::size_t, std::vector< std::unique_ptr< ROOT::RFieldBase > > > fAlternativeProtoFields
Columns may be requested with types other than those with which they were initially added as proto fields.
bool SetEntry(unsigned int, ULong64_t) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::string > fColumnTypes
void Initialize() final
Convenience method called before starting an event-loop.
std::vector< std::string > fColumnNames
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
void FinalizeSlot(unsigned int slot) final
Convenience method called at the end of the data processing associated to a slot.
This type represents a sample identifier, to be used in conjunction with RDataFrame features such as ...
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
A field translates read and write calls from/to underlying columns to/from tree values.
The on-storage metadata of an RNTuple.
Representation of an RNTuple data set in a ROOT file.
Special implementation of ROOT::RRangeCast for TCollection, including a check that the cast target ty...
ROOT::RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector< std::string > &fileNames, const std::pair< ULong64_t, ULong64_t > &range)
Internal overload of the function that allows passing a range of entries.
std::pair< std::vector< ROOT::Internal::RNTupleClusterBoundaries >, ROOT::NTupleSize_t > GetClustersAndEntries(std::string_view ntupleName, std::string_view location)
Retrieves the cluster boundaries and the number of entries for the input RNTuple.
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
std::string_view fFileName
Storage location of the current RNTuple.
ULong64_t fLastEntry
End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry.
std::unique_ptr< ROOT::Internal::RPageSource > fSource
ULong64_t fFirstEntry
First entry index in fSource.
RFieldInfo(ROOT::DescriptorId_t fieldId, std::size_t nRepetitions)
std::size_t fNRepetitions
ROOT::DescriptorId_t fFieldId