25#include <condition_variable>
32#include <unordered_map>
37namespace Experimental {
39class RNTupleDescriptor;
42class RNTupleColumnReader;
53 std::unique_ptr<ROOT::Experimental::Internal::RPageSource>
fSource;
81 std::vector<std::unique_ptr<ROOT::Experimental::Internal::RPageSource>>
fStagingArea;
88 std::vector<std::unique_ptr<ROOT::Experimental::RFieldBase>>
fProtoFields;
139 std::vector<RFieldInfo> fieldInfos);
152 explicit RNTupleDS(std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource);
155 RNTupleDS(std::string_view ntupleName, std::string_view fileName);
157 RNTupleDS(std::string_view ntupleName,
const std::vector<std::string> &fileNames);
165 void SetNSlots(
unsigned int nSlots) final;
168 bool HasColumn(std::string_view colName)
const final;
169 std::string
GetTypeName(std::string_view colName)
const final;
170 std::vector<std::pair<ULong64_t, ULong64_t>>
GetEntryRanges() final;
171 std::
string GetLabel() final {
return "RNTupleDS"; }
179 GetColumnReaders(
unsigned int , std::string_view , const std::type_info &) final;
191namespace Experimental {
192RDataFrame
FromRNTuple(std::string_view ntupleName, std::string_view fileName);
193RDataFrame
FromRNTuple(std::string_view ntupleName,
const std::vector<std::string> &fileNames);
unsigned long long ULong64_t
Every RDF column is represented by exactly one RNTuple field.
The RDataSource implementation for RNTuple.
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
RNTupleDS(const RNTupleDS &)=delete
void ExecStaging()
The main function of the fThreadStaging background thread.
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
std::thread fThreadStaging
The background thread that runs StageNextSources()
void Initialize() final
Convenience method called before starting an event-loop.
void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, std::vector< RFieldInfo > fieldInfos)
Provides the RDF column "colName" given the field identified by fieldID.
RNTupleDS & operator=(const RNTupleDS &)=delete
std::size_t fNextFileIndex
Index into fFileNames to the next file to process.
std::vector< std::string > fColumnNames
std::unordered_map< ULong64_t, std::size_t > fFirstEntry2RangeIdx
Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index...
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
RNTupleDS(RNTupleDS &&)=delete
std::unordered_map< ROOT::Experimental::DescriptorId_t, std::string > fFieldId2QualifiedName
Connects the IDs of active proto fields and their subfields to their fully qualified name (a....
void FinalizeSlot(unsigned int slot) final
Convenience method called at the end of the data processing associated to a slot.
std::vector< std::vector< Internal::RNTupleColumnReader * > > fActiveColumnReaders
List of column readers returned by GetColumnReaders() organized by slot.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::vector< REntryRangeDS > fCurrentRanges
Basis for the ranges returned by the last GetEntryRanges() call.
bool fStagingThreadShouldTerminate
Is true when the I/O thread should quit.
std::unique_ptr< RNTupleDescriptor > fPrincipalDescriptor
A clone of the first pages source's descriptor.
bool fHasNextSources
Is true when the staging thread has populated the next batch of files to fStagingArea.
std::vector< REntryRangeDS > fNextRanges
Basis for the ranges populated by the PrepareNextRanges() call.
std::vector< std::unique_ptr< ROOT::Experimental::RFieldBase > > fProtoFields
We prepare a prototype field for every column.
std::mutex fMutexStaging
Protects the shared state between the main thread and the I/O thread.
void StageNextSources()
Starting from fNextFileIndex, opens the next fNSlots files.
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
std::vector< std::unique_ptr< ROOT::Experimental::Internal::RPageSource > > fStagingArea
The staging area is relevant for chains of files, i.e.
void PrepareNextRanges()
Populates fNextRanges with the next set of entry ranges.
std::vector< std::string > fFileNames
std::string fNTupleName
The data source may be constructed with an ntuple name and a list of files.
std::vector< std::string > fColumnTypes
RNTupleDS & operator=(RNTupleDS &&)=delete
std::string GetLabel() final
Return a string representation of the datasource type.
bool SetEntry(unsigned int, ULong64_t) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
ULong64_t fSeenEntries
The number of entries so far returned by GetEntryRanges()
std::condition_variable fCvStaging
Signal for the state information of fIsReadyForStaging and fHasNextSources.
bool fIsReadyForStaging
Is true when the staging thread should start working.
The on-storage meta-data of an ntuple.
Pure virtual base class for all column reader types.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
Representation of an RNTuple data set in a ROOT file.
std::uint64_t DescriptorId_t
Distriniguishes elements of the same type within a descriptor, e.g. different fields.
RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName)
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
std::unique_ptr< ROOT::Experimental::Internal::RPageSource > fSource
ULong64_t fLastEntry
End entry index in fSource, e.g. the number of entries in the range is fLastEntry - fFirstEntry.
ULong64_t fFirstEntry
First entry index in fSource.
Holds useful information about fields added to the RNTupleDS.
RFieldInfo(DescriptorId_t fieldId, std::size_t nRepetitions)
std::size_t fNRepetitions