25 #include <condition_variable>
33 #include <unordered_map>
47 const std::pair<ULong64_t, ULong64_t> &
range);
67 class RNTupleColumnReader;
81 std::unique_ptr<ROOT::Internal::RPageSource>
fSource;
110 std::vector<std::unique_ptr<ROOT::Internal::RPageSource>>
fStagingArea;
206 const std::vector<std::string> &
fileNames,
207 const std::pair<ULong64_t, ULong64_t> &
range);
210 const std::pair<ULong64_t, ULong64_t> &
range);
251 RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName);
252 RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector<std::string> &fileNames);
unsigned long long ULong64_t
Portable unsigned long integer (8 bytes).
Pure virtual base class for all column reader types.
Every RDF column is represented by exactly one RNTuple field.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
friend ROOT::RDF::RSampleInfo ROOT::Internal::RDF::CreateSampleInfo(const ROOT::RDF::RDataSource &, unsigned int, const std::unordered_map< std::string, ROOT::RDF::Experimental::RSample * > &)
std::vector< void * > Record_t
The RDataSource implementation for RNTuple.
bool fHasNextSources
Is true when the staging thread has populated the next batch of files to fStagingArea.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
void AddField(const ROOT::RNTupleDescriptor &desc, std::string_view colName, ROOT::DescriptorId_t fieldId, std::vector< RFieldInfo > fieldInfos, bool convertToRVec=true)
Provides the RDF column "colName" given the field identified by fieldID.
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
std::vector< REntryRangeDS > fNextRanges
Basis for the ranges populated by the PrepareNextRanges() call.
std::unordered_map< ULong64_t, std::size_t > fFirstEntry2RangeIdx
Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index...
void ExecStaging()
The main function of the fThreadStaging background thread.
bool fStagingThreadShouldTerminate
Is true when the I/O thread should quit.
RNTupleDS(RNTupleDS &&)=delete
std::vector< std::vector< ROOT::Internal::RDF::RNTupleColumnReader * > > fActiveColumnReaders
List of column readers returned by GetColumnReaders() organized by slot.
std::vector< std::unique_ptr< ROOT::Internal::RPageSource > > fStagingArea
The staging area is relevant for chains of files, i.e.
std::vector< std::pair< ULong64_t, ULong64_t > > fOriginalRanges
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
std::vector< std::size_t > fSlotsToRangeIdxs
One element per slot, corresponding to the current range index for that slot, as filled by InitSlot.
std::vector< std::unique_ptr< ROOT::RFieldBase > > fProtoFields
We prepare a prototype field for every column.
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
ROOT::RFieldBase * GetFieldWithTypeChecks(std::string_view fieldName, const std::type_info &tid)
RNTupleDS & operator=(const RNTupleDS &)=delete
std::vector< std::string > fFileNames
bool fIsReadyForStaging
Is true when the staging thread should start working.
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
ROOT::RNTupleDescriptor fPrincipalDescriptor
A clone of the first pages source's descriptor.
ULong64_t fSeenEntriesNoGlobalRange
The number of entries seen so far in GetEntryRanges().
std::vector< REntryRangeDS > fCurrentRanges
Basis for the ranges returned by the last GetEntryRanges() call.
RNTupleDS(std::unique_ptr< ROOT::Internal::RPageSource > pageSource)
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
std::size_t fNextFileIndex
Index into fFileNames to the next file to process.
std::unordered_map< ROOT::DescriptorId_t, std::string > fFieldId2QualifiedName
Connects the IDs of active proto fields and their subfields to their fully qualified name (a.b.c.d).
std::string fNTupleName
The data source may be constructed with an ntuple name and a list of files.
void PrepareNextRanges()
Populates fNextRanges with the next set of entry ranges.
void StageNextSources()
Starting from fNextFileIndex, opens the next fNSlots files.
std::condition_variable fCvStaging
Signal for the state information of fIsReadyForStaging and fHasNextSources.
RNTupleDS(const RNTupleDS &)=delete
void Finalize() final
Convenience method called after concluding an event-loop.
std::string GetLabel() final
Return a string representation of the datasource type.
RNTupleDS & operator=(RNTupleDS &&)=delete
std::thread fThreadStaging
The background thread that runs StageNextSources()
std::mutex fMutexStaging
Protects the shared state between the main thread and the I/O thread.
std::unordered_map< std::size_t, std::vector< std::unique_ptr< ROOT::RFieldBase > > > fAlternativeProtoFields
Columns may be requested with types other than those with which they were initially added as proto fields.
bool SetEntry(unsigned int, ULong64_t) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::vector< std::string > fColumnTypes
void Initialize() final
Convenience method called before starting an event-loop.
std::vector< std::string > fColumnNames
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
Type-erased vector of pointers to pointers to column values, one per slot.
void FinalizeSlot(unsigned int slot) final
Convenience method called at the end of the data processing associated to a slot.
This type represents a sample identifier, to be used in conjunction with RDataFrame features such as ...
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, ...
A field translates read and write calls from/to underlying columns to/from tree values.
The on-storage metadata of an RNTuple.
ROOT::RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector< std::string > &fileNames, const std::pair< ULong64_t, ULong64_t > &range)
Internal overload of the function that allows passing a range of entries.
std::pair< std::vector< ROOT::Internal::RNTupleClusterBoundaries >, ROOT::NTupleSize_t > GetClustersAndEntries(std::string_view ntupleName, std::string_view location)
Retrieves the cluster boundaries and the number of entries for the input RNTuple.
Namespace for new ROOT classes and functions.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
std::string_view fFileName
Storage location of the current RNTuple.
ULong64_t fLastEntry
End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry.
std::unique_ptr< ROOT::Internal::RPageSource > fSource
ULong64_t fFirstEntry
First entry index in fSource.
Holds useful information about fields added to the RNTupleDS.
RFieldInfo(ROOT::DescriptorId_t fieldId, std::size_t nRepetitions)
std::size_t fNRepetitions
ROOT::DescriptorId_t fFieldId