20 std::pair<ULong64_t, ULong64_t> &&newRange)
22 R__ASSERT(newRange.second >= newRange.first &&
"end is less than begin in the passed entry range!");
28 R__ASSERT(end >= begin &&
"end is less than begin in the passed entry range!");
52std::vector<std::pair<std::uint64_t, std::uint64_t>>
55 std::vector<std::pair<std::uint64_t, std::uint64_t>> boundaries{};
58 auto *ds = lm->GetDataSource();
61 throw std::runtime_error(
"Cannot retrieve cluster boundaries: no data source available.");
64 std::string datasetName;
65 std::vector<std::string> fileNames;
68 if (
auto *ttreeds =
dynamic_cast<RTTreeDS *
>(ds)) {
69 auto *tree = ttreeds->GetTree();
70 assert(tree &&
"The internal TTree is not available, something went wrong.");
71 datasetName = tree->GetName();
74 }
else if (
auto *rntupleds =
dynamic_cast<RNTupleDS *
>(ds)) {
75 datasetName = rntupleds->fNTupleName;
76 fileNames = rntupleds->fFileNames;
79 throw std::runtime_error(
"Cannot retrieve cluster boundaries: unsupported data source type.");
82 if (fileNames.empty()) {
86 const auto nFiles = fileNames.size();
89 using FileResult = std::pair<std::vector<std::pair<std::uint64_t, std::uint64_t>>, std::uint64_t>;
90 std::vector<FileResult> perFileResults(nFiles);
93 auto processFile = [&datasetName, isTTree](
const std::string &fileName) -> FileResult {
94 std::vector<std::pair<std::uint64_t, std::uint64_t>> clusters;
95 std::uint64_t nEntries = 0;
102 for (std::size_t i = 0; i + 1 < clusterBoundaries.size(); ++i) {
103 clusters.emplace_back(clusterBoundaries[i], clusterBoundaries[i + 1]);
109 for (
const auto &cluster : clusterBoundaries) {
110 clusters.emplace_back(cluster.fFirstEntry, cluster.fLastEntryPlusOne);
114 return {clusters, nEntries};
122 pool.
Foreach([&perFileResults, &fileNames,
123 &processFile](std::size_t idx) { perFileResults[idx] = processFile(fileNames[idx]); },
127 for (std::size_t idx = 0; idx < nFiles; ++idx) {
128 perFileResults[idx] = processFile(fileNames[idx]);
133 std::uint64_t offset = 0;
134 for (
const auto &[clusters, nEntries] : perFileResults) {
135 for (
const auto &[
start, end] : clusters) {
136 boundaries.emplace_back(offset +
start, offset + end);
159 return ds->GetLabel();
long long Long64_t
Portable signed long integer (8 bytes).
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
The dataset specification for RDataFrame.
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
The RLoopManager at the root of this computation graph. Never null.
RDataSource * GetDataSource() const
RDFDetail::RLoopManager * GetLoopManager() const
The RDataSource implementation for RNTuple.
A pseudo container class which is a generator of indices.
This class provides a simple interface to execute the same task multiple times in parallel threads,...
void Foreach(F func, unsigned nTimes, unsigned nChunks=0)
Execute a function without arguments several times in parallel, dividing the execution into nChunks chunks.
std::vector< std::string > GetFileNamesFromTree(const TTree &tree)
Get and store the file names associated with the input tree.
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
std::pair< std::vector< ROOT::Internal::RNTupleClusterBoundaries >, ROOT::NTupleSize_t > GetClustersAndEntries(std::string_view ntupleName, std::string_view location)
Retrieves the cluster boundaries and the number of entries for the input RNTuple.
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
std::pair< std::vector< Long64_t >, Long64_t > GetClustersAndEntries(std::string_view treename, std::string_view path)
Returns the cluster boundaries and number of entries of the input tree.
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode