Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterface.cxx
Go to the documentation of this file.
1// Author: Vincenzo Eduardo Padulano, Axel Naumann, Enrico Guiraud CERN 02/2023
2
3/*************************************************************************
4 * Copyright (C) 1995-2023, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
13#include "ROOT/RNTupleDS.hxx"
14#include "ROOT/RTTreeDS.hxx"
15#ifdef R__USE_IMT
17#endif
18
20 std::pair<ULong64_t, ULong64_t> &&newRange)
21{
22 R__ASSERT(newRange.second >= newRange.first && "end is less than begin in the passed entry range!");
23 node.GetLoopManager()->SetEmptyEntryRange(std::move(newRange));
24}
25
27{
28 R__ASSERT(end >= begin && "end is less than begin in the passed entry range!");
29 node.GetLoopManager()->ChangeBeginAndEndEntries(begin, end);
30}
31
32/**
33 * \brief Changes the input dataset specification of an RDataFrame.
34 *
35 * \param node Any node of the computation graph.
36 * \param spec The new specification.
37 */
42
43/**
44 * \brief Retrieve the cluster boundaries for each cluster in the dataset,
45 * across files, with a global offset.
46 *
47 * \param node Any node of the computation graph.
48 * \return A vector of [begin, end) entry pairs for each cluster in the dataset.
49 *
50 * \note When IMT is enabled, files are processed in parallel using a thread pool.
51 */
52std::vector<std::pair<std::uint64_t, std::uint64_t>>
54{
55 std::vector<std::pair<std::uint64_t, std::uint64_t>> boundaries{};
56
57 auto *lm = node.GetLoopManager();
58 auto *ds = lm->GetDataSource();
59
60 if (!ds) {
61 throw std::runtime_error("Cannot retrieve cluster boundaries: no data source available.");
62 }
63
64 std::string datasetName;
65 std::vector<std::string> fileNames;
66 bool isTTree = false;
67
68 if (auto *ttreeds = dynamic_cast<RTTreeDS *>(ds)) {
69 auto *tree = ttreeds->GetTree();
70 assert(tree && "The internal TTree is not available, something went wrong.");
71 datasetName = tree->GetName();
73 isTTree = true;
74 } else if (auto *rntupleds = dynamic_cast<RNTupleDS *>(ds)) {
75 datasetName = rntupleds->fNTupleName;
76 fileNames = rntupleds->fFileNames;
77 isTTree = false;
78 } else {
79 throw std::runtime_error("Cannot retrieve cluster boundaries: unsupported data source type.");
80 }
81
82 if (fileNames.empty()) {
83 return boundaries;
84 }
85
86 const auto nFiles = fileNames.size();
87
88 // For each file retrieve the cluster boundaries + the number of entries
89 using FileResult = std::pair<std::vector<std::pair<std::uint64_t, std::uint64_t>>, std::uint64_t>;
90 std::vector<FileResult> perFileResults(nFiles);
91
92 // Function to process a single file and return its cluster boundaries + entry count
93 auto processFile = [&datasetName, isTTree](const std::string &fileName) -> FileResult {
94 std::vector<std::pair<std::uint64_t, std::uint64_t>> clusters;
95 std::uint64_t nEntries = 0;
96
97 if (isTTree) {
98 // TTree
100 nEntries = entries;
101 // [0, 10, 20, ...] --> [(0,10), (10,20), ...]
102 for (std::size_t i = 0; i + 1 < clusterBoundaries.size(); ++i) {
103 clusters.emplace_back(clusterBoundaries[i], clusterBoundaries[i + 1]);
104 }
105 } else {
106 // RNTuple
107 auto [clusterBoundaries, entries] = GetClustersAndEntries(datasetName, fileName);
108 nEntries = entries;
109 for (const auto &cluster : clusterBoundaries) {
110 clusters.emplace_back(cluster.fFirstEntry, cluster.fLastEntryPlusOne);
111 }
112 }
113
114 return {clusters, nEntries};
115 };
116
117#ifdef R__USE_IMT
119 // Distribute the processing of files in parallel across the thread pool,
120 // each thread takes a file and its index in the fileNames vector as input
121 // and fills the corresponding position in the perFileResults vector
122 pool.Foreach([&perFileResults, &fileNames,
123 &processFile](std::size_t idx) { perFileResults[idx] = processFile(fileNames[idx]); },
125#else
126 // Process files sequentially as a fallback
127 for (std::size_t idx = 0; idx < nFiles; ++idx) {
128 perFileResults[idx] = processFile(fileNames[idx]);
129 }
130#endif
131 // Now that we have the cluster boundaries and entry counts for each file,
132 // we can compute the global boundaries with offsets (sequentially)
133 std::uint64_t offset = 0;
134 for (const auto &[clusters, nEntries] : perFileResults) {
135 for (const auto &[start, end] : clusters) {
136 boundaries.emplace_back(offset + start, offset + end);
137 }
138 offset += nEntries;
139 }
140
141 return boundaries;
142}
143
144/**
145 * \brief Trigger the execution of an RDataFrame computation graph.
146 * \param[in] node A node of the computation graph (not a result).
147 *
148 * This function calls the RLoopManager::Run method on the \p fLoopManager data
149 * member of the input argument. It is intended for internal use only.
150 */
155
157{
158 if (auto ds = node.GetDataSource()) {
159 return ds->GetLabel();
160 } else {
161 return "EmptyDS";
162 }
163}
164
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
void SetEmptyEntryRange(std::pair< ULong64_t, ULong64_t > &&newRange)
void ChangeSpec(ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the internal TTree held by the RLoopManager.
void ChangeBeginAndEndEntries(Long64_t begin, Long64_t end)
void SetTTreeLifeline(std::any lifeline)
The dataset specification for RDataFrame.
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
< The RLoopManager at the root of this computation graph. Never null.
RDataSource * GetDataSource() const
RDFDetail::RLoopManager * GetLoopManager() const
The public interface to the RDataFrame federation of classes.
The RDataSource implementation for RNTuple.
Definition RNTupleDS.hxx:83
A pseudo container class which is a generator of indices.
Definition TSeq.hxx:67
This class provides a simple interface to execute the same task multiple times in parallel threads,...
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
std::pair< std::vector< ROOT::Internal::RNTupleClusterBoundaries >, ROOT::NTupleSize_t > GetClustersAndEntries(std::string_view ntupleName, std::string_view location)
Retrieves the cluster boundaries and the number of entries for the input RNTuple.
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
std::pair< std::vector< Long64_t >, Long64_t > GetClustersAndEntries(std::string_view treename, std::string_view path)
Returns the cluster boundaries and number of entries of the input tree.
std::vector< std::string > GetFileNamesFromTree(const TTree &tree)