Logo ROOT  
Reference Guide
RInterfaceBase.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud CERN 08/2022
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/InternalTreeUtils.hxx> // GetFriendInfo, GetFileNamesFromTree
14#include <ROOT/RDF/Utils.hxx>
16#include <ROOT/RStringView.hxx>
17#include <TTree.h>
18
19#include <algorithm> // std::for_each
20#include <iomanip> // std::setw
21#include <memory>
22#include <set>
23#include <sstream>
24#include <string>
25#include <unordered_set>
26
28{
29 // TTree/TChain as input
30 const auto tree = fLoopManager->GetTree();
31 if (tree) {
32 const auto treeName = tree->GetName();
33 const auto isTChain = dynamic_cast<TChain *>(tree) ? true : false;
34 const auto treeType = isTChain ? "TChain" : "TTree";
35 const auto isInMemory = !isTChain && !tree->GetCurrentFile() ? true : false;
36 const auto friendInfo = ROOT::Internal::TreeUtils::GetFriendInfo(*tree);
37 const auto hasFriends = friendInfo.fFriendNames.empty() ? false : true;
38 std::stringstream ss;
39 ss << "Dataframe from " << treeType << " " << treeName;
40 if (isInMemory) {
41 ss << " (in-memory)";
42 } else {
44 const auto numFiles = files.size();
45 if (numFiles == 1) {
46 ss << " in file " << files[0];
47 } else {
48 ss << " in files\n";
49 for (auto i = 0u; i < numFiles; i++) {
50 ss << " " << files[i];
51 if (i < numFiles - 1)
52 ss << '\n';
53 }
54 }
55 }
56 if (hasFriends) {
57 const auto numFriends = friendInfo.fFriendNames.size();
58 if (numFriends == 1) {
59 ss << "\nwith friend\n";
60 } else {
61 ss << "\nwith friends\n";
62 }
63 for (auto i = 0u; i < numFriends; i++) {
64 const auto nameAlias = friendInfo.fFriendNames[i];
65 const auto files = friendInfo.fFriendFileNames[i];
66 const auto numFiles = files.size();
67 const auto subnames = friendInfo.fFriendChainSubNames[i];
68 ss << " " << nameAlias.first;
69 if (nameAlias.first != nameAlias.second)
70 ss << " (" << nameAlias.second << ")";
71 // case: TTree as friend
72 if (numFiles == 1) {
73 ss << " " << files[0];
74 }
75 // case: TChain as friend
76 else {
77 ss << '\n';
78 for (auto j = 0u; j < numFiles; j++) {
79 ss << " " << subnames[j] << " " << files[j];
80 if (j < numFiles - 1)
81 ss << '\n';
82 }
83 }
84 if (i < numFriends - 1)
85 ss << '\n';
86 }
87 }
88 return ss.str();
89 }
90 // Datasource as input
91 else if (fDataSource) {
92 const auto datasourceLabel = fDataSource->GetLabel();
93 return "Dataframe from datasource " + datasourceLabel;
94 }
95 // Trivial/empty datasource
96 else {
97 const auto n = fLoopManager->GetNEmptyEntries();
98 if (n == 1) {
99 return "Empty dataframe filling 1 row";
100 } else {
101 return "Empty dataframe filling " + std::to_string(n) + " rows";
102 }
103 }
104}
105
/// Construct from a shared pointer to the loop manager.
/// Keeps a raw pointer to the loop manager and to its data source, then hands the shared_ptr
/// itself over to fColRegister.
// NOTE(review): `lm` is both dereferenced and moved-from in this initializer list. That is only safe
// if fColRegister is declared after fLoopManager and fDataSource in the class -- TODO confirm.
106ROOT::RDF::RInterfaceBase::RInterfaceBase(std::shared_ptr<RDFDetail::RLoopManager> lm)
107 : fLoopManager(lm.get()), fDataSource(lm->GetDataSource()), fColRegister(std::move(lm))
108{
 // NOTE(review): the line numbering of the reviewed text skips one line inside this body;
 // confirm against upstream that no statement is missing here.
110}
111
113 : fLoopManager(&lm), fDataSource(lm.GetDataSource()), fColRegister(colRegister)
114{
 // Nothing else to do: the loop manager address, its data source and a copy of the given
 // column register are all set by the initializer list above.
115}
116
117/////////////////////////////////////////////////////////////////////////////
118/// \brief Returns the names of the available columns.
119/// \return the container of column names.
120///
121/// This is not an action nor a transformation, just a query to the RDataFrame object.
122///
123/// ### Example usage:
124/// ~~~{.cpp}
125/// auto colNames = d.GetColumnNames();
126/// // Print columns' names
127/// for (auto &&colName : colNames) std::cout << colName << std::endl;
128/// ~~~
129///
131{
132 // there could be duplicates between Redefined columns and columns in the data source
133 std::unordered_set<std::string> allColumns;
134
135 auto addIfNotInternal = [&allColumns](std::string_view colName) {
136 if (!RDFInternal::IsInternalColumn(colName))
137 allColumns.emplace(colName);
138 };
139
140 auto definedColumns = fColRegister.GetNames();
141
142 std::for_each(definedColumns.begin(), definedColumns.end(), addIfNotInternal);
143
144 auto tree = fLoopManager->GetTree();
145 if (tree) {
146 for (const auto &bName : RDFInternal::GetBranchNames(*tree, /*allowDuplicates=*/false))
147 allColumns.emplace(bName);
148 }
149
150 if (fDataSource) {
151 for (const auto &s : fDataSource->GetColumnNames()) {
152 if (s.rfind("R_rdf_sizeof", 0) != 0)
153 allColumns.emplace(s);
154 }
155 }
156
157 ColumnNames_t ret(allColumns.begin(), allColumns.end());
158 std::sort(ret.begin(), ret.end());
159 return ret;
160}
161
162/////////////////////////////////////////////////////////////////////////////
163/// \brief Return the type of a given column as a string.
164/// \return the type of the required column.
165///
166/// This is not an action nor a transformation, just a query to the RDataFrame object.
167///
168/// ### Example usage:
169/// ~~~{.cpp}
170/// auto colType = d.GetColumnType("columnName");
171/// // Print column type
172/// std::cout << "Column columnName has type " << colType << std::endl;
173/// ~~~
174///
176{
177 const auto col = fColRegister.ResolveAlias(std::string(column));
178
179 RDFDetail::RDefineBase *define = fColRegister.GetDefine(col);
180
181 const bool convertVector2RVec = true;
182 return RDFInternal::ColumnName2ColumnTypeName(col, fLoopManager->GetTree(), fLoopManager->GetDataSource(), define,
183 convertVector2RVec);
184}
185
186/////////////////////////////////////////////////////////////////////////////
187/// \brief Return information about the dataframe.
188/// \return information about the dataframe as RDFDescription object
189///
190/// This convenience function describes the dataframe and combines the following information:
191/// - Number of event loops run, see GetNRuns()
192/// - Number of total and defined columns, see GetColumnNames() and GetDefinedColumnNames()
193/// - Column names, see GetColumnNames()
194/// - Column types, see GetColumnType()
195/// - Number of processing slots, see GetNSlots()
196///
197/// This is not an action nor a transformation, just a query to the RDataFrame object.
198/// The result is dependent on the node from which this method is called, e.g. the list of
199/// defined columns returned by GetDefinedColumnNames().
200///
201/// Please note that this is a convenience feature and the layout of the output can be subject
202/// to change and should be parsed via RDFDescription methods.
203///
204/// ### Example usage:
205/// ~~~{.cpp}
206/// RDataFrame df(10);
207/// auto df2 = df.Define("x", "1.f").Define("s", "\"myStr\"");
208/// // Describe the dataframe
209/// df2.Describe().Print();
210/// df2.Describe().Print(/*shortFormat=*/true);
211/// std::cout << df2.Describe().AsString() << std::endl;
212/// std::cout << df2.Describe().AsString(/*shortFormat=*/true) << std::endl;
213/// ~~~
214///
216{
217 // Build set of defined column names to find later in all column names
218 // the defined columns more efficiently
219 const auto columnNames = GetColumnNames();
220 std::set<std::string> definedColumnNamesSet;
221 for (const auto &name : GetDefinedColumnNames())
222 definedColumnNamesSet.insert(name);
223
224 // Get information for the metadata table
225 const std::vector<std::string> metadataProperties = {"Columns in total", "Columns from defines", "Event loops run",
226 "Processing slots"};
227 const std::vector<std::string> metadataValues = {std::to_string(columnNames.size()),
228 std::to_string(definedColumnNamesSet.size()),
229 std::to_string(GetNRuns()), std::to_string(GetNSlots())};
230
231 // Set header for metadata table
232 const auto columnWidthProperties = RDFInternal::GetColumnWidth(metadataProperties);
233 // The column width of the values is required to make right-bound numbers and is equal
234 // to the maximum of the string "Value" and all values to be put in this column.
235 const auto columnWidthValues =
236 std::max(std::max_element(metadataValues.begin(), metadataValues.end())->size(), static_cast<std::size_t>(5u));
237 std::stringstream ss;
238 ss << std::left << std::setw(columnWidthProperties) << "Property" << std::setw(columnWidthValues) << "Value\n"
239 << std::setw(columnWidthProperties) << "--------" << std::setw(columnWidthValues) << "-----\n";
240
241 // Build metadata table
242 // All numbers should be bound to the right and strings bound to the left.
243 for (auto i = 0u; i < metadataProperties.size(); i++) {
244 ss << std::left << std::setw(columnWidthProperties) << metadataProperties[i] << std::right
245 << std::setw(columnWidthValues) << metadataValues[i] << '\n';
246 }
247 ss << '\n'; // put space between this and the next table
248
249 // Set header for columns table
250 const auto columnWidthNames = RDFInternal::GetColumnWidth(columnNames);
251 const auto columnTypes = GetColumnTypeNamesList(columnNames);
252 const auto columnWidthTypes = RDFInternal::GetColumnWidth(columnTypes);
253 ss << std::left << std::setw(columnWidthNames) << "Column" << std::setw(columnWidthTypes) << "Type"
254 << "Origin\n"
255 << std::setw(columnWidthNames) << "------" << std::setw(columnWidthTypes) << "----"
256 << "------\n";
257
258 // Build columns table
259 const auto nCols = columnNames.size();
260 for (auto i = 0u; i < nCols; i++) {
261 auto origin = "Dataset";
262 if (definedColumnNamesSet.find(columnNames[i]) != definedColumnNamesSet.end())
263 origin = "Define";
264 ss << std::left << std::setw(columnWidthNames) << columnNames[i] << std::setw(columnWidthTypes) << columnTypes[i]
265 << origin;
266 if (i < nCols - 1)
267 ss << '\n';
268 }
269 // Use the string returned from DescribeDataset() as the 'brief' description
270 // Use the converted to string stringstream ss as the 'full' description
271 return RDFDescription(DescribeDataset(), ss.str());
272}
273
274/// \brief Returns the names of the defined columns.
275/// \return the container of the defined column names.
276///
277/// This is not an action nor a transformation, just a simple utility to
278/// get the columns names that have been defined up to the node.
279/// If no column has been defined, e.g. on a root node, it returns an
280/// empty collection.
281///
282/// ### Example usage:
283/// ~~~{.cpp}
284/// auto defColNames = d.GetDefinedColumnNames();
285/// // Print defined columns' names
286/// for (auto &&defColName : defColNames) std::cout << defColName << std::endl;
287/// ~~~
288///
290{
 // Copies the names reported by the column register's BuildDefineNames() into a ColumnNames_t.
291 ColumnNames_t definedColumns;
292
293 const auto columns = fColRegister.BuildDefineNames();
294 for (const auto &column : columns) {
 // NOTE(review): the line numbering of the reviewed text skips one line right here; confirm
 // against upstream whether a condition (e.g. a filter on the column name) guards this insertion.
296 definedColumns.emplace_back(column);
297 }
298
299 return definedColumns;
300}
301
302/// \brief Return a descriptor for the systematic variations registered in this branch of the computation graph.
303///
304/// This is not an action nor a transformation, just a simple utility to
305/// inspect the systematic variations that have been registered with Vary() up to this node.
306/// When called on the root node, it returns an empty descriptor.
307///
308/// ### Example usage:
309/// ~~~{.cpp}
310/// auto variations = d.GetVariations();
311/// variations.Print();
312/// ~~~
313///
315{
316 return fColRegister.BuildVariationsDescription();
317}
318
319/// \brief Checks if a column is present in the dataset.
320/// \return true if the column is available, false otherwise
321///
322/// This method checks if a column is part of the input ROOT dataset, has
323/// been defined or can be provided by the data source.
324///
325/// Example usage:
326/// ~~~{.cpp}
327/// ROOT::RDataFrame base(1);
328/// auto rdf = base.Define("definedColumn", [](){return 0;});
329/// rdf.HasColumn("definedColumn"); // true: we defined it
330/// rdf.HasColumn("rdfentry_"); // true: it's always there
331/// rdf.HasColumn("foo"); // false: it is not there
332/// ~~~
334{
335 if (fColRegister.IsDefineOrAlias(columnName))
336 return true;
337
338 if (fLoopManager->GetTree()) {
339 const auto &branchNames = fLoopManager->GetBranchNames();
340 const auto branchNamesEnd = branchNames.end();
341 if (branchNamesEnd != std::find(branchNames.begin(), branchNamesEnd, columnName))
342 return true;
343 }
344
345 if (fDataSource && fDataSource->HasColumn(columnName))
346 return true;
347
348 return false;
349}
350
351/// \brief Gets the number of data processing slots.
352/// \return The number of data processing slots used by this RDataFrame instance
353///
354/// This method returns the number of data processing slots used by this RDataFrame
355/// instance. This number is influenced by the global switch ROOT::EnableImplicitMT().
356///
357/// Example usage:
358/// ~~~{.cpp}
359/// ROOT::EnableImplicitMT(6);
360/// ROOT::RDataFrame df(1);
361/// std::cout << df.GetNSlots() << std::endl; // prints "6"
362/// ~~~
364{
365 return fLoopManager->GetNSlots();
366}
367
368/// \brief Gets the number of event loops run.
369/// \return The number of event loops run by this RDataFrame instance
370///
371/// This method returns the number of event loops run so far by this RDataFrame instance.
372///
373/// Example usage:
374/// ~~~{.cpp}
375/// ROOT::RDataFrame df(1);
376/// std::cout << df.GetNRuns() << std::endl; // prints "0"
377/// df.Sum("rdfentry_").GetValue(); // trigger the event loop
378/// std::cout << df.GetNRuns() << std::endl; // prints "1"
379/// df.Sum("rdfentry_").GetValue(); // trigger another event loop
380/// std::cout << df.GetNRuns() << std::endl; // prints "2"
381/// ~~~
383{
384 return fLoopManager->GetNRuns();
385}
386
388{
389 std::vector<std::string> types;
390
391 for (auto column : columnList) {
392 types.push_back(GetColumnType(column));
393 }
394 return types;
395}
396
398{
400 std::string error(callerName);
401 error += " was called with ImplicitMT enabled, but multi-thread is not supported.";
402 throw std::runtime_error(error);
403 }
404}
405
407{
408 // Entry number column
409 const std::string entryColName = "rdfentry_";
410 const std::string entryColType = "ULong64_t";
411 auto entryColGen = [](unsigned int, ULong64_t entry) { return entry; };
412 using NewColEntry_t = RDFDetail::RDefine<decltype(entryColGen), RDFDetail::ExtraArgsForDefine::SlotAndEntry>;
413
414 auto entryColumn = std::make_shared<NewColEntry_t>(entryColName, entryColType, std::move(entryColGen),
415 ColumnNames_t{}, fColRegister, *fLoopManager);
416 fColRegister.AddDefine(std::move(entryColumn));
417
418 // Slot number column
419 const std::string slotColName = "rdfslot_";
420 const std::string slotColType = "unsigned int";
421 auto slotColGen = [](unsigned int slot) { return slot; };
422 using NewColSlot_t = RDFDetail::RDefine<decltype(slotColGen), RDFDetail::ExtraArgsForDefine::Slot>;
423
424 auto slotColumn = std::make_shared<NewColSlot_t>(slotColName, slotColType, std::move(slotColGen), ColumnNames_t{},
425 fColRegister, *fLoopManager);
426 fColRegister.AddDefine(std::move(slotColumn));
427
428 fColRegister.AddAlias("tdfentry_", entryColName);
429 fColRegister.AddAlias("tdfslot_", slotColName);
430}
unsigned long long ULong64_t
Definition: RtypesCore.h:81
char name[80]
Definition: TGX11.cxx:110
The head node of a RDF computation graph.
ULong64_t GetNEmptyEntries() const
A binder for user-defined columns, variations and aliases.
A DFDescription contains useful information about a given RDataFrame computation graph.
virtual std::string GetLabel()
Return a string representation of the datasource type.
RVariationsDescription GetVariations() const
Return a descriptor for the systematic variations registered in this branch of the computation graph.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
RDFDescription Describe()
Return information about the dataframe.
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
RDFDetail::RLoopManager * fLoopManager
< The RLoopManager at the root of this computation graph. Never null.
unsigned int GetNRuns() const
Gets the number of event loops run.
RDataSource * fDataSource
Non-owning pointer to a data-source object. Null if no data-source. RLoopManager has ownership of the...
ColumnNames_t GetDefinedColumnNames()
Returns the names of the defined columns.
void CheckIMTDisabled(std::string_view callerName)
unsigned int GetNSlots() const
Gets the number of data processing slots.
RInterfaceBase(std::shared_ptr< RDFDetail::RLoopManager > lm)
bool HasColumn(std::string_view columnName)
Checks if a column is present in the dataset.
std::string DescribeDataset() const
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
A descriptor for the systematic variations known to a given RDataFrame node.
A chain is a collection of files containing TTree objects.
Definition: TChain.h:33
std::vector< std::string > GetFileNamesFromTree(const TTree &tree)
Get and store the file names associated with the input tree.
ROOT::TreeUtils::RFriendInfo GetFriendInfo(const TTree &tree)
Get and store the names, aliases and file names of the direct friends of the tree.
const Int_t n
Definition: legend1.C:16
basic_string_view< char > string_view
std::vector< std::string > GetBranchNames(TTree &t, bool allowDuplicates=true)
Get all the branches names, including the ones of the friend trees.
unsigned int GetNSlots()
Definition: RDFUtils.cxx:283
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2rvec=true)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:222
unsigned int GetColumnWidth(const std::vector< std::string > &names, const unsigned int minColumnSpace=8u)
Get optimal column width for printing a table given the names and the desired minimal space between c...
Definition: RDFUtils.cxx:372
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition: RDFUtils.cxx:363
std::vector< std::string > ColumnNames_t
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition: TROOT.cxx:558
static constexpr double s
Definition: tree.py:1