Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterfaceBase.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud CERN 08/2022
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/InternalTreeUtils.hxx> // GetFriendInfo, GetFileNamesFromTree
14#include <ROOT/RDF/Utils.hxx>
16#include <string_view>
17#include <TTree.h>
18
19#include <algorithm> // std::for_each
20#include <iomanip> // std::setw
21#include <memory>
22#include <set>
23#include <sstream>
24#include <string>
25#include <unordered_set>
26
28{
29 // TTree/TChain as input
30 const auto tree = fLoopManager->GetTree();
31
32 if (tree) {
34 return files.size();
35 }
36 return 0;
37}
38
40{
41 // TTree/TChain as input
42 const auto tree = fLoopManager->GetTree();
43 if (tree) {
44 const auto treeName = tree->GetName();
45 const auto isTChain = dynamic_cast<TChain *>(tree) ? true : false;
46 const auto treeType = isTChain ? "TChain" : "TTree";
47 const auto isInMemory = !isTChain && !tree->GetCurrentFile() ? true : false;
48 const auto friendInfo = ROOT::Internal::TreeUtils::GetFriendInfo(*tree);
49 const auto hasFriends = friendInfo.fFriendNames.empty() ? false : true;
50 std::stringstream ss;
51 ss << "Dataframe from " << treeType;
52 if (*treeName != 0) {
53 ss << " " << treeName;
54 }
55 if (isInMemory) {
56 ss << " (in-memory)";
57 } else {
59 const auto numFiles = files.size();
60 if (numFiles == 1) {
61 ss << " in file " << files[0];
62 } else {
63 ss << " in files\n";
64 for (auto i = 0u; i < numFiles; i++) {
65 ss << " " << files[i];
66 if (i < numFiles - 1)
67 ss << '\n';
68 }
69 }
70 }
71 if (hasFriends) {
72 const auto numFriends = friendInfo.fFriendNames.size();
73 if (numFriends == 1) {
74 ss << "\nwith friend\n";
75 } else {
76 ss << "\nwith friends\n";
77 }
78 for (auto i = 0u; i < numFriends; i++) {
79 const auto nameAlias = friendInfo.fFriendNames[i];
80 const auto files = friendInfo.fFriendFileNames[i];
81 const auto numFiles = files.size();
82 const auto subnames = friendInfo.fFriendChainSubNames[i];
83 ss << " " << nameAlias.first;
84 if (nameAlias.first != nameAlias.second)
85 ss << " (" << nameAlias.second << ")";
86 // case: TTree as friend
87 if (numFiles == 1) {
88 ss << " " << files[0];
89 }
90 // case: TChain as friend
91 else {
92 ss << '\n';
93 for (auto j = 0u; j < numFiles; j++) {
94 ss << " " << subnames[j] << " " << files[j];
95 if (j < numFiles - 1)
96 ss << '\n';
97 }
98 }
99 if (i < numFriends - 1)
100 ss << '\n';
101 }
102 }
103 return ss.str();
104 }
105 // Datasource as input
106 else if (fDataSource) {
107 const auto datasourceLabel = fDataSource->GetLabel();
108 return "Dataframe from datasource " + datasourceLabel;
109 }
110 // Trivial/empty datasource
111 else {
112 const auto n = fLoopManager->GetNEmptyEntries();
113 if (n == 1) {
114 return "Empty dataframe filling 1 row";
115 } else {
116 return "Empty dataframe filling " + std::to_string(n) + " rows";
117 }
118 }
119}
120
121ROOT::RDF::RInterfaceBase::RInterfaceBase(std::shared_ptr<RDFDetail::RLoopManager> lm)
122 : fLoopManager(lm.get()), fDataSource(lm->GetDataSource()), fColRegister(std::move(lm))
123{
125}
126
128 : fLoopManager(&lm), fDataSource(lm.GetDataSource()), fColRegister(colRegister)
129{
130}
131
132/////////////////////////////////////////////////////////////////////////////
133/// \brief Returns the names of the available columns.
134/// \return the container of column names.
135///
136/// This is not an action nor a transformation, just a query to the RDataFrame object.
137///
138/// ### Example usage:
139/// ~~~{.cpp}
140/// auto colNames = d.GetColumnNames();
141/// // Print columns' names
142/// for (auto &&colName : colNames) std::cout << colName << std::endl;
143/// ~~~
144///
146{
147 // there could be duplicates between Redefined columns and columns in the data source
148 std::unordered_set<std::string> allColumns;
149
150 auto addIfNotInternal = [&allColumns](std::string_view colName) {
151 if (!RDFInternal::IsInternalColumn(colName))
152 allColumns.emplace(colName);
153 };
154
155 auto definedColumns = fColRegister.GetNames();
156
157 std::for_each(definedColumns.begin(), definedColumns.end(), addIfNotInternal);
158
159 auto tree = fLoopManager->GetTree();
160 if (tree) {
161 for (const auto &bName : RDFInternal::GetBranchNames(*tree, /*allowDuplicates=*/false))
162 allColumns.emplace(bName);
163 }
164
165 if (fDataSource) {
166 for (const auto &s : fDataSource->GetColumnNames()) {
167 if (s.rfind("R_rdf_sizeof", 0) != 0)
168 allColumns.emplace(s);
169 }
170 }
171
172 ColumnNames_t ret(allColumns.begin(), allColumns.end());
173 std::sort(ret.begin(), ret.end());
174 return ret;
175}
176
177/////////////////////////////////////////////////////////////////////////////
178/// \brief Return the type of a given column as a string.
179/// \return the type of the required column.
180///
181/// This is not an action nor a transformation, just a query to the RDataFrame object.
182///
183/// ### Example usage:
184/// ~~~{.cpp}
185/// auto colType = d.GetColumnType("columnName");
186/// // Print column type
187/// std::cout << "Column columnName has type " << colType << std::endl;
188/// ~~~
189///
190std::string ROOT::RDF::RInterfaceBase::GetColumnType(std::string_view column)
191{
192 const auto col = fColRegister.ResolveAlias(std::string(column));
193
194 RDFDetail::RDefineBase *define = fColRegister.GetDefine(col);
195
196 const bool convertVector2RVec = true;
197 return RDFInternal::ColumnName2ColumnTypeName(col, fLoopManager->GetTree(), fLoopManager->GetDataSource(), define,
198 convertVector2RVec);
199}
200
201/////////////////////////////////////////////////////////////////////////////
202/// \brief Return information about the dataframe.
203/// \return information about the dataframe as RDFDescription object
204///
205/// This convenience function describes the dataframe and combines the following information:
206/// - Number of event loops run, see GetNRuns()
207/// - Number of total and defined columns, see GetColumnNames() and GetDefinedColumnNames()
208/// - Column names, see GetColumnNames()
209/// - Column types, see GetColumnType()
210/// - Number of processing slots, see GetNSlots()
211///
212/// This is not an action nor a transformation, just a query to the RDataFrame object.
213/// The result is dependent on the node from which this method is called, e.g. the list of
214/// defined columns returned by GetDefinedColumnNames().
215///
216/// Please note that this is a convenience feature and the layout of the output can be subject
217/// to change and should be parsed via RDFDescription methods.
218///
219/// ### Example usage:
220/// ~~~{.cpp}
221/// RDataFrame df(10);
222/// auto df2 = df.Define("x", "1.f").Define("s", "\"myStr\"");
223/// // Describe the dataframe
224/// df2.Describe().Print();
225/// df2.Describe().Print(/*shortFormat=*/true);
226/// std::cout << df2.Describe().AsString() << std::endl;
227/// std::cout << df2.Describe().AsString(/*shortFormat=*/true) << std::endl;
228/// ~~~
229///
231{
232 // Build set of defined column names to find later in all column names
233 // the defined columns more efficiently
234 const auto columnNames = GetColumnNames();
235 std::set<std::string> definedColumnNamesSet;
236 for (const auto &name : GetDefinedColumnNames())
237 definedColumnNamesSet.insert(name);
238
239 // Get information for the metadata table
240 const std::vector<std::string> metadataProperties = {"Columns in total", "Columns from defines", "Event loops run",
241 "Processing slots"};
242 const std::vector<std::string> metadataValues = {std::to_string(columnNames.size()),
243 std::to_string(definedColumnNamesSet.size()),
244 std::to_string(GetNRuns()), std::to_string(GetNSlots())};
245
246 // Set header for metadata table
247 const auto columnWidthProperties = RDFInternal::GetColumnWidth(metadataProperties);
248 // The column width of the values is required to make right-bound numbers and is equal
249 // to the maximum of the string "Value" and all values to be put in this column.
250 const auto columnWidthValues =
251 std::max(std::max_element(metadataValues.begin(), metadataValues.end())->size(), static_cast<std::size_t>(5u));
252 std::stringstream ss;
253 ss << std::left << std::setw(columnWidthProperties) << "Property" << std::setw(columnWidthValues) << "Value\n"
254 << std::setw(columnWidthProperties) << "--------" << std::setw(columnWidthValues) << "-----\n";
255
256 // Build metadata table
257 // All numbers should be bound to the right and strings bound to the left.
258 for (auto i = 0u; i < metadataProperties.size(); i++) {
259 ss << std::left << std::setw(columnWidthProperties) << metadataProperties[i] << std::right
260 << std::setw(columnWidthValues) << metadataValues[i] << '\n';
261 }
262 ss << '\n'; // put space between this and the next table
263
264 // Set header for columns table
265 const auto columnWidthNames = RDFInternal::GetColumnWidth(columnNames);
266 const auto columnTypes = GetColumnTypeNamesList(columnNames);
267 const auto columnWidthTypes = RDFInternal::GetColumnWidth(columnTypes);
268 ss << std::left << std::setw(columnWidthNames) << "Column" << std::setw(columnWidthTypes) << "Type"
269 << "Origin\n"
270 << std::setw(columnWidthNames) << "------" << std::setw(columnWidthTypes) << "----"
271 << "------\n";
272
273 // Build columns table
274 const auto nCols = columnNames.size();
275 for (auto i = 0u; i < nCols; i++) {
276 auto origin = "Dataset";
277 if (definedColumnNamesSet.find(columnNames[i]) != definedColumnNamesSet.end())
278 origin = "Define";
279 ss << std::left << std::setw(columnWidthNames) << columnNames[i] << std::setw(columnWidthTypes) << columnTypes[i]
280 << origin << '\n';
281 }
282 // Use the string returned from DescribeDataset() as the 'brief' description
283 // Use the converted to string stringstream ss as the 'full' description
284 return RDFDescription(DescribeDataset(), ss.str());
285}
286
287/// \brief Returns the names of the defined columns.
288/// \return the container of the defined column names.
289///
290/// This is not an action nor a transformation, just a simple utility to
291/// get the columns names that have been defined up to the node.
292/// If no column has been defined, e.g. on a root node, it returns an
293/// empty collection.
294///
295/// ### Example usage:
296/// ~~~{.cpp}
297/// auto defColNames = d.GetDefinedColumnNames();
298/// // Print defined columns' names
299/// for (auto &&defColName : defColNames) std::cout << defColName << std::endl;
300/// ~~~
301///
303{
304 ColumnNames_t definedColumns;
305
306 const auto columns = fColRegister.BuildDefineNames();
307 for (const auto &column : columns) {
309 definedColumns.emplace_back(column);
310 }
311
312 return definedColumns;
313}
314
315/// \brief Return a descriptor for the systematic variations registered in this branch of the computation graph.
316///
317/// This is not an action nor a transformation, just a simple utility to
318/// inspect the systematic variations that have been registered with Vary() up to this node.
319/// When called on the root node, it returns an empty descriptor.
320///
321/// ### Example usage:
322/// ~~~{.cpp}
323/// auto variations = d.GetVariations();
324/// variations.Print();
325/// ~~~
326///
328{
329 return fColRegister.BuildVariationsDescription();
330}
331
332/// \brief Checks if a column is present in the dataset.
333/// \return true if the column is available, false otherwise
334///
335/// This method checks if a column is part of the input ROOT dataset, has
336/// been defined or can be provided by the data source.
337///
338/// Example usage:
339/// ~~~{.cpp}
340/// ROOT::RDataFrame base(1);
341/// auto rdf = base.Define("definedColumn", [](){return 0;});
342/// rdf.HasColumn("definedColumn"); // true: we defined it
343/// rdf.HasColumn("rdfentry_"); // true: it's always there
344/// rdf.HasColumn("foo"); // false: it is not there
345/// ~~~
346bool ROOT::RDF::RInterfaceBase::HasColumn(std::string_view columnName)
347{
348 if (fColRegister.IsDefineOrAlias(columnName))
349 return true;
350
351 if (fLoopManager->GetTree()) {
352 const auto &branchNames = fLoopManager->GetBranchNames();
353 const auto branchNamesEnd = branchNames.end();
354 if (branchNamesEnd != std::find(branchNames.begin(), branchNamesEnd, columnName))
355 return true;
356 }
357
358 if (fDataSource && fDataSource->HasColumn(columnName))
359 return true;
360
361 return false;
362}
363
364/// \brief Gets the number of data processing slots.
365/// \return The number of data processing slots used by this RDataFrame instance
366///
367/// This method returns the number of data processing slots used by this RDataFrame
368/// instance. This number is influenced by the global switch ROOT::EnableImplicitMT().
369///
370/// Example usage:
371/// ~~~{.cpp}
372/// ROOT::EnableImplicitMT(6);
373/// ROOT::RDataFrame df(1);
374/// std::cout << df.GetNSlots() << std::endl; // prints "6"
375/// ~~~
377{
378 return fLoopManager->GetNSlots();
379}
380
381/// \brief Gets the number of event loops run.
382/// \return The number of event loops run by this RDataFrame instance
383///
384/// This method returns the number of event loops run so far by this RDataFrame instance.
385///
386/// Example usage:
387/// ~~~{.cpp}
388/// ROOT::RDataFrame df(1);
389/// std::cout << df.GetNRuns() << std::endl; // prints "0"
390/// df.Sum("rdfentry_").GetValue(); // trigger the event loop
391/// std::cout << df.GetNRuns() << std::endl; // prints "1"
392/// df.Sum("rdfentry_").GetValue(); // trigger another event loop
393/// std::cout << df.GetNRuns() << std::endl; // prints "2"
394/// ~~~
396{
397 return fLoopManager->GetNRuns();
398}
399
401{
402 std::vector<std::string> types;
403
404 for (auto column : columnList) {
405 types.push_back(GetColumnType(column));
406 }
407 return types;
408}
409
410void ROOT::RDF::RInterfaceBase::CheckIMTDisabled(std::string_view callerName)
411{
413 std::string error(callerName);
414 error += " was called with ImplicitMT enabled, but multi-thread is not supported.";
415 throw std::runtime_error(error);
416 }
417}
418
420{
421 // Entry number column
422 const std::string entryColName = "rdfentry_";
423 const std::string entryColType = "ULong64_t";
424 auto entryColGen = [](unsigned int, ULong64_t entry) { return entry; };
425 using NewColEntry_t = RDFDetail::RDefine<decltype(entryColGen), RDFDetail::ExtraArgsForDefine::SlotAndEntry>;
426
427 auto entryColumn = std::make_shared<NewColEntry_t>(entryColName, entryColType, std::move(entryColGen),
428 ColumnNames_t{}, fColRegister, *fLoopManager);
429 fColRegister.AddDefine(std::move(entryColumn));
430
431 // Slot number column
432 const std::string slotColName = "rdfslot_";
433 const std::string slotColType = "unsigned int";
434 auto slotColGen = [](unsigned int slot) { return slot; };
435 using NewColSlot_t = RDFDetail::RDefine<decltype(slotColGen), RDFDetail::ExtraArgsForDefine::Slot>;
436
437 auto slotColumn = std::make_shared<NewColSlot_t>(slotColName, slotColType, std::move(slotColGen), ColumnNames_t{},
438 fColRegister, *fLoopManager);
439 fColRegister.AddDefine(std::move(slotColumn));
440
441 fColRegister.AddAlias("tdfentry_", entryColName);
442 fColRegister.AddAlias("tdfslot_", slotColName);
443}
unsigned long long ULong64_t
Definition RtypesCore.h:81
char name[80]
Definition TGX11.cxx:110
The head node of a RDF computation graph.
A binder for user-defined columns, variations and aliases.
A DFDescription contains useful information about a given RDataFrame computation graph.
RVariationsDescription GetVariations() const
Return a descriptor for the systematic variations registered in this branch of the computation graph.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
RDFDescription Describe()
Return information about the dataframe.
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
RDFDetail::RLoopManager * fLoopManager
< The RLoopManager at the root of this computation graph. Never null.
unsigned int GetNRuns() const
Gets the number of event loops run.
ColumnNames_t GetDefinedColumnNames()
Returns the names of the defined columns.
void CheckIMTDisabled(std::string_view callerName)
unsigned int GetNSlots() const
Gets the number of data processing slots.
RInterfaceBase(std::shared_ptr< RDFDetail::RLoopManager > lm)
bool HasColumn(std::string_view columnName)
Checks if a column is present in the dataset.
std::string DescribeDataset() const
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
A descriptor for the systematic variations known to a given RDataFrame node.
A chain is a collection of files containing TTree objects.
Definition TChain.h:33
const Int_t n
Definition legend1.C:16
std::vector< std::string > GetBranchNames(TTree &t, bool allowDuplicates=true)
Get all the branches names, including the ones of the friend trees.
unsigned int GetNSlots()
Definition RDFUtils.cxx:283
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2rvec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:222
unsigned int GetColumnWidth(const std::vector< std::string > &names, const unsigned int minColumnSpace=8u)
Get optimal column width for printing a table given the names and the desired minimal space between c...
Definition RDFUtils.cxx:372
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition RDFUtils.cxx:363
ROOT::TreeUtils::RFriendInfo GetFriendInfo(const TTree &tree, bool retrieveEntries=false)
std::vector< std::string > GetFileNamesFromTree(const TTree &tree)
std::vector< std::string > ColumnNames_t
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:568