Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterfaceBase.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud CERN 08/2022
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include <ROOT/InternalTreeUtils.hxx> // GetFriendInfo, GetFileNamesFromTree
14#include <ROOT/RDF/Utils.hxx>
16#include <string_view>
17#include <TTree.h>
18
19#include <algorithm> // std::for_each
20#include <iomanip> // std::setw
21#include <memory>
22#include <set>
23#include <sstream>
24#include <string>
25#include <unordered_set>
26
28{
29 // TTree/TChain as input
30 if (const auto *tree = fLoopManager->GetTree()) {
31 if (!dynamic_cast<const TChain *>(tree) && !tree->GetCurrentFile()) {
32 // in-memory TTree
33 return 0;
34 }
36 }
37 // Datasource as input
38 if (fDataSource) {
39 return fDataSource->GetNFiles();
40 }
41 return 0;
42}
43
45{
46 // TTree/TChain as input
47 const auto tree = fLoopManager->GetTree();
48 if (tree) {
49 const auto treeName = tree->GetName();
50 const auto isTChain = dynamic_cast<TChain *>(tree) ? true : false;
51 const auto treeType = isTChain ? "TChain" : "TTree";
52 const auto isInMemory = !isTChain && !tree->GetCurrentFile() ? true : false;
53 const auto friendInfo = ROOT::Internal::TreeUtils::GetFriendInfo(*tree);
54 const auto hasFriends = friendInfo.fFriendNames.empty() ? false : true;
55 std::stringstream ss;
56 ss << "Dataframe from " << treeType;
57 if (*treeName != 0) {
58 ss << " " << treeName;
59 }
60 if (isInMemory) {
61 ss << " (in-memory)";
62 } else {
64 const auto numFiles = files.size();
65 if (numFiles == 1) {
66 ss << " in file " << files[0];
67 } else {
68 ss << " in files\n";
69 for (auto i = 0u; i < numFiles; i++) {
70 ss << " " << files[i];
71 if (i < numFiles - 1)
72 ss << '\n';
73 }
74 }
75 }
76 if (hasFriends) {
77 const auto numFriends = friendInfo.fFriendNames.size();
78 if (numFriends == 1) {
79 ss << "\nwith friend\n";
80 } else {
81 ss << "\nwith friends\n";
82 }
83 for (auto i = 0u; i < numFriends; i++) {
84 const auto nameAlias = friendInfo.fFriendNames[i];
85 const auto files = friendInfo.fFriendFileNames[i];
86 const auto numFiles = files.size();
87 const auto subnames = friendInfo.fFriendChainSubNames[i];
88 ss << " " << nameAlias.first;
89 if (nameAlias.first != nameAlias.second)
90 ss << " (" << nameAlias.second << ")";
91 // case: TTree as friend
92 if (numFiles == 1) {
93 ss << " " << files[0];
94 }
95 // case: TChain as friend
96 else {
97 ss << '\n';
98 for (auto j = 0u; j < numFiles; j++) {
99 ss << " " << subnames[j] << " " << files[j];
100 if (j < numFiles - 1)
101 ss << '\n';
102 }
103 }
104 if (i < numFriends - 1)
105 ss << '\n';
106 }
107 }
108 return ss.str();
109 }
110 // Datasource as input
111 else if (fDataSource) {
112 const auto datasourceLabel = fDataSource->GetLabel();
113 return "Dataframe from datasource " + datasourceLabel;
114 }
115 // Trivial/empty datasource
116 else {
117 const auto n = fLoopManager->GetNEmptyEntries();
118 if (n == 1) {
119 return "Empty dataframe filling 1 row";
120 } else {
121 return "Empty dataframe filling " + std::to_string(n) + " rows";
122 }
123 }
124}
125
126ROOT::RDF::RInterfaceBase::RInterfaceBase(std::shared_ptr<RDFDetail::RLoopManager> lm)
127 : fLoopManager(lm), fDataSource(lm->GetDataSource()), fColRegister(lm.get())
128{
130}
131
133 : fLoopManager(std::shared_ptr<ROOT::Detail::RDF::RLoopManager>{&lm, [](ROOT::Detail::RDF::RLoopManager *) {}}),
134 fDataSource(lm.GetDataSource()),
135 fColRegister(colRegister)
136{
137}
138
139/////////////////////////////////////////////////////////////////////////////
140/// \brief Returns the names of the available columns.
141/// \return the container of column names.
142///
143/// This is not an action nor a transformation, just a query to the RDataFrame object.
144///
145/// ### Example usage:
146/// ~~~{.cpp}
147/// auto colNames = d.GetColumnNames();
148/// // Print columns' names
149/// for (auto &&colName : colNames) std::cout << colName << std::endl;
150/// ~~~
151///
153{
154 // there could be duplicates between Redefined columns and columns in the data source
155 std::unordered_set<std::string> allColumns;
156
157 auto addIfNotInternal = [&allColumns](std::string_view colName) {
158 if (!RDFInternal::IsInternalColumn(colName))
159 allColumns.emplace(colName);
160 };
161
162 auto definedColumns = fColRegister.GenerateColumnNames();
163
164 std::for_each(definedColumns.begin(), definedColumns.end(), addIfNotInternal);
165
166 auto tree = fLoopManager->GetTree();
167 if (tree) {
168 for (const auto &bName : RDFInternal::GetBranchNames(*tree, /*allowDuplicates=*/false))
169 allColumns.emplace(bName);
170 }
171
172 if (fDataSource) {
173 for (const auto &s : fDataSource->GetColumnNames()) {
174 if (s.rfind("R_rdf_sizeof", 0) != 0)
175 allColumns.emplace(s);
176 }
177 }
178
179 ColumnNames_t ret(allColumns.begin(), allColumns.end());
180 std::sort(ret.begin(), ret.end());
181 return ret;
182}
183
184/////////////////////////////////////////////////////////////////////////////
185/// \brief Return the type of a given column as a string.
186/// \return the type of the required column.
187///
188/// This is not an action nor a transformation, just a query to the RDataFrame object.
189///
190/// ### Example usage:
191/// ~~~{.cpp}
192/// auto colType = d.GetColumnType("columnName");
193/// // Print column type
194/// std::cout << "Column " << colType << " has type " << colType << std::endl;
195/// ~~~
196///
197std::string ROOT::RDF::RInterfaceBase::GetColumnType(std::string_view column)
198{
199 const auto col = fColRegister.ResolveAlias(column);
200
201 RDFDetail::RDefineBase *define = fColRegister.GetDefine(col);
202
203 const bool convertVector2RVec = true;
204 return RDFInternal::ColumnName2ColumnTypeName(std::string(col), fLoopManager->GetTree(),
205 fLoopManager->GetDataSource(), define, convertVector2RVec);
206}
207
208/////////////////////////////////////////////////////////////////////////////
209/// \brief Return information about the dataframe.
210/// \return information about the dataframe as RDFDescription object
211///
212/// This convenience function describes the dataframe and combines the following information:
213/// - Number of event loops run, see GetNRuns()
214/// - Number of total and defined columns, see GetColumnNames() and GetDefinedColumnNames()
215/// - Column names, see GetColumnNames()
216/// - Column types, see GetColumnType()
217/// - Number of processing slots, see GetNSlots()
218///
219/// This is not an action nor a transformation, just a query to the RDataFrame object.
220/// The result is dependent on the node from which this method is called, e.g. the list of
221/// defined columns returned by GetDefinedColumnNames().
222///
223/// Please note that this is a convenience feature and the layout of the output can be subject
224/// to change and should be parsed via RDFDescription methods.
225///
226/// ### Example usage:
227/// ~~~{.cpp}
228/// RDataFrame df(10);
229/// auto df2 = df.Define("x", "1.f").Define("s", "\"myStr\"");
230/// // Describe the dataframe
231/// df2.Describe().Print()
232/// df2.Describe().Print(/*shortFormat=*/true)
233/// std::cout << df2.Describe().AsString() << std::endl;
234/// std::cout << df2.Describe().AsString(/*shortFormat=*/true) << std::endl;
235/// ~~~
236///
238{
239 // Build set of defined column names to find later in all column names
240 // the defined columns more efficiently
241 const auto columnNames = GetColumnNames();
242 std::set<std::string> definedColumnNamesSet;
243 for (const auto &name : GetDefinedColumnNames())
244 definedColumnNamesSet.insert(name);
245
246 // Get information for the metadata table
247 const std::vector<std::string> metadataProperties = {"Columns in total", "Columns from defines", "Event loops run",
248 "Processing slots"};
249 const std::vector<std::string> metadataValues = {std::to_string(columnNames.size()),
250 std::to_string(definedColumnNamesSet.size()),
251 std::to_string(GetNRuns()), std::to_string(GetNSlots())};
252
253 // Set header for metadata table
254 const auto columnWidthProperties = RDFInternal::GetColumnWidth(metadataProperties);
255 // The column width of the values is required to make right-bound numbers and is equal
256 // to the maximum of the string "Value" and all values to be put in this column.
257 const auto columnWidthValues =
258 std::max(std::max_element(metadataValues.begin(), metadataValues.end())->size(), static_cast<std::size_t>(5u));
259 std::stringstream ss;
260 ss << std::left << std::setw(columnWidthProperties) << "Property" << std::setw(columnWidthValues) << "Value\n"
261 << std::setw(columnWidthProperties) << "--------" << std::setw(columnWidthValues) << "-----\n";
262
263 // Build metadata table
264 // All numbers should be bound to the right and strings bound to the left.
265 for (auto i = 0u; i < metadataProperties.size(); i++) {
266 ss << std::left << std::setw(columnWidthProperties) << metadataProperties[i] << std::right
267 << std::setw(columnWidthValues) << metadataValues[i] << '\n';
268 }
269 ss << '\n'; // put space between this and the next table
270
271 // Set header for columns table
272 const auto columnWidthNames = RDFInternal::GetColumnWidth(columnNames);
273 const auto columnTypes = GetColumnTypeNamesList(columnNames);
274 const auto columnWidthTypes = RDFInternal::GetColumnWidth(columnTypes);
275 ss << std::left << std::setw(columnWidthNames) << "Column" << std::setw(columnWidthTypes) << "Type"
276 << "Origin\n"
277 << std::setw(columnWidthNames) << "------" << std::setw(columnWidthTypes) << "----"
278 << "------\n";
279
280 // Build columns table
281 const auto nCols = columnNames.size();
282 for (auto i = 0u; i < nCols; i++) {
283 auto origin = "Dataset";
284 if (definedColumnNamesSet.find(columnNames[i]) != definedColumnNamesSet.end())
285 origin = "Define";
286 ss << std::left << std::setw(columnWidthNames) << columnNames[i] << std::setw(columnWidthTypes) << columnTypes[i]
287 << origin << '\n';
288 }
289 // Use the string returned from DescribeDataset() as the 'brief' description
290 // Use the converted to string stringstream ss as the 'full' description
291 return RDFDescription(DescribeDataset(), ss.str(), GetNFiles());
292}
293
294/// \brief Returns the names of the defined columns.
295/// \return the container of the defined column names.
296///
297/// This is not an action nor a transformation, just a simple utility to
298/// get the columns names that have been defined up to the node.
299/// If no column has been defined, e.g. on a root node, it returns an
300/// empty collection.
301///
302/// ### Example usage:
303/// ~~~{.cpp}
304/// auto defColNames = d.GetDefinedColumnNames();
305/// // Print defined columns' names
306/// for (auto &&defColName : defColNames) std::cout << defColName << std::endl;
307/// ~~~
308///
310{
311 ColumnNames_t definedColumns;
312
313 const auto columns = fColRegister.BuildDefineNames();
314 for (const auto &column : columns) {
316 definedColumns.emplace_back(column);
317 }
318
319 return definedColumns;
320}
321
322/// \brief Return a descriptor for the systematic variations registered in this branch of the computation graph.
323///
324/// This is not an action nor a transformation, just a simple utility to
325/// inspect the systematic variations that have been registered with Vary() up to this node.
326/// When called on the root node, it returns an empty descriptor.
327///
328/// ### Example usage:
329/// ~~~{.cpp}
330/// auto variations = d.GetVariations();
331/// variations.Print();
332/// ~~~
333///
335{
336 return fColRegister.BuildVariationsDescription();
337}
338
339/// \brief Checks if a column is present in the dataset.
340/// \return true if the column is available, false otherwise
341///
342/// This method checks if a column is part of the input ROOT dataset, has
343/// been defined or can be provided by the data source.
344///
345/// Example usage:
346/// ~~~{.cpp}
347/// ROOT::RDataFrame base(1);
348/// auto rdf = base.Define("definedColumn", [](){return 0;});
349/// rdf.HasColumn("definedColumn"); // true: we defined it
350/// rdf.HasColumn("rdfentry_"); // true: it's always there
351/// rdf.HasColumn("foo"); // false: it is not there
352/// ~~~
353bool ROOT::RDF::RInterfaceBase::HasColumn(std::string_view columnName)
354{
355 if (fColRegister.IsDefineOrAlias(columnName))
356 return true;
357
358 if (fLoopManager->GetTree()) {
359 const auto &branchNames = fLoopManager->GetBranchNames();
360 const auto branchNamesEnd = branchNames.end();
361 if (branchNamesEnd != std::find(branchNames.begin(), branchNamesEnd, columnName))
362 return true;
363 }
364
365 if (fDataSource && fDataSource->HasColumn(columnName))
366 return true;
367
368 return false;
369}
370
371/// \brief Gets the number of data processing slots.
372/// \return The number of data processing slots used by this RDataFrame instance
373///
374/// This method returns the number of data processing slots used by this RDataFrame
375/// instance. This number is influenced by the global switch ROOT::EnableImplicitMT().
376///
377/// Example usage:
378/// ~~~{.cpp}
379/// ROOT::EnableImplicitMT(6)
380/// ROOT::RDataFrame df(1);
381/// std::cout << df.GetNSlots() << std::endl; // prints "6"
382/// ~~~
384{
385 return fLoopManager->GetNSlots();
386}
387
388/// \brief Gets the number of event loops run.
389/// \return The number of event loops run by this RDataFrame instance
390///
391/// This method returns the number of events loops run so far by this RDataFrame instance.
392///
393/// Example usage:
394/// ~~~{.cpp}
395/// ROOT::RDataFrame df(1);
396/// std::cout << df.GetNRuns() << std::endl; // prints "0"
397/// df.Sum("rdfentry_").GetValue(); // trigger the event loop
398/// std::cout << df.GetNRuns() << std::endl; // prints "1"
399/// df.Sum("rdfentry_").GetValue(); // trigger another event loop
400/// std::cout << df.GetNRuns() << std::endl; // prints "2"
401/// ~~~
403{
404 return fLoopManager->GetNRuns();
405}
406
408{
409 std::vector<std::string> types;
410
411 for (auto column : columnList) {
412 types.push_back(GetColumnType(column));
413 }
414 return types;
415}
416
417void ROOT::RDF::RInterfaceBase::CheckIMTDisabled(std::string_view callerName)
418{
420 std::string error(callerName);
421 error += " was called with ImplicitMT enabled, but multi-thread is not supported.";
422 throw std::runtime_error(error);
423 }
424}
425
427{
428 // Entry number column
429 const std::string entryColName = "rdfentry_";
430 const std::string entryColType = "ULong64_t";
431 auto entryColGen = [](unsigned int, ULong64_t entry) { return entry; };
432 using NewColEntry_t = RDFDetail::RDefine<decltype(entryColGen), RDFDetail::ExtraArgsForDefine::SlotAndEntry>;
433
434 auto entryColumn = std::make_shared<NewColEntry_t>(entryColName, entryColType, std::move(entryColGen),
435 ColumnNames_t{}, fColRegister, *fLoopManager);
436 fColRegister.AddDefine(std::move(entryColumn));
437
438 // Slot number column
439 const std::string slotColName = "rdfslot_";
440 const std::string slotColType = "unsigned int";
441 auto slotColGen = [](unsigned int slot) { return slot; };
442 using NewColSlot_t = RDFDetail::RDefine<decltype(slotColGen), RDFDetail::ExtraArgsForDefine::Slot>;
443
444 auto slotColumn = std::make_shared<NewColSlot_t>(slotColName, slotColType, std::move(slotColGen), ColumnNames_t{},
445 fColRegister, *fLoopManager);
446 fColRegister.AddDefine(std::move(slotColumn));
447
448 fColRegister.AddAlias("tdfentry_", entryColName);
449 fColRegister.AddAlias("tdfslot_", slotColName);
450}
unsigned long long ULong64_t
Definition RtypesCore.h:70
char name[80]
Definition TGX11.cxx:110
The head node of a RDF computation graph.
A binder for user-defined columns, variations and aliases.
A DFDescription contains useful information about a given RDataFrame computation graph.
virtual std::size_t GetNFiles() const
Returns the number of files from which the dataset is constructed.
RVariationsDescription GetVariations() const
Return a descriptor for the systematic variations registered in this branch of the computation graph.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
RDFDescription Describe()
Return information about the dataframe.
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
< The RLoopManager at the root of this computation graph. Never null.
unsigned int GetNRuns() const
Gets the number of event loops run.
RDataSource * fDataSource
Non-owning pointer to a data-source object. Null if no data-source. RLoopManager has ownership of the...
ColumnNames_t GetDefinedColumnNames()
Returns the names of the defined columns.
void CheckIMTDisabled(std::string_view callerName)
unsigned int GetNSlots() const
Gets the number of data processing slots.
RInterfaceBase(std::shared_ptr< RDFDetail::RLoopManager > lm)
bool HasColumn(std::string_view columnName)
Checks if a column is present in the dataset.
std::string DescribeDataset() const
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
A descriptor for the systematic variations known to a given RDataFrame node.
A chain is a collection of files containing TTree objects.
Definition TChain.h:33
const Int_t n
Definition legend1.C:16
std::vector< std::string > GetBranchNames(TTree &t, bool allowDuplicates=true)
Get all the branches names, including the ones of the friend trees.
unsigned int GetNSlots()
Definition RDFUtils.cxx:283
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2rvec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:222
unsigned int GetColumnWidth(const std::vector< std::string > &names, const unsigned int minColumnSpace=8u)
Get optimal column width for printing a table given the names and the desired minimal space between c...
Definition RDFUtils.cxx:372
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition RDFUtils.cxx:363
ROOT::TreeUtils::RFriendInfo GetFriendInfo(const TTree &tree, bool retrieveEntries=false)
std::vector< std::string > GetFileNamesFromTree(const TTree &tree)
std::vector< std::string > ColumnNames_t
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:570