Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDFUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#include "RConfigure.h" // R__USE_IMT
12#include "ROOT/RDataSource.hxx"
15#include "ROOT/RDF/Utils.hxx"
16#include "ROOT/RLogger.hxx"
17#include "RtypesCore.h"
18#include "TBranch.h"
19#include "TBranchElement.h"
20#include "TClass.h"
21#include "TClassEdit.h"
22#include "TClassRef.h"
23#include "TError.h" // Info
24#include "TInterpreter.h"
25#include "TLeaf.h"
26#include "TROOT.h" // IsImplicitMTEnabled, GetThreadPoolSize
27#include "TTree.h"
28
29#include <stdexcept>
30#include <string>
31#include <cstring>
32#include <typeinfo>
33
34using namespace ROOT::Detail::RDF;
35using namespace ROOT::RDF;
36
38{
39 static ROOT::Experimental::RLogChannel c("ROOT.RDF");
40 return c;
41}
42
43namespace ROOT {
44namespace Internal {
45namespace RDF {
46
47/// Return the type_info associated to a name. If the association fails, an
48/// exception is thrown.
49/// References and pointers are not supported since those cannot be stored in
50/// columns.
51const std::type_info &TypeName2TypeID(const std::string &name)
52{
53 if (auto c = TClass::GetClass(name.c_str())) {
54 if (!c->GetTypeInfo()) {
55 std::string msg("Cannot extract type_info of type ");
56 msg += name.c_str();
57 msg += ".";
58 throw std::runtime_error(msg);
59 }
60 return *c->GetTypeInfo();
61 } else if (name == "char" || name == "Char_t")
62 return typeid(char);
63 else if (name == "unsigned char" || name == "UChar_t")
64 return typeid(unsigned char);
65 else if (name == "int" || name == "Int_t")
66 return typeid(int);
67 else if (name == "unsigned int" || name == "UInt_t")
68 return typeid(unsigned int);
69 else if (name == "short" || name == "Short_t")
70 return typeid(short);
71 else if (name == "unsigned short" || name == "UShort_t")
72 return typeid(unsigned short);
73 else if (name == "long" || name == "Long_t")
74 return typeid(long);
75 else if (name == "unsigned long" || name == "ULong_t")
76 return typeid(unsigned long);
77 else if (name == "double" || name == "Double_t")
78 return typeid(double);
79 else if (name == "float" || name == "Float_t")
80 return typeid(float);
81 else if (name == "long long" || name == "long long int" || name == "Long64_t")
82 return typeid(Long64_t);
83 else if (name == "unsigned long long" || name == "unsigned long long int" || name == "ULong64_t")
84 return typeid(ULong64_t);
85 else if (name == "bool" || name == "Bool_t")
86 return typeid(bool);
87 else {
88 std::string msg("Cannot extract type_info of type ");
89 msg += name.c_str();
90 msg += ".";
91 throw std::runtime_error(msg);
92 }
93}
94
95/// Returns the name of a type starting from its type_info
96/// An empty string is returned in case of failure
97/// References and pointers are not supported since those cannot be stored in
98/// columns.
99std::string TypeID2TypeName(const std::type_info &id)
100{
101 if (auto c = TClass::GetClass(id)) {
102 return c->GetName();
103 } else if (id == typeid(char))
104 return "char";
105 else if (id == typeid(unsigned char))
106 return "unsigned char";
107 else if (id == typeid(int))
108 return "int";
109 else if (id == typeid(unsigned int))
110 return "unsigned int";
111 else if (id == typeid(short))
112 return "short";
113 else if (id == typeid(unsigned short))
114 return "unsigned short";
115 else if (id == typeid(long))
116 return "long";
117 else if (id == typeid(unsigned long))
118 return "unsigned long";
119 else if (id == typeid(double))
120 return "double";
121 else if (id == typeid(float))
122 return "float";
123 else if (id == typeid(Long64_t))
124 return "Long64_t";
125 else if (id == typeid(ULong64_t))
126 return "ULong64_t";
127 else if (id == typeid(bool))
128 return "bool";
129 else
130 return "";
131}
132
/// Build the type name "ROOT::VecOps::RVec<valueType>" for the given element type name.
/// The element type name is inserted verbatim between the angle brackets.
std::string ComposeRVecTypeName(const std::string &valueType)
{
   std::string rvecName("ROOT::VecOps::RVec<");
   rvecName += valueType;
   rvecName += ">";
   return rvecName;
}
137
138std::string GetLeafTypeName(TLeaf *leaf, const std::string &colName)
139{
140 const char *colTypeCStr = leaf->GetTypeName();
141 std::string colType = colTypeCStr == nullptr ? "" : colTypeCStr;
142 if (colType.empty())
143 throw std::runtime_error("Could not deduce type of leaf " + colName);
144 if (leaf->GetLeafCount() != nullptr && leaf->GetLenStatic() == 1) {
145 // this is a variable-sized array
146 colType = ComposeRVecTypeName(colType);
147 } else if (leaf->GetLeafCount() == nullptr && leaf->GetLenStatic() > 1) {
148 // this is a fixed-sized array (we do not differentiate between variable- and fixed-sized arrays)
149 colType = ComposeRVecTypeName(colType);
150 } else if (leaf->GetLeafCount() != nullptr && leaf->GetLenStatic() > 1) {
151 // we do not know how to deal with this branch
152 throw std::runtime_error("TTree leaf " + colName +
153 " has both a leaf count and a static length. This is not supported.");
154 }
155
156 return colType;
157}
158
159/// Return the typename of object colName stored in t, if any. Return an empty string if colName is not in t.
160/// Supported cases:
161/// - leaves corresponding to single values, variable- and fixed-length arrays, with following syntax:
162/// - "leafname", as long as TTree::GetLeaf resolves it
163/// - "b1.b2...leafname", as long as TTree::GetLeaf("b1.b2....", "leafname") resolves it
164/// - TBranchElements, as long as TTree::GetBranch resolves their names
165std::string GetBranchOrLeafTypeName(TTree &t, const std::string &colName)
166{
167 // look for TLeaf either with GetLeaf(colName) or with GetLeaf(branchName, leafName) (splitting on last dot)
168 auto *leaf = t.GetLeaf(colName.c_str());
169 if (!leaf)
170 leaf = t.FindLeaf(colName.c_str()); // try harder
171 if (!leaf) {
172 // try splitting branchname and leafname
173 const auto dotPos = colName.find_last_of('.');
174 const auto hasDot = dotPos != std::string::npos;
175 if (hasDot) {
176 const auto branchName = colName.substr(0, dotPos);
177 const auto leafName = colName.substr(dotPos + 1);
178 leaf = t.GetLeaf(branchName.c_str(), leafName.c_str());
179 }
180 }
181 if (leaf)
182 return GetLeafTypeName(leaf, std::string(leaf->GetFullName()));
183
184 // we could not find a leaf named colName, so we look for a branch called like this
185 auto branch = t.GetBranch(colName.c_str());
186 if (!branch)
187 branch = t.FindBranch(colName.c_str()); // try harder
188 if (branch) {
189 static const TClassRef tbranchelement("TBranchElement");
190 if (branch->InheritsFrom(tbranchelement)) {
191 auto be = static_cast<TBranchElement *>(branch);
192 if (auto currentClass = be->GetCurrentClass())
193 return currentClass->GetName();
194 else {
195 // Here we have a special case for getting right the type of data members
196 // of classes sorted in TClonesArrays: ROOT-9674
197 auto mother = be->GetMother();
198 if (mother && mother->InheritsFrom(tbranchelement) && mother != be) {
199 auto beMom = static_cast<TBranchElement *>(mother);
200 auto beMomClass = beMom->GetClass();
201 if (beMomClass && 0 == std::strcmp("TClonesArray", beMomClass->GetName()))
202 return be->GetTypeName();
203 }
204 return be->GetClassName();
205 }
206 } else if (branch->IsA() == TBranch::Class() && branch->GetListOfLeaves()->GetEntriesUnsafe() == 1) {
207 // normal branch (not a TBranchElement): if it has only one leaf, we pick the type of the leaf:
208 // RDF and TTreeReader allow referring to branch.leaf as just branch if branch has only one leaf
209 leaf = static_cast<TLeaf *>(branch->GetListOfLeaves()->UncheckedAt(0));
210 return GetLeafTypeName(leaf, std::string(leaf->GetFullName()));
211 }
212 }
213
214 // we could not find a branch or a leaf called colName
215 return std::string();
216}
217
218/// Return a string containing the type of the given branch. Works both with real TTree branches and with temporary
219/// column created by Define. Throws if type name deduction fails.
220/// Note that for fixed- or variable-sized c-style arrays the returned type name will be RVec<T>.
221/// vector2rvec specifies whether typename 'std::vector<T>' should be converted to 'RVec<T>' or returned as is
222std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RDefineBase *define,
223 bool vector2rvec)
224{
225 std::string colType;
226
227 // must check defines first: we want Redefines to have precedence over everything else
228 if (define) {
229 colType = define->GetTypeName();
230 } else if (ds && ds->HasColumn(colName)) {
231 colType = ds->GetTypeName(colName);
232 } else if (tree) {
233 colType = GetBranchOrLeafTypeName(*tree, colName);
234 if (vector2rvec && TClassEdit::IsSTLCont(colType) == ROOT::ESTLType::kSTLvector) {
235 std::vector<std::string> split;
236 int dummy;
237 TClassEdit::GetSplit(colType.c_str(), split, dummy);
238 auto &valueType = split[1];
239 colType = ComposeRVecTypeName(valueType);
240 }
241 }
242
243 if (colType.empty())
244 throw std::runtime_error("Column \"" + colName +
245 "\" is not in a dataset and is not a custom column been defined.");
246
247 return colType;
248}
249
/// Convert a type name (e.g. "Float_t") to the ROOT type code (e.g. 'F') -- see TBranch documentation.
/// Both the ROOT typedef and the C++ spellings listed in the table below are recognised.
/// Return a space ' ' in case no match was found.
char TypeName2ROOTTypeName(const std::string &b)
{
   struct NameToCode {
      const char *fName;
      char fCode;
   };
   static constexpr NameToCode kKnownTypes[] = {
      {"Char_t", 'B'},    {"char", 'B'},
      {"UChar_t", 'b'},   {"unsigned char", 'b'},
      {"Short_t", 'S'},   {"short", 'S'},              {"short int", 'S'},
      {"UShort_t", 's'},  {"unsigned short", 's'},     {"unsigned short int", 's'},
      {"Int_t", 'I'},     {"int", 'I'},
      {"UInt_t", 'i'},    {"unsigned", 'i'},           {"unsigned int", 'i'},
      {"Float_t", 'F'},   {"float", 'F'},
      {"Double_t", 'D'},  {"double", 'D'},
      {"Long64_t", 'L'},  {"long long", 'L'},          {"long long int", 'L'},
      {"ULong64_t", 'l'}, {"unsigned long long", 'l'}, {"unsigned long long int", 'l'},
      {"Long_t", 'G'},    {"long", 'G'},               {"long int", 'G'},
      {"ULong_t", 'g'},   {"unsigned long", 'g'},      {"unsigned long int", 'g'},
      {"Bool_t", 'O'},    {"bool", 'O'},
   };

   for (const auto &entry : kKnownTypes) {
      if (b == entry.fName)
         return entry.fCode;
   }
   return ' ';
}
282
/// Return the number of processing slots to use.
/// The result is 1 unless ROOT was built with implicit multi-threading support (R__USE_IMT) and
/// implicit multi-threading is currently enabled, in which case it is the size of ROOT's thread pool.
unsigned int GetNSlots()
{
   unsigned int nSlots = 1;
#ifdef R__USE_IMT
   // Only consult the thread pool when IMT is actually switched on: otherwise keep the single slot.
   if (ROOT::IsImplicitMTEnabled())
      nSlots = ROOT::GetThreadPoolSize();
#endif // R__USE_IMT
   return nSlots;
}
292
293/// Replace occurrences of '.' with '_' in each string passed as argument.
294/// An Info message is printed when this happens. Dots at the end of the string are not replaced.
295/// An exception is thrown in case the resulting set of strings would contain duplicates.
296std::vector<std::string> ReplaceDotWithUnderscore(const std::vector<std::string> &columnNames)
297{
298 auto newColNames = columnNames;
299 for (auto &col : newColNames) {
300 const auto dotPos = col.find('.');
301 if (dotPos != std::string::npos && dotPos != col.size() - 1 && dotPos != 0u) {
302 auto oldName = col;
303 std::replace(col.begin(), col.end(), '.', '_');
304 if (std::find(columnNames.begin(), columnNames.end(), col) != columnNames.end())
305 throw std::runtime_error("Column " + oldName + " would be written as " + col +
306 " but this column already exists. Please use Alias to select a new name for " +
307 oldName);
308 Info("Snapshot", "Column %s will be saved as %s", oldName.c_str(), col.c_str());
309 }
310 }
311
312 return newColNames;
313}
314
315void InterpreterDeclare(const std::string &code)
316{
317 R__LOG_DEBUG(10, RDFLogChannel()) << "Declaring the following code to cling:\n\n" << code << '\n';
318
319 if (!gInterpreter->Declare(code.c_str())) {
320 const auto msg =
321 "\nRDataFrame: An error occurred during just-in-time compilation. The lines above might indicate the cause of "
322 "the crash\n All RDF objects that have not run an event loop yet should be considered in an invalid state.\n";
323 throw std::runtime_error(msg);
324 }
325}
326
327Long64_t InterpreterCalc(const std::string &code, const std::string &context)
328{
329 R__LOG_DEBUG(10, RDFLogChannel()) << "Jitting and executing the following code:\n\n" << code << '\n';
330
331 TInterpreter::EErrorCode errorCode(TInterpreter::kNoError); // storage for cling errors
332
333 auto callCalc = [&errorCode, &context](const std::string &codeSlice) {
334 gInterpreter->Calc(codeSlice.c_str(), &errorCode);
335 if (errorCode != TInterpreter::EErrorCode::kNoError) {
336 std::string msg = "\nAn error occurred during just-in-time compilation";
337 if (!context.empty())
338 msg += " in " + context;
339 msg +=
340 ". The lines above might indicate the cause of the crash\nAll RDF objects that have not run their event "
341 "loop yet should be considered in an invalid state.\n";
342 throw std::runtime_error(msg);
343 }
344 };
345
346 // Call Calc every 1000 newlines in order to avoid jitting a very large function body, which is slow:
347 // see https://github.com/root-project/root/issues/9312 and https://github.com/root-project/root/issues/7604
348 std::size_t substr_start = 0;
349 std::size_t substr_end = 0;
350 while (substr_end != std::string::npos && substr_start != code.size() - 1) {
351 for (std::size_t i = 0u; i < 1000u && substr_end != std::string::npos; ++i) {
352 substr_end = code.find('\n', substr_end + 1);
353 }
354 const std::string subs = code.substr(substr_start, substr_end - substr_start);
355 substr_start = substr_end;
356
357 callCalc(subs);
358 }
359
360 return 0; // we used to forward the return value of Calc, but that's not possible anymore.
361}
362
/// Tell whether colName has the shape of an internal column name: longer than three characters,
/// starting with "rdf" or "tdf" and ending with '_' (e.g. "rdfentry_").
bool IsInternalColumn(std::string_view colName)
{
   // must have at least one more character than the {r,t}df prefix
   if (colName.size() <= 3)
      return false;

   const char first = colName.front();
   if (first != 'r' && first != 't')
      return false;

   // 2nd and 3rd letters are "df"
   if (colName.substr(1, 2) != "df")
      return false;

   return colName.back() == '_';
}
371
/// Compute a column width for printing a table with the given column names.
/// The width is the smallest multiple of minColumnSpace that is strictly larger than the length of
/// the longest name, so that at least one character of padding follows every name.
/// If minColumnSpace is 0 there is no multiple to round to: the length of the longest name is
/// returned as is, instead of dividing by zero.
/// If names is empty the longest length is taken to be 0.
unsigned int GetColumnWidth(const std::vector<std::string> &names, const unsigned int minColumnSpace)
{
   std::size_t longest = 0u;
   for (const auto &name : names) {
      if (name.length() > longest)
         longest = name.length();
   }

   const auto columnWidth = static_cast<unsigned int>(longest);
   if (minColumnSpace == 0u)
      return columnWidth;

   return (columnWidth / minColumnSpace + 1) * minColumnSpace;
}
383
384void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType,
385 const std::string &colName)
386{
387 // Here we compare names and not typeinfos since they may come from two different contexts: a compiled
388 // and a jitted one.
389 const auto diffTypes = (0 != std::strcmp(colType.name(), requestedType.name()));
390 auto inheritedType = [&]() {
391 auto colTClass = TClass::GetClass(colType);
392 return colTClass && colTClass->InheritsFrom(TClass::GetClass(requestedType));
393 };
394
395 if (diffTypes && !inheritedType()) {
396 const auto tName = TypeID2TypeName(requestedType);
397 const auto colTypeName = TypeID2TypeName(colType);
398 std::string errMsg = "RDataFrame: type mismatch: column \"" + colName + "\" is being used as ";
399 if (tName.empty()) {
400 errMsg += requestedType.name();
401 errMsg += " (extracted from type info)";
402 } else {
403 errMsg += tName;
404 }
405 errMsg += " but the Define or Vary node advertises it as ";
406 if (colTypeName.empty()) {
407 auto &id = colType;
408 errMsg += id.name();
409 errMsg += " (extracted from type info)";
410 } else {
411 errMsg += colTypeName;
412 }
413 throw std::runtime_error(errMsg);
414 }
415}
416
/// Return true if str is equal to at least one element of vec, false otherwise.
bool IsStrInVec(const std::string &str, const std::vector<std::string> &vec)
{
   for (const auto &element : vec) {
      if (element == str)
         return true;
   }
   return false;
}
421
422} // end NS RDF
423} // end NS Internal
424} // end NS ROOT
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition RLogger.hxx:365
#define b(i)
Definition RSha256.hxx:100
#define c(i)
Definition RSha256.hxx:101
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:230
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h length
char name[80]
Definition TGX11.cxx:110
#define gInterpreter
std::string GetTypeName() const
A log configuration for a channel, e.g.
Definition RLogger.hxx:101
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual bool HasColumn(std::string_view colName) const =0
Checks if the dataset has a certain column.
virtual std::string GetTypeName(std::string_view colName) const =0
Type of a column as a string, e.g.
A Branch for the case of an object.
virtual TClass * GetClass() const
static TClass * Class()
TClassRef is used to implement a permanent reference to a TClass object.
Definition TClassRef.h:28
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2968
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
virtual const char * GetTypeName() const
Definition TLeaf.h:139
virtual TLeaf * GetLeafCount() const
If this leaf stores a variable-sized array or a multi-dimensional array whose last dimension has vari...
Definition TLeaf.h:121
virtual Int_t GetLenStatic() const
Return the fixed length of this leaf.
Definition TLeaf.h:132
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual TBranch * FindBranch(const char *name)
Return the branch that correspond to the path 'branchname', which can include the name of the tree or...
Definition TTree.cxx:4832
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition TTree.cxx:5285
virtual TLeaf * GetLeaf(const char *branchname, const char *leafname)
Return pointer to the 1st Leaf named name in any Branch of this Tree or any branch in the list of fri...
Definition TTree.cxx:6186
virtual TLeaf * FindLeaf(const char *name)
Find leaf..
Definition TTree.cxx:4907
ROOT::Experimental::RLogChannel & RDFLogChannel()
Definition RDFUtils.cxx:37
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.
Definition RDFUtils.cxx:296
const std::type_info & TypeName2TypeID(const std::string &name)
Return the type_info associated to a name.
Definition RDFUtils.cxx:51
unsigned int GetNSlots()
Definition RDFUtils.cxx:283
std::string ComposeRVecTypeName(const std::string &valueType)
Definition RDFUtils.cxx:133
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2rvec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:222
std::string GetLeafTypeName(TLeaf *leaf, const std::string &colName)
Definition RDFUtils.cxx:138
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g.
Definition RDFUtils.cxx:252
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:99
bool IsStrInVec(const std::string &str, const std::vector< std::string > &vec)
Definition RDFUtils.cxx:417
unsigned int GetColumnWidth(const std::vector< std::string > &names, const unsigned int minColumnSpace=8u)
Get optimal column width for printing a table given the names and the desired minimal space between c...
Definition RDFUtils.cxx:372
std::string GetBranchOrLeafTypeName(TTree &t, const std::string &colName)
Return the typename of object colName stored in t, if any.
Definition RDFUtils.cxx:165
Long64_t InterpreterCalc(const std::string &code, const std::string &context="")
Jit code in the interpreter with TInterpreter::Calc, throw in case of errors.
Definition RDFUtils.cxx:327
void CheckReaderTypeMatches(const std::type_info &colType, const std::type_info &requestedType, const std::string &colName)
Definition RDFUtils.cxx:384
bool IsInternalColumn(std::string_view colName)
Whether custom column with name colName is an "internal" column such as rdfentry_ or rdfslot_.
Definition RDFUtils.cxx:363
void InterpreterDeclare(const std::string &code)
Declare code in the interpreter via the TInterpreter::Declare method, throw in case of errors.
Definition RDFUtils.cxx:315
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:558
UInt_t GetThreadPoolSize()
Returns the size of ROOT's thread pool.
Definition TROOT.cxx:565
@ kSTLvector
Definition ESTLType.h:30
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
int GetSplit(const char *type, std::vector< std::string > &output, int &nestedLoc, EModType mode=TClassEdit::kNone)
Stores in output (after emptying it) the split type.
Definition tree.py:1