Logo ROOT   6.14/05
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2016, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
12 #include <ROOT/RStringView.hxx>
13 #include <RtypesCore.h>
14 #include <TClass.h>
15 #include <TFriendElement.h>
16 #include <TInterpreter.h>
17 #include <TObject.h>
18 #include <TRegexp.h>
19 #include <TString.h>
20 #include <TTree.h>
21 #include <TBranchElement.h>
22 
23 #include <iosfwd>
24 #include <stdexcept>
25 #include <string>
26 #include <typeinfo>
27 
28 namespace ROOT {
29 namespace Detail {
30 namespace RDF {
31 class RCustomColumnBase;
32 class RFilterBase;
33 class RLoopManager;
34 class RRangeBase;
35 } // namespace RDF
36 } // namespace Detail
37 
38 namespace RDF {
39 class RDataSource;
40 } // namespace RDF
41 
42 } // namespace ROOT
43 
44 namespace ROOT {
45 namespace Internal {
46 namespace RDF {
47 
48 // The set here is used as a registry, the real list, which keeps the order, is
49 // the one in the vector
50 class RActionBase;
51 
52 void UpdateList(std::set<std::string> &bNamesReg, ColumnNames_t &bNames, std::string &branchName,
53  std::string &friendName)
54 {
55  if (!friendName.empty()) {
56  // In case of a friend tree, users might prepend its name/alias to the branch names
57  auto friendBName = friendName + "." + branchName;
58  if (bNamesReg.insert(friendBName).second)
59  bNames.push_back(friendBName);
60  }
61 
62  if (bNamesReg.insert(branchName).second)
63  bNames.push_back(branchName);
64 }
65 
66 void ExploreBranch(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames, TBranch *b, std::string prefix,
67  std::string &friendName)
68 {
69  for (auto sb : *b->GetListOfBranches()) {
70  TBranch *subBranch = static_cast<TBranch *>(sb);
71  auto subBranchName = std::string(subBranch->GetName());
72  auto fullName = prefix + subBranchName;
73 
74  std::string newPrefix;
75  if (!prefix.empty())
76  newPrefix = fullName + ".";
77 
78  ExploreBranch(t, bNamesReg, bNames, subBranch, newPrefix, friendName);
79 
80  if (t.GetBranch(fullName.c_str()))
81  UpdateList(bNamesReg, bNames, fullName, friendName);
82  else if (t.GetBranch(subBranchName.c_str()))
83  UpdateList(bNamesReg, bNames, subBranchName, friendName);
84  }
85 }
86 
87 void GetBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
88  std::set<TTree *> &analysedTrees, std::string &friendName)
89 {
90 
91  if (!analysedTrees.insert(&t).second) {
92  return;
93  }
94 
95  const auto branches = t.GetListOfBranches();
96  if (branches) {
97  std::string prefix = "";
98  for (auto b : *branches) {
99  TBranch *branch = static_cast<TBranch *>(b);
100  auto branchName = std::string(branch->GetName());
101  if (branch->IsA() == TBranch::Class()) {
102  // Leaf list
103 
104  auto listOfLeaves = branch->GetListOfLeaves();
105  if (listOfLeaves->GetEntries() == 1) {
106  auto leafName = std::string(listOfLeaves->At(0)->GetName());
107  if (leafName == branchName)
108  UpdateList(bNamesReg, bNames, branchName, friendName);
109  }
110 
111  for (auto leaf : *listOfLeaves) {
112  auto leafName = std::string(leaf->GetName());
113  auto fullName = branchName + "." + leafName;
114  UpdateList(bNamesReg, bNames, fullName, friendName);
115  }
116  } else {
117  // TBranchElement
118  // Check if there is explicit or implicit dot in the name
119 
120  bool dotIsImplied = false;
121  auto be = dynamic_cast<TBranchElement *>(b);
122  if (!be)
123  throw std::runtime_error("GetBranchNames: unsupported branch type");
124  // TClonesArray (3) and STL collection (4)
125  if (be->GetType() == 3 || be->GetType() == 4)
126  dotIsImplied = true;
127 
128  if (dotIsImplied || branchName.back() == '.')
129  ExploreBranch(t, bNamesReg, bNames, branch, "", friendName);
130  else
131  ExploreBranch(t, bNamesReg, bNames, branch, branchName + ".", friendName);
132 
133  UpdateList(bNamesReg, bNames, branchName, friendName);
134  }
135  }
136  }
137 
138  auto friendTrees = t.GetListOfFriends();
139 
140  if (!friendTrees)
141  return;
142 
143  for (auto friendTreeObj : *friendTrees) {
144  auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
145 
146  std::string frName;
147  auto alias = t.GetFriendAlias(friendTree);
148  if (alias != nullptr)
149  frName = std::string(alias);
150  else
151  frName = std::string(friendTree->GetName());
152 
153  GetBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees, frName);
154  }
155 }
156 
157 ///////////////////////////////////////////////////////////////////////////////
158 /// Get all the branches names, including the ones of the friend trees
159 ColumnNames_t GetBranchNames(TTree &t)
160 {
161  std::set<std::string> bNamesSet;
162  ColumnNames_t bNames;
163  std::set<TTree *> analysedTrees;
164  std::string emptyFrName = "";
165  GetBranchNamesImpl(t, bNamesSet, bNames, analysedTrees, emptyFrName);
166  return bNames;
167 }
168 
169 void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
170  std::set<TTree *> &analysedTrees)
171 {
172 
173  if (!analysedTrees.insert(&t).second) {
174  return;
175  }
176 
177  auto branches = t.GetListOfBranches();
178  if (branches) {
179  for (auto branchObj : *branches) {
180  auto name = branchObj->GetName();
181  if (bNamesReg.insert(name).second) {
182  bNames.emplace_back(name);
183  }
184  }
185  }
186 
187  auto friendTrees = t.GetListOfFriends();
188 
189  if (!friendTrees)
190  return;
191 
192  for (auto friendTreeObj : *friendTrees) {
193  auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
194  GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees);
195  }
196 }
197 
198 ///////////////////////////////////////////////////////////////////////////////
199 /// Get all the top-level branches names, including the ones of the friend trees
200 ColumnNames_t GetTopLevelBranchNames(TTree &t)
201 {
202  std::set<std::string> bNamesSet;
203  ColumnNames_t bNames;
204  std::set<TTree *> analysedTrees;
205  GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
206  return bNames;
207 }
208 
/// Check whether `var` is lexically usable as a C++ identifier.
///
/// A name is accepted if it is not empty, starts with an ASCII letter or an underscore and contains
/// only ASCII letters, digits and underscores. C++ keywords are not rejected by this check.
/// \param[in] var Candidate name.
/// \return true if `var` satisfies the rules above, false otherwise.
bool IsValidCppVarName(const std::string &var)
{
   if (var.empty())
      return false;

   const auto isLetterOrUnderscore = [](char c) {
      return c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
   };
   const auto isDigit = [](char c) { return c >= '0' && c <= '9'; };

   // first character must be either a letter or an underscore
   if (!isLetterOrUnderscore(var.front()))
      return false;

   // all characters must be either a letter, an underscore or a number
   for (const char c : var)
      if (!isLetterOrUnderscore(c) && !isDigit(c))
         return false;

   return true;
}
230 
231 void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
232  const ColumnNames_t &dataSourceColumns)
233 {
234  const std::string definedColStr(definedCol);
235 
236  if (!IsValidCppVarName(definedColStr)) {
237  const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
238  throw std::runtime_error(msg);
239  }
240 
241  if (treePtr != nullptr) {
242  // check if definedCol is already present in TTree
243  const auto branch = treePtr->GetBranch(definedColStr.c_str());
244  if (branch != nullptr) {
245  const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
246  throw std::runtime_error(msg);
247  }
248  }
249  // check if definedCol has already been `Define`d in the functional graph
250  if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
251  const auto msg = "Redefinition of column \"" + definedColStr + "\"";
252  throw std::runtime_error(msg);
253  }
254  // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
255  if (!dataSourceColumns.empty()) {
256  if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
257  const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
258  throw std::runtime_error(msg);
259  }
260  }
261 }
262 
/// Check that as many template parameters as column names have been specified.
///
/// \param[in] nTemplateParams Number of template parameters specified.
/// \param[in] nColumnNames Number of column names specified.
/// \throws std::runtime_error if the two numbers differ.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   std::string err_msg = "The number of template parameters specified is ";
   err_msg += std::to_string(nTemplateParams);
   err_msg += " while ";
   err_msg += std::to_string(nColumnNames);
   err_msg += " columns have been specified.";
   throw std::runtime_error(err_msg);
}
274 
275 /// Choose between local column names or default column names, throw in case of errors.
276 const ColumnNames_t
277 SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
278 {
279  if (names.empty()) {
280  // use default column names
281  if (defaultNames.size() < nRequiredNames)
282  throw std::runtime_error(
283  std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
284  " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
285  // return first nRequiredNames default column names
286  return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
287  } else {
288  // use column names provided by the user to this particular transformation/action
289  if (names.size() != nRequiredNames) {
290  auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
291  " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
292  " provided:";
293  for (const auto &name : names)
294  msg += " \"" + name + "\",";
295  msg.back() = '.';
296  throw std::runtime_error(msg);
297  }
298  return names;
299  }
300 }
301 
302 ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
303  const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
304 {
305  ColumnNames_t unknownColumns;
306  for (auto &column : requiredCols) {
307  const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
308  if (isBranch)
309  continue;
310  const auto isCustomColumn = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
311  if (isCustomColumn)
312  continue;
313  const auto isDataSourceColumn =
314  std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
315  if (isDataSourceColumn)
316  continue;
317  unknownColumns.emplace_back(column);
318  }
319  return unknownColumns;
320 }
321 
/// Tell whether a column name follows the pattern used for internal columns.
///
/// \param[in] colName Column name to check.
/// \return true if `colName` starts with "tdf" and ends with an underscore, false otherwise.
bool IsInternalColumn(std::string_view colName)
{
   // back() is only evaluated if the name starts with "tdf", i.e. if it is not empty
   return 0 == colName.find("tdf") && '_' == colName.back();
}
326 
/// Replace all the occurrences of a string by another string
///
/// \param[in,out] s String modified in place.
/// \param[in] what Substring to look for.
/// \param[in] withWhat Replacement text.
/// \return The number of replacements performed.
///
/// The search restarts after the inserted text, so occurrences of `what` inside `withWhat` are not
/// replaced again. An empty `what` results in no replacement.
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
{
   // An empty pattern matches at every position: without this guard the loop below would never end
   if (what.empty())
      return 0U;

   size_t idx = 0;
   auto numReplacements = 0U;
   while ((idx = s.find(what, idx)) != std::string::npos) {
      s.replace(idx, what.size(), withWhat);
      idx += withWhat.size();
      ++numReplacements;
   }
   return numReplacements;
}
339 
340 // Match expression against names of branches passed as parameter
341 // Return vector of names of the branches used in the expression
342 std::vector<std::string> FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches,
343  const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns,
344  const std::map<std::string, std::string> &aliasMap)
345 {
346  // To help matching the regex
347  const std::string paddedExpr = " " + std::string(expression) + " ";
348  static const std::string regexBit("[^a-zA-Z0-9_]");
349  Ssiz_t matchedLen;
350 
351  std::vector<std::string> usedBranches;
352 
353  // Check which custom columns match
354  for (auto &brName : customColumns) {
355  std::string bNameRegexContent = regexBit + brName + regexBit;
356  TRegexp bNameRegex(bNameRegexContent.c_str());
357  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
358  usedBranches.emplace_back(brName);
359  }
360  }
361 
362  // Check which tree branches match
363  for (auto &brName : branches) {
364  // Replace "." with "\." for a correct match of sub-branches/leaves
365  auto escapedBrName = brName;
366  Replace(escapedBrName, std::string("."), std::string("\\."));
367  std::string bNameRegexContent = regexBit + escapedBrName + regexBit;
368  TRegexp bNameRegex(bNameRegexContent.c_str());
369  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
370  usedBranches.emplace_back(brName);
371  }
372  }
373 
374  // Check which data-source columns match
375  for (auto &col : dsColumns) {
376  std::string bNameRegexContent = regexBit + col + regexBit;
377  TRegexp bNameRegex(bNameRegexContent.c_str());
378  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
379  // if not already found among the other columns
380  if (std::find(usedBranches.begin(), usedBranches.end(), col) == usedBranches.end())
381  usedBranches.emplace_back(col);
382  }
383  }
384 
385  // Check which aliases match
386  for (auto &alias_colName : aliasMap) {
387  auto &alias = alias_colName.first;
388  std::string bNameRegexContent = regexBit + alias + regexBit;
389  TRegexp bNameRegex(bNameRegexContent.c_str());
390  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
391  // if not already found among the other columns
392  if (std::find(usedBranches.begin(), usedBranches.end(), alias) == usedBranches.end())
393  usedBranches.emplace_back(alias);
394  }
395  }
396 
397  return usedBranches;
398 }
399 
400 // TODO we should also replace other invalid chars, like '[],' and spaces
401 std::vector<std::string> ReplaceDots(const ColumnNames_t &colNames)
402 {
403  std::vector<std::string> dotlessNames = colNames;
404  for (auto &c : dotlessNames) {
405  const bool hasDot = c.find_first_of('.') != std::string::npos;
406  if (hasDot) {
407  std::replace(c.begin(), c.end(), '.', '_');
408  c.insert(0u, "__tdf_arg_");
409  }
410  }
411  return dotlessNames;
412 }
413 
414 // TODO comment well -- there is a lot going on in this function in terms of side-effects
415 std::vector<std::string> ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames,
416  const std::map<std::string, std::string> &aliasMap,
417  const ColumnNames_t &customColNames, TTree *tree, RDataSource *ds,
418  std::string &expr, unsigned int namespaceID)
419 {
420  std::vector<std::string> colTypes;
421  colTypes.reserve(colNames.size());
422  const auto aliasMapEnd = aliasMap.end();
423 
424  for (auto c = colNames.begin(), v = varNames.begin(); c != colNames.end();) {
425  const auto &colName = *c;
426 
427  if (colName.find('.') != std::string::npos) {
428  // If the column name contains dots, replace its name in the expression with the corresponding varName
429  auto numRepl = Replace(expr, colName, *v);
430  if (numRepl == 0) {
431  // Discard this column: we could not replace it, although we matched it previously
432  // This is because it is a substring of a column we already replaced in the expression
433  // e.g. "a.b" is a substring column of "a.b.c"
434  c = colNames.erase(c);
435  v = varNames.erase(v);
436  continue;
437  }
438  } else {
439  // Column name with no dots: check the name is still there
440  // it might have only been there as part of a column name with dots, e.g. "a" inside "a.b.c"
441  const auto paddedExpr = " " + expr + " ";
442  static const std::string noWordChars("[^a-zA-Z0-9_]");
443  const auto colNameRxBody = noWordChars + colName + noWordChars;
444  TRegexp colNameRegex(colNameRxBody.c_str());
445  Ssiz_t matchedLen;
446  const auto colStillThere = colNameRegex.Index(paddedExpr.c_str(), &matchedLen) != -1;
447  if (!colStillThere) {
448  c = colNames.erase(c);
449  v = varNames.erase(v);
450  continue;
451  }
452  }
453 
454  // Replace the colName with the real one in case colName it's an alias
455  // The real name is used to get the type, but the variable name will still be colName
456  const auto aliasMapIt = aliasMap.find(colName);
457  const auto &realColName = aliasMapEnd == aliasMapIt ? colName : aliasMapIt->second;
458  // The map is a const reference, so no operator[]
459  const auto isCustomCol =
460  std::find(customColNames.begin(), customColNames.end(), realColName) != customColNames.end();
461  const auto colTypeName = ColumnName2ColumnTypeName(realColName, namespaceID, tree, ds, isCustomCol);
462  colTypes.emplace_back(colTypeName);
463  ++c, ++v;
464  }
465 
466  return colTypes;
467 }
468 
469 // Jit expression "in the vacuum", throw if cling exits with an error
470 // This is to make sure that column names, types and expression string are proper C++
471 void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames,
472  const std::vector<std::string> &colTypes, bool hasReturnStmt)
473 {
474  R__ASSERT(colNames.size() == colTypes.size());
475 
476  static unsigned int iNs = 0U;
477  std::stringstream dummyDecl;
478  dummyDecl << "namespace __tdf_" << std::to_string(iNs++) << "{ auto tdf_f = []() {";
479 
480  for (auto col = colNames.begin(), type = colTypes.begin(); col != colNames.end(); ++col, ++type) {
481  dummyDecl << *type << " " << *col << ";\n";
482  }
483 
484  // Now that branches are declared as variables, put the body of the
485  // lambda in dummyDecl and close scopes of f and namespace __tdf_N
486  if (hasReturnStmt)
487  dummyDecl << expression << "\n;};}";
488  else
489  dummyDecl << "return " << expression << "\n;};}";
490 
491  // Try to declare the dummy lambda, error out if it does not compile
492  if (!gInterpreter->Declare(dummyDecl.str().c_str())) {
493  auto msg =
494  "Cannot interpret the following expression:\n" + std::string(expression) + "\n\nMake sure it is valid C++.";
495  throw std::runtime_error(msg);
496  }
497 }
498 
499 std::string
500 BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
501 {
502  R__ASSERT(vars.size() == varTypes.size());
503 
504  std::stringstream ss;
505  ss << "[](";
506  for (auto i = 0u; i < vars.size(); ++i) {
507  // We pass by reference to avoid expensive copies
508  // It can't be const reference in general, as users might want/need to call non-const methods on the values
509  ss << varTypes[i] << "& " << vars[i] << ", ";
510  }
511  if (!vars.empty())
512  ss.seekp(-2, ss.cur);
513 
514  if (hasReturnStmt)
515  ss << "){\n" << expr << "\n}";
516  else
517  ss << "){return " << expr << "\n;}";
518 
519  return ss.str();
520 }
521 
/// Format an address as a hexadecimal number.
///
/// \param[in] addr Address to print.
/// \return The value of the pointer, converted to an integer and written in hexadecimal with
///         `std::showbase` in effect.
std::string PrettyPrintAddr(void *addr)
{
   std::stringstream s;
   // Windows-friendly: print the address as an integer rather than relying on how pointers are streamed
   s << std::hex << std::showbase << reinterpret_cast<size_t>(addr);
   return s.str();
}
529 
530 // Jit a string filter expression and jit-and-call this->Filter with the appropriate arguments
531 // Return pointer to the new functional chain node returned by the call, cast to Long_t
532 void BookFilterJit(RJittedFilter *jittedFilter, void *prevNode, std::string_view prevNodeTypeName,
534  const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &branches,
535  const ColumnNames_t &customCols, TTree *tree, RDataSource *ds, unsigned int namespaceID)
536 {
537  const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
538 
539  // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
540  auto usedBranches = FindUsedColumnNames(expression, branches, customCols, dsColumns, aliasMap);
541  auto varNames = ReplaceDots(usedBranches);
542  auto dotlessExpr = std::string(expression);
543  const auto usedColTypes =
544  ColumnTypesAsString(usedBranches, varNames, aliasMap, customCols, tree, ds, dotlessExpr, namespaceID);
545 
546  TRegexp re("[^a-zA-Z0-9_]return[^a-zA-Z0-9_]");
547  Ssiz_t matchedLen;
548  const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
549 
550  TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
551 
552  const auto filterLambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
553 
554  const auto jittedFilterAddr = PrettyPrintAddr(jittedFilter);
555  const auto prevNodeAddr = PrettyPrintAddr(prevNode);
556 
557  // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
558  // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
559  std::stringstream filterInvocation;
560  filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << filterLambda << ", {";
561  for (const auto &brName : usedBranches) {
562  // Here we selectively replace the brName with the real column name if it's necessary.
563  const auto aliasMapIt = aliasMap.find(brName);
564  auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
565  filterInvocation << "\"" << realBrName << "\", ";
566  }
567  if (!usedBranches.empty())
568  filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
569  filterInvocation << "}, \"" << name << "\", "
570  << "reinterpret_cast<ROOT::Detail::RDF::RJittedFilter*>(" << jittedFilterAddr << "), "
571  << "reinterpret_cast<" << prevNodeTypeName << "*>(" << prevNodeAddr << "));";
572 
573  jittedFilter->GetLoopManagerUnchecked()->ToJit(filterInvocation.str());
574 }
575 
576 // Jit a Define call
578 {
579  const auto &aliasMap = lm.GetAliasMap();
580  auto *const tree = lm.GetTree();
581  const auto branches = tree ? RDFInternal::GetBranchNames(*tree) : ColumnNames_t();
582  const auto &customColumns = lm.GetCustomColumnNames();
583  const auto namespaceID = lm.GetID();
584  const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
585 
586  // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
587  auto usedBranches = FindUsedColumnNames(expression, branches, customColumns, dsColumns, aliasMap);
588  auto varNames = ReplaceDots(usedBranches);
589  auto dotlessExpr = std::string(expression);
590  const auto usedColTypes =
591  ColumnTypesAsString(usedBranches, varNames, aliasMap, customColumns, tree, ds, dotlessExpr, namespaceID);
592 
593  TRegexp re("[^a-zA-Z0-9_]return[^a-zA-Z0-9_]");
594  Ssiz_t matchedLen;
595  const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
596 
597  TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
598 
599  const auto definelambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
600  const auto lambdaName = "eval_" + std::string(name);
601  const auto ns = "__tdf" + std::to_string(namespaceID);
602 
603  // Declare the lambda variable and an alias for the type of the defined column in namespace __tdf
604  // This assumes that a given variable is Define'd once per RDataFrame -- we might want to relax this requirement
605  // to let python users execute a Define cell multiple times
606  const auto defineDeclaration =
607  "namespace " + ns + " { auto " + lambdaName + " = " + definelambda + ";\n" + "using " + std::string(name) +
608  "_type = typename ROOT::TypeTraits::CallableTraits<decltype(" + lambdaName + " )>::ret_type; }\n";
609  gInterpreter->Declare(defineDeclaration.c_str());
610 
611  std::stringstream defineInvocation;
612  defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << definelambda << ", {";
613  for (auto brName : usedBranches) {
614  // Here we selectively replace the brName with the real column name if it's necessary.
615  auto aliasMapIt = aliasMap.find(brName);
616  auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
617  defineInvocation << "\"" << realBrName << "\", ";
618  }
619  if (!usedBranches.empty())
620  defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
621  defineInvocation << "}, \"" << name << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>("
622  << PrettyPrintAddr(&lm) << "));";
623 
624  lm.AddCustomColumnName(name);
625  lm.ToJit(defineInvocation.str());
626 }
627 
628 // Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
629 // (see comments in the body for actual jitted code)
630 std::string JitBuildAndBook(const ColumnNames_t &bl, const std::string &prevNodeTypename, void *prevNode,
631  const std::type_info &art, const std::type_info &at, const void *rOnHeap, TTree *tree,
632  const unsigned int nSlots, const ColumnNames_t &customColumns, RDataSource *ds,
633  const std::shared_ptr<RActionBase *> *const actionPtrPtr, unsigned int namespaceID)
634 {
635  auto nBranches = bl.size();
636 
637  // retrieve branch type names as strings
638  std::vector<std::string> columnTypeNames(nBranches);
639  for (auto i = 0u; i < nBranches; ++i) {
640  const auto isCustomCol = std::find(customColumns.begin(), customColumns.end(), bl[i]) != customColumns.end();
641  const auto columnTypeName = ColumnName2ColumnTypeName(bl[i], namespaceID, tree, ds, isCustomCol);
642  if (columnTypeName.empty()) {
643  std::string exceptionText = "The type of column ";
644  exceptionText += bl[i];
645  exceptionText += " could not be guessed. Please specify one.";
646  throw std::runtime_error(exceptionText.c_str());
647  }
648  columnTypeNames[i] = columnTypeName;
649  }
650 
651  // retrieve type of result of the action as a string
652  auto actionResultTypeClass = TClass::GetClass(art);
653  if (!actionResultTypeClass) {
654  std::string exceptionText = "An error occurred while inferring the result type of an operation.";
655  throw std::runtime_error(exceptionText.c_str());
656  }
657  const auto actionResultTypeName = actionResultTypeClass->GetName();
658 
659  // retrieve type of action as a string
660  auto actionTypeClass = TClass::GetClass(at);
661  if (!actionTypeClass) {
662  std::string exceptionText = "An error occurred while inferring the action type of the operation.";
663  throw std::runtime_error(exceptionText.c_str());
664  }
665  const auto actionTypeName = actionTypeClass->GetName();
666 
667  // createAction_str will contain the following:
668  // ROOT::Internal::RDF::CallBuildAndBook<actionType, branchType1, branchType2...>(
669  // *reinterpret_cast<PrevNodeType*>(prevNode), { bl[0], bl[1], ... }, reinterpret_cast<actionResultType*>(rOnHeap),
670  // reinterpret_cast<shared_ptr<RActionBase*>*>(actionPtrPtr))
671  std::stringstream createAction_str;
672  createAction_str << "ROOT::Internal::RDF::CallBuildAndBook"
673  << "<" << actionTypeName;
674  for (auto &colType : columnTypeNames)
675  createAction_str << ", " << colType;
676  // on Windows, to prefix the hexadecimal value of a pointer with '0x',
677  // one need to write: std::hex << std::showbase << (size_t)pointer
678  createAction_str << ">(*reinterpret_cast<" << prevNodeTypename << "*>(" << std::hex << std::showbase
679  << (size_t)prevNode << "), {";
680  for (auto i = 0u; i < bl.size(); ++i) {
681  if (i != 0u)
682  createAction_str << ", ";
683  createAction_str << '"' << bl[i] << '"';
684  }
685  createAction_str << "}, " << std::dec << std::noshowbase << nSlots << ", reinterpret_cast<" << actionResultTypeName
686  << "*>(" << std::hex << std::showbase << (size_t)rOnHeap << ")"
687  << ", reinterpret_cast<const std::shared_ptr<ROOT::Internal::RDF::RActionBase*>*>(" << std::hex
688  << std::showbase << (size_t)actionPtrPtr << "));";
689  return createAction_str.str();
690 }
691 
/// Tell whether at least one of the strings is empty.
///
/// \param[in] strings Strings to check.
/// \return true if at least one element of `strings` is empty; false otherwise, including when
///         `strings` has no elements.
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   for (const auto &s : strings)
      if (s.empty())
         return true;
   return false;
}
700 
701 /*** Take a shared_ptr<Node<T1,T2,...>> and return a shared_ptr<NodeBase> ***/
702 std::shared_ptr<RFilterBase> UpcastNode(const std::shared_ptr<RFilterBase> ptr)
703 {
704  return ptr;
705 }
706 
707 std::shared_ptr<RCustomColumnBase> UpcastNode(const std::shared_ptr<RCustomColumnBase> ptr)
708 {
709  return ptr;
710 }
711 
712 std::shared_ptr<RRangeBase> UpcastNode(const std::shared_ptr<RRangeBase> ptr)
713 {
714  return ptr;
715 }
716 
717 std::shared_ptr<RLoopManager> UpcastNode(const std::shared_ptr<RLoopManager> ptr)
718 {
719  return ptr;
720 }
721 
722 std::shared_ptr<RJittedFilter> UpcastNode(const std::shared_ptr<RJittedFilter> ptr)
723 {
724  return ptr;
725 }
726 /****************************************************************************/
727 
728 /// Given the desired number of columns and the user-provided list of columns:
729 /// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
730 /// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
731 /// Return the list of selected column names.
732 ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
733  const ColumnNames_t &datasetColumns, const ColumnNames_t &validCustomColumns,
734  RDataSource *ds)
735 {
736  const auto &defaultColumns = lm.GetDefaultColumnNames();
737  auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
738  const auto unknownColumns = FindUnknownColumns(selectedColumns, datasetColumns, validCustomColumns,
739  ds ? ds->GetColumnNames() : ColumnNames_t{});
740 
741  if (!unknownColumns.empty()) {
742  // throw
743  std::stringstream unknowns;
744  std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
745  for (auto &unknownColumn : unknownColumns) {
746  unknowns << delim << unknownColumn;
747  delim = ',';
748  }
749  throw std::runtime_error("Unknown column" + unknowns.str());
750  }
751 
752  // Now we need to check within the aliases if some of the yet unknown names can be recovered
753  auto &aliasMap = lm.GetAliasMap();
754  auto aliasMapEnd = aliasMap.end();
755 
756  for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
757  const auto &colName = selectedColumns[idx];
758  const auto aliasColumnNameIt = aliasMap.find(colName);
759  if (aliasMapEnd != aliasColumnNameIt) {
760  selectedColumns[idx] = aliasColumnNameIt->second;
761  }
762  }
763 
764  return selectedColumns;
765 }
766 
767 /// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
768 /// name of a column that must be defined via datasource. All elements of the returned vector are false if no
769 /// data-source is present.
770 std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
771 {
772  const auto nColumns = requestedCols.size();
773  std::vector<bool> mustBeDefined(nColumns, false);
774  for (auto i = 0u; i < nColumns; ++i)
775  mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
776  return mustBeDefined;
777 }
778 
779 } // namespace RDF
780 } // namespace Internal
781 } // namespace ROOT
std::string JitBuildAndBook(const ColumnNames_t &bl, const std::string &prevNodeTypename, void *prevNode, const std::type_info &art, const std::type_info &at, const void *rOnHeap, TTree *tree, const unsigned int nSlots, const ColumnNames_t &customColumns, RDataSource *ds, const std::shared_ptr< RActionBase *> *const actionPtrPtr, unsigned int namespaceID)
void GetTopLevelBranchNamesImpl(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::set< TTree *> &analysedTrees)
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
void BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds)
void AddCustomColumnName(std::string_view name)
Definition: RDFNodes.hxx:209
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
std::vector< std::string > ReplaceDots(const ColumnNames_t &colNames)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
std::string PrettyPrintAddr(void *addr)
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
bool IsInternalColumn(std::string_view colName)
std::string BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
virtual TList * GetListOfFriends() const
Definition: TTree.h:411
Regular expression class.
Definition: TRegexp.h:31
#define R__ASSERT(e)
Definition: TError.h:96
#define gInterpreter
Definition: TInterpreter.h:527
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:409
void ExploreBranch(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, TBranch *b, std::string prefix, std::string &friendName)
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame...
Definition: RDFNodes.cxx:529
void Class()
Definition: Class.C:29
bool IsValidCppVarName(const std::string &var)
std::shared_ptr< RJittedFilter > UpcastNode(const std::shared_ptr< RJittedFilter > ptr)
A wrapper around a concrete RFilter, which forwards all calls to it RJittedFilter is the type of the ...
Definition: RDFNodes.hxx:611
TObjArray * GetListOfBranches()
Definition: TBranch.h:201
std::vector< std::string > FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches, const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns, const std::map< std::string, std::string > &aliasMap)
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
unsigned int GetID() const
Definition: RDFNodes.hxx:212
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5017
RLoopManager * GetLoopManagerUnchecked() const
Definition: RDFNodes.cxx:88
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
const ColumnNames_t & GetCustomColumnNames() const
Definition: RDFNodes.hxx:185
SVector< double, 2 > v
Definition: Dict.h:5
void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames, const std::vector< std::string > &colTypes, bool hasReturnStmt)
void BookFilterJit(RJittedFilter *jittedFilter, void *prevNode, std::string_view prevNodeTypeName, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const ColumnNames_t &customCols, TTree *tree, RDataSource *ds, unsigned int namespaceID)
void GetBranchNamesImpl(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::set< TTree *> &analysedTrees, std::string &friendName)
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &datasetColumns, const ColumnNames_t &validCustomColumns, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
A Branch for the case of an object.
int Ssiz_t
Definition: RtypesCore.h:63
std::string ColumnName2ColumnTypeName(const std::string &colName, unsigned int namespaceID, TTree *tree, RDataSource *ds, bool isCustomColumn, bool vector2tvec)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:184
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
int type
Definition: TGX11.cxx:120
basic_string_view< char > string_view
Definition: RStringView.hxx:35
static constexpr double s
std::vector< std::string > ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &customColNames, TTree *tree, RDataSource *ds, std::string &expr, unsigned int namespaceID)
TObjArray * GetListOfLeaves()
Definition: TBranch.h:202
void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const ColumnNames_t &dataSourceColumns)
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2887
void UpdateList(std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::string &branchName, std::string &friendName)
void ToJit(const std::string &s)
Definition: RDFNodes.hxx:205
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
A TFriendElement TF describes a TTree object TF in a file.
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
#define c(i)
Definition: RSha256.hxx:101
Definition: tree.py:1
A TTree object has a header with a name and a title.
Definition: TTree.h:70
const std::map< std::string, std::string > & GetAliasMap() const
Definition: RDFNodes.hxx:210
A TTree is a list of TBranches.
Definition: TBranch.h:62
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
Definition: RDataSource.hxx:91
static constexpr double ns
char name[80]
Definition: TGX11.cxx:109
ColumnNames_t GetBranchNames(TTree &t)
Get all the branches names, including the ones of the friend trees.
virtual const char * GetFriendAlias(TTree *) const
If the 'tree' is a friend, this method returns its alias name.
Definition: TTree.cxx:5751