Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
12#include <ROOT/RDataFrame.hxx>
13#include <ROOT/RStringView.hxx>
14#include <ROOT/TSeq.hxx>
15#include <RtypesCore.h>
16#include <TDirectory.h>
17#include <TChain.h>
18#include <TClass.h>
19#include <TClassEdit.h>
20#include <TFriendElement.h>
21#include <TInterpreter.h>
22#include <TObject.h>
23#include <TPRegexp.h>
24#include <TString.h>
25#include <TTree.h>
26
27// pragma to silence the compiler warnings triggered by lexertk.hpp,
28// which is very noisy when compiled
29#if defined(__GNUC__)
30#pragma GCC diagnostic push
31#pragma GCC diagnostic ignored "-Woverloaded-virtual"
32#pragma GCC diagnostic ignored "-Wshadow"
33#endif
34#include "lexertk.hpp"
35#if defined(__GNUC__)
36#pragma GCC diagnostic pop
37#endif
38
39#include <algorithm>
40#include <set>
41#include <stdexcept>
42#include <string>
43#include <sstream>
44#include <typeinfo>
45
// Forward declarations of RDF types, so that they can be named without pulling in their full definitions.
namespace ROOT {

namespace Detail {
namespace RDF {
class RDefineBase;
class RFilterBase;
class RLoopManager;
class RRangeBase;
} // namespace RDF
} // namespace Detail

namespace RDF {
class RDataSource;
} // namespace RDF

} // namespace ROOT
61
62namespace {
64
65/// A string expression such as those passed to Filter and Define, digested to a standardized form
66struct ParsedExpression {
67 /// The string expression with the dummy variable names in fVarNames in place of the original column names
68 std::string fExpr;
69 /// The list of valid column names that were used in the original string expression.
70 /// Duplicates are removed and column aliases (created with Alias calls) are resolved.
71 ColumnNames_t fUsedCols;
72 /// The list of variable names used in fExpr, with same ordering and size as fUsedCols
73 ColumnNames_t fVarNames;
74};
75
/// Return true if `str` compares equal to at least one element of `vec`.
static bool IsStrInVec(const std::string &str, const std::vector<std::string> &vec)
{
   for (const auto &element : vec) {
      if (element == str)
         return true;
   }
   return false;
}
80
/// Return the column name that `col` is an alias for according to `aliasMap`,
/// or `col` itself if it is not a key of the map.
/// The returned reference points either into `aliasMap` or to `col`: no copy is made.
static const std::string &ResolveAlias(const std::string &col, const std::map<std::string, std::string> &aliasMap)
{
   const auto aliasIt = aliasMap.find(col);
   const bool isAlias = aliasIt != aliasMap.end();
   return isAlias ? aliasIt->second : col;
}
88
89// look at expression `expr` and return a list of column names used, including aliases
90static ColumnNames_t FindUsedColumns(const std::string &expr, const ColumnNames_t &treeBranchNames,
91 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
92 const std::map<std::string, std::string> &aliasMap)
93{
94 ColumnNames_t usedCols;
95
96 lexertk::generator tokens;
97 const auto tokensOk = tokens.process(expr);
98 if (!tokensOk) {
99 const auto msg = "Failed to tokenize expression:\n" + expr + "\n\nMake sure it is valid C++.";
100 throw std::runtime_error(msg);
101 }
102
103 // iterate over tokens in expression and fill usedCols, varNames and exprWithVars
104 const auto nTokens = tokens.size();
105 const auto kSymbol = lexertk::token::e_symbol;
106 for (auto i = 0u; i < nTokens; ++i) {
107 const auto &tok = tokens[i];
108 // lexertk classifies '&' as e_symbol for some reason
109 if (tok.type != kSymbol || tok.value == "&" || tok.value == "|") {
110 // token is not a potential variable name, skip it
111 continue;
112 }
113
114 ColumnNames_t potentialColNames({tok.value});
115
116 // if token is the start of a dot chain (a.b.c...), a.b, a.b.c etc. are also potential column names
117 auto dotChainKeepsGoing = [&](unsigned int _i) {
118 return _i + 2 <= nTokens && tokens[_i + 1].value == "." && tokens[_i + 2].type == kSymbol;
119 };
120 while (dotChainKeepsGoing(i)) {
121 potentialColNames.emplace_back(potentialColNames.back() + "." + tokens[i + 2].value);
122 i += 2; // consume the tokens we looked at
123 }
124
125 // find the longest potential column name that is an actual column name
126 // if it's a new match, also add it to usedCols and update varNames
127 // potential columns are sorted by length, so we search from the end
128 auto isRDFColumn = [&](const std::string &columnOrAlias) {
129 const auto &col = ResolveAlias(columnOrAlias, aliasMap);
130 if (IsStrInVec(col, customColNames) || IsStrInVec(col, treeBranchNames) || IsStrInVec(col, dataSourceColNames))
131 return true;
132 return false;
133 };
134 const auto longestRDFColMatch = std::find_if(potentialColNames.crbegin(), potentialColNames.crend(), isRDFColumn);
135
136 if (longestRDFColMatch != potentialColNames.crend() && !IsStrInVec(*longestRDFColMatch, usedCols)) {
137 // found a new RDF column in the expression (potentially an alias)
138 usedCols.emplace_back(*longestRDFColMatch);
139 }
140 }
141
142 return usedCols;
143}
144
145static ParsedExpression ParseRDFExpression(const std::string &expr, const ColumnNames_t &treeBranchNames,
146 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
147 const std::map<std::string, std::string> &aliasMap)
148{
149 const auto usedColsAndAliases = FindUsedColumns(expr, treeBranchNames, customColNames, dataSourceColNames, aliasMap);
150
151 auto escapeDots = [](const std::string &s) {
152 TString ss(s);
153 TPRegexp dot("\\.");
154 dot.Substitute(ss, "\\.", "g");
155 return std::string(std::move(ss));
156 };
157
158 ColumnNames_t varNames;
159 ColumnNames_t usedCols;
160 TString exprWithVars(expr); // same as expr but column names will be substituted with the variable names in varNames
161 for (const auto &colOrAlias : usedColsAndAliases) {
162 const auto col = ResolveAlias(colOrAlias, aliasMap);
163 unsigned int varIdx; // index of the variable in varName corresponding to col
164 if (!IsStrInVec(col, usedCols)) {
165 usedCols.emplace_back(col);
166 varIdx = varNames.size();
167 varNames.emplace_back("var" + std::to_string(varIdx));
168 } else {
169 // colOrAlias must be an alias that resolves to a column we have already seen.
170 // Find back the corresponding varName
171 varIdx = std::distance(usedCols.begin(), std::find(usedCols.begin(), usedCols.end(), col));
172 }
173 TPRegexp replacer("\\b" + escapeDots(colOrAlias) + "\\b"); // watch out: need to replace colOrAlias, not col
174 replacer.Substitute(exprWithVars, varNames[varIdx], "g");
175 }
176
177 return ParsedExpression{std::string(std::move(exprWithVars)), std::move(usedCols), std::move(varNames)};
178}
179
/// Access the function-local static map of the Filter/Define lambda expressions that have been jitted.
/// The map is used to find out whether an expression has been jitted already and, if so,
/// which jitted variable corresponds to it.
/// Keys are the bodies of the lambda expressions, values are the names of the jitted variables.
/// For example, for:
/// auto lambda1 = [] { return 42; };
/// the key would be "[] { return 42; }" and the value would be "lambda1".
static std::unordered_map<std::string, std::string> &GetJittedExprs()
{
   using ExprToName_t = std::unordered_map<std::string, std::string>;
   static ExprToName_t sJittedExpressions;
   return sJittedExpressions;
}
191
192static std::string
193BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
194{
195 R__ASSERT(vars.size() == varTypes.size());
196
197 TPRegexp re(R"(\breturn\b)");
198 const bool hasReturnStmt = re.Match(expr) == 1;
199
200 static const std::vector<std::string> fundamentalTypes = {
201 "int",
202 "signed",
203 "signed int",
204 "Int_t",
205 "unsigned",
206 "unsigned int",
207 "UInt_t",
208 "double",
209 "Double_t",
210 "float",
211 "Float_t",
212 "char",
213 "Char_t",
214 "unsigned char",
215 "UChar_t",
216 "bool",
217 "Bool_t",
218 "short",
219 "short int",
220 "Short_t",
221 "long",
222 "long int",
223 "long long int",
224 "Long64_t",
225 "unsigned long",
226 "unsigned long int",
227 "ULong64_t",
228 "std::size_t",
229 "size_t",
230 "Ssiz_t"
231 };
232
233 std::stringstream ss;
234 ss << "[](";
235 for (auto i = 0u; i < vars.size(); ++i) {
236 std::string fullType;
237 const auto &type = varTypes[i];
238 if (std::find(fundamentalTypes.begin(), fundamentalTypes.end(), type) != fundamentalTypes.end()) {
239 // pass it by const value to help detect common mistakes such as if(x = 3)
240 fullType = "const " + type + " ";
241 } else {
242 // We pass by reference to avoid expensive copies
243 // It can't be const reference in general, as users might want/need to call non-const methods on the values
244 fullType = type + "& ";
245 }
246 ss << fullType << vars[i] << ", ";
247 }
248 if (!vars.empty())
249 ss.seekp(-2, ss.cur);
250
251 if (hasReturnStmt)
252 ss << "){";
253 else
254 ss << "){return ";
255 ss << expr << "\n;}";
256
257 return ss.str();
258}
259
260/// Declare a lambda expression to the interpreter in namespace __rdf, return the name of the jitted lambda.
261/// If the lambda expression is already in GetJittedExprs, return the name for the lambda that has already been jitted.
262static std::string DeclareLambda(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
263{
265
266 const auto lambdaExpr = BuildLambdaString(expr, vars, varTypes);
267 auto &exprMap = GetJittedExprs();
268 const auto exprIt = exprMap.find(lambdaExpr);
269 if (exprIt != exprMap.end()) {
270 // expression already there
271 const auto lambdaName = exprIt->second;
272 return lambdaName;
273 }
274
275 // new expression
276 const auto lambdaBaseName = "lambda" + std::to_string(exprMap.size());
277 const auto lambdaFullName = "__rdf::" + lambdaBaseName;
278
279 const auto toDeclare = "namespace __rdf {\nauto " + lambdaBaseName + " = " + lambdaExpr + ";\nusing " +
280 lambdaBaseName + "_ret_t = typename ROOT::TypeTraits::CallableTraits<decltype(" +
281 lambdaBaseName + ")>::ret_type;\n}";
283
284 // InterpreterDeclare could throw. If it doesn't, mark the lambda as already jitted
285 exprMap.insert({lambdaExpr, lambdaFullName});
286
287 return lambdaFullName;
288}
289
290/// Each jitted lambda comes with a lambda_ret_t type alias for its return type.
291/// Resolve that alias and return the true type as string.
292static std::string RetTypeOfLambda(const std::string &lambdaName)
293{
294 const auto dt = gROOT->GetType((lambdaName + "_ret_t").c_str());
295 R__ASSERT(dt != nullptr);
296 const auto type = dt->GetFullTypeName();
297 return type;
298}
299
300static void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
301 std::set<TTree *> &analysedTrees, const std::string friendName = "")
302{
303 if (!analysedTrees.insert(&t).second) {
304 return;
305 }
306
307 auto branches = t.GetListOfBranches();
308 if (branches) {
309 for (auto branchObj : *branches) {
310 const auto name = branchObj->GetName();
311 if (bNamesReg.insert(name).second) {
312 bNames.emplace_back(name);
313 } else if (!friendName.empty()) {
314 // If this is a friend and the branch name has already been inserted, it might be because the friend
315 // has a branch with the same name as a branch in the main tree. Let's add it as <friendname>.<branchname>.
316 // If used for a Snapshot, this name will become <friendname>_<branchname> (with an underscore).
317 const auto longName = friendName + "." + name;
318 if (bNamesReg.insert(longName).second)
319 bNames.emplace_back(longName);
320 }
321 }
322 }
323
324 auto friendTrees = t.GetListOfFriends();
325
326 if (!friendTrees)
327 return;
328
329 for (auto friendTreeObj : *friendTrees) {
330 auto friendElement = static_cast<TFriendElement *>(friendTreeObj);
331 auto friendTree = friendElement->GetTree();
332 const std::string frName(friendElement->GetName()); // this gets us the TTree name or the friend alias if any
333 GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees, frName);
334 }
335}
336
/// Return true if `var` is not empty, starts with an ASCII letter or an underscore and
/// contains only ASCII letters, digits and underscores.
static bool IsValidCppVarName(const std::string &var)
{
   const auto isLetterOrUnderscore = [](char c) {
      return c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
   };
   const auto isDigit = [](char c) { return c >= '0' && c <= '9'; };

   // the first character cannot be a digit
   if (var.empty() || !isLetterOrUnderscore(var.front()))
      return false;

   // the remaining characters can be digits too
   return std::all_of(var.begin() + 1, var.end(),
                      [&](char c) { return isLetterOrUnderscore(c) || isDigit(c); });
}
358
359} // anonymous namespace
360
361namespace ROOT {
362namespace Internal {
363namespace RDF {
364
365///////////////////////////////////////////////////////////////////////////////
366/// Get all the top-level branches names, including the ones of the friend trees
368{
369 std::set<std::string> bNamesSet;
370 ColumnNames_t bNames;
371 std::set<TTree *> analysedTrees;
372 GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
373 return bNames;
374}
375
376// The set here is used as a registry, the real list, which keeps the order, is
377// the one in the vector
378class RActionBase;
379
380std::string DemangleTypeIdName(const std::type_info &typeInfo)
381{
382 int dummy(0);
383 char *tn = TClassEdit::DemangleTypeIdName(typeInfo, dummy);
384 std::string tname(tn);
385 free(tn);
386 return tname;
387}
388
390ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
391{
392 const auto theRegexSize = columnNameRegexp.size();
393 std::string theRegex(columnNameRegexp);
394
395 const auto isEmptyRegex = 0 == theRegexSize;
396 // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
397 if (theRegexSize > 0 && theRegex[0] != '^')
398 theRegex = "^" + theRegex;
399 if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
400 theRegex = theRegex + "$";
401
402 ColumnNames_t selectedColumns;
403
404 // Since we support gcc48 and it does not provide in its stl std::regex,
405 // we need to use TPRegexp
406 TPRegexp regexp(theRegex);
407 for (auto &&colName : colNames) {
408 if ((isEmptyRegex || 0 != regexp.Match(colName.c_str())) && !RDFInternal::IsInternalColumn(colName)) {
409 selectedColumns.emplace_back(colName);
410 }
411 }
412
413 if (selectedColumns.empty()) {
414 std::string text(callerName);
415 if (columnNameRegexp.empty()) {
416 text = ": there is no column available to match.";
417 } else {
418 text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
419 }
420 throw std::runtime_error(text);
421 }
422 return selectedColumns;
423}
424
425void CheckDefine(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
426 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &dataSourceColumns)
427{
428 const std::string definedColStr(definedCol);
429
430 if (!IsValidCppVarName(definedColStr)) {
431 const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
432 throw std::runtime_error(msg);
433 }
434
435 if (treePtr != nullptr) {
436 // check if definedCol is already present in TTree
437 const auto branch = treePtr->GetBranch(definedColStr.c_str());
438 if (branch != nullptr) {
439 const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
440 throw std::runtime_error(msg);
441 }
442 }
443 // check if definedCol has already been `Define`d in the functional graph
444 if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
445 const auto msg = "Redefinition of column \"" + definedColStr + "\"";
446 throw std::runtime_error(msg);
447 }
448
449 // Check if the definedCol is an alias
450 const auto aliasColNameIt = aliasMap.find(definedColStr);
451 if (aliasColNameIt != aliasMap.end()) {
452 const auto msg = "An alias with name " + definedColStr + " pointing to column " +
453 aliasColNameIt->second + " is already existing.";
454 throw std::runtime_error(msg);
455 }
456
457 // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
458 if (!dataSourceColumns.empty()) {
459 if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
460 const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
461 throw std::runtime_error(msg);
462 }
463 }
464}
465
/// Throw std::runtime_error if the number of template parameters differs from the number of column names.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   throw std::runtime_error("The number of template parameters specified is " + std::to_string(nTemplateParams) +
                            " while " + std::to_string(nColumnNames) + " columns have been specified.");
}
477
478/// Choose between local column names or default column names, throw in case of errors.
479const ColumnNames_t
480SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
481{
482 if (names.empty()) {
483 // use default column names
484 if (defaultNames.size() < nRequiredNames)
485 throw std::runtime_error(
486 std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
487 " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
488 // return first nRequiredNames default column names
489 return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
490 } else {
491 // use column names provided by the user to this particular transformation/action
492 if (names.size() != nRequiredNames) {
493 auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
494 " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
495 " provided:";
496 for (const auto &name : names)
497 msg += " \"" + name + "\",";
498 msg.back() = '.';
499 throw std::runtime_error(msg);
500 }
501 return names;
502 }
503}
504
505ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
506 const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
507{
508 ColumnNames_t unknownColumns;
509 for (auto &column : requiredCols) {
510 const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
511 if (isBranch)
512 continue;
513 const auto isDefine = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
514 if (isDefine)
515 continue;
516 const auto isDataSourceColumn =
517 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
518 if (isDataSourceColumn)
519 continue;
520 unknownColumns.emplace_back(column);
521 }
522 return unknownColumns;
523}
524
525std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
526{
527 return loopManager->GetFiltersNames();
528}
529
530ParsedTreePath ParseTreePath(std::string_view fullTreeName)
531{
532 // split name into directory and treename if needed
533 std::string_view dirName = "";
534 std::string_view treeName = fullTreeName;
535 const auto lastSlash = fullTreeName.rfind('/');
536 if (std::string_view::npos != lastSlash) {
537 dirName = treeName.substr(0, lastSlash);
538 treeName = treeName.substr(lastSlash + 1, treeName.size());
539 }
540 return {std::string(treeName), std::string(dirName)};
541}
542
/// Format the address `addr` as a hexadecimal number with base prefix (e.g. "0x1234").
/// The pointer is streamed as an integer rather than as a pointer: this is the Windows-friendly way
/// to get the "0x" prefix.
std::string PrettyPrintAddr(const void *const addr)
{
   const auto addrAsInteger = reinterpret_cast<size_t>(addr);
   std::stringstream formatted;
   formatted << std::hex << std::showbase << addrAsInteger;
   return formatted.str();
}
550
551void BookFilterJit(const std::shared_ptr<RJittedFilter> &jittedFilter,
552 std::shared_ptr<RDFDetail::RNodeBase> *prevNodeOnHeap, std::string_view name,
553 std::string_view expression, const std::map<std::string, std::string> &aliasMap,
554 const ColumnNames_t &branches, const RDFInternal::RBookedDefines &customCols, TTree *tree,
555 RDataSource *ds)
556{
557 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
558
559 const auto parsedExpr =
560 ParseRDFExpression(std::string(expression), branches, customCols.GetNames(), dsColumns, aliasMap);
561 const auto exprVarTypes =
562 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Filter", /*vector2rvec=*/true);
563 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
564 const auto type = RetTypeOfLambda(lambdaName);
565 if (type != "bool")
566 std::runtime_error("Filter: the following expression does not evaluate to bool:\n" + std::string(expression));
567
568 // definesOnHeap is deleted by the jitted call to JitFilterHelper
570 const auto definesOnHeapAddr = PrettyPrintAddr(definesOnHeap);
571 const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
572
573 // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
574 // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
575 std::stringstream filterInvocation;
576 filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << lambdaName << ", new const char*["
577 << parsedExpr.fUsedCols.size() << "]{";
578 for (const auto &col : parsedExpr.fUsedCols)
579 filterInvocation << "\"" << col << "\", ";
580 if (!parsedExpr.fUsedCols.empty())
581 filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
582 // lifetime of pointees:
583 // - jittedFilter: heap-allocated weak_ptr to the actual jittedFilter that will be deleted by JitFilterHelper
584 // - prevNodeOnHeap: heap-allocated shared_ptr to the actual previous node that will be deleted by JitFilterHelper
585 // - definesOnHeap: heap-allocated, will be deleted by JitFilterHelper
586 filterInvocation << "}, " << parsedExpr.fUsedCols.size() << ", \"" << name << "\", "
587 << "reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedFilter>*>("
588 << PrettyPrintAddr(MakeWeakOnHeap(jittedFilter)) << "), "
589 << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
590 << "reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesOnHeapAddr << ")"
591 << ");\n";
592
593 auto lm = jittedFilter->GetLoopManagerUnchecked();
594 lm->ToJitExec(filterInvocation.str());
595}
596
597// Jit a Define call
598std::shared_ptr<RJittedDefine> BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm,
599 RDataSource *ds, const RDFInternal::RBookedDefines &customCols,
600 const ColumnNames_t &branches,
601 std::shared_ptr<RNodeBase> *upcastNodeOnHeap)
602{
603 const auto &aliasMap = lm.GetAliasMap();
604 auto *const tree = lm.GetTree();
605 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
606
607 const auto parsedExpr =
608 ParseRDFExpression(std::string(expression), branches, customCols.GetNames(), dsColumns, aliasMap);
609 const auto exprVarTypes =
610 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Define", /*vector2rvec=*/true);
611 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
612 const auto type = RetTypeOfLambda(lambdaName);
613
614 auto definesCopy = new RDFInternal::RBookedDefines(customCols);
615 auto definesAddr = PrettyPrintAddr(definesCopy);
616 auto jittedDefine = std::make_shared<RDFDetail::RJittedDefine>(name, type, lm.GetNSlots(), lm.GetDSValuePtrs());
617
618 std::stringstream defineInvocation;
619 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << lambdaName << ", new const char*["
620 << parsedExpr.fUsedCols.size() << "]{";
621 for (const auto &col : parsedExpr.fUsedCols) {
622 defineInvocation << "\"" << col << "\", ";
623 }
624 if (!parsedExpr.fUsedCols.empty())
625 defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
626 // lifetime of pointees:
627 // - lm is the loop manager, and if that goes out of scope jitting does not happen at all (i.e. will always be valid)
628 // - jittedDefine: heap-allocated weak_ptr that will be deleted by JitDefineHelper after usage
629 // - definesAddr: heap-allocated, will be deleted by JitDefineHelper after usage
630 defineInvocation << "}, " << parsedExpr.fUsedCols.size() << ", \"" << name
631 << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>(" << PrettyPrintAddr(&lm)
632 << "), reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedDefine>*>("
633 << PrettyPrintAddr(MakeWeakOnHeap(jittedDefine))
634 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesAddr
635 << "), reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
636 << PrettyPrintAddr(upcastNodeOnHeap) << "));\n";
637
638 lm.ToJitExec(defineInvocation.str());
639 return jittedDefine;
640}
641
642// Jit and call something equivalent to "this->BuildAndBook<ColTypes...>(params...)"
643// (see comments in the body for actual jitted code)
644std::string JitBuildAction(const ColumnNames_t &cols, std::shared_ptr<RDFDetail::RNodeBase> *prevNode,
645 const std::type_info &helperArgType, const std::type_info &at, void *helperArgOnHeap,
646 TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedDefines &customCols,
647 RDataSource *ds, std::weak_ptr<RJittedAction> *jittedActionOnHeap)
648{
649 // retrieve type of result of the action as a string
650 auto helperArgClass = TClass::GetClass(helperArgType);
651 if (!helperArgClass) {
652 std::string exceptionText = "An error occurred while inferring the result type of an operation.";
653 throw std::runtime_error(exceptionText.c_str());
654 }
655 const auto helperArgClassName = helperArgClass->GetName();
656
657 // retrieve type of action as a string
658 auto actionTypeClass = TClass::GetClass(at);
659 if (!actionTypeClass) {
660 std::string exceptionText = "An error occurred while inferring the action type of the operation.";
661 throw std::runtime_error(exceptionText.c_str());
662 }
663 const std::string actionTypeName = actionTypeClass->GetName();
664 const std::string actionTypeNameBase = actionTypeName.substr(actionTypeName.rfind(':') + 1);
665
666 auto definesCopy = new RDFInternal::RBookedDefines(customCols); // deleted in jitted CallBuildAction
667 auto definesAddr = PrettyPrintAddr(definesCopy);
668
669 // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
670 // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
671 std::stringstream createAction_str;
672 createAction_str << "ROOT::Internal::RDF::CallBuildAction<" << actionTypeName;
673 const auto columnTypeNames =
674 GetValidatedArgTypes(cols, customCols, tree, ds, actionTypeNameBase, /*vector2rvec=*/true);
675 for (auto &colType : columnTypeNames)
676 createAction_str << ", " << colType;
677 // on Windows, to prefix the hexadecimal value of a pointer with '0x',
678 // one need to write: std::hex << std::showbase << (size_t)pointer
679 createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
680 << PrettyPrintAddr(prevNode) << "), new const char*[" << cols.size() << "]{";
681 for (auto i = 0u; i < cols.size(); ++i) {
682 if (i != 0u)
683 createAction_str << ", ";
684 createAction_str << '"' << cols[i] << '"';
685 }
686 createAction_str << "}, " << cols.size() << ", " << nSlots << ", reinterpret_cast<" << helperArgClassName << "*>("
687 << PrettyPrintAddr(helperArgOnHeap)
688 << "), reinterpret_cast<std::weak_ptr<ROOT::Internal::RDF::RJittedAction>*>("
689 << PrettyPrintAddr(jittedActionOnHeap)
690 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedDefines*>(" << definesAddr << "));";
691 return createAction_str.str();
692}
693
/// Return true if at least one of `strings` is empty, false otherwise (also when the list itself is empty).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](std::string_view s) { return s.empty(); });
}
702
703std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
704{
705 return ptr;
706}
707
708/// Given the desired number of columns and the user-provided list of columns:
709/// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
710/// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
711/// * replace column names from aliases by the actual column name
712/// Return the list of selected column names.
713ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
714 const ColumnNames_t &validDefines, RDataSource *ds)
715{
716 const auto &defaultColumns = lm.GetDefaultColumnNames();
717 auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
718 const auto &validBranchNames = lm.GetBranchNames();
719 const auto unknownColumns =
720 FindUnknownColumns(selectedColumns, validBranchNames, validDefines, ds ? ds->GetColumnNames() : ColumnNames_t{});
721
722 if (!unknownColumns.empty()) {
723 // throw
724 std::stringstream unknowns;
725 std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
726 for (auto &unknownColumn : unknownColumns) {
727 unknowns << delim << unknownColumn;
728 delim = ',';
729 }
730 throw std::runtime_error("Unknown column" + unknowns.str());
731 }
732
733 // Now we need to check within the aliases if some of the yet unknown names can be recovered
734 auto &aliasMap = lm.GetAliasMap();
735 auto aliasMapEnd = aliasMap.end();
736
737 for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
738 const auto &colName = selectedColumns[idx];
739 const auto aliasColumnNameIt = aliasMap.find(colName);
740 if (aliasMapEnd != aliasColumnNameIt) {
741 selectedColumns[idx] = aliasColumnNameIt->second;
742 }
743 }
744
745 return selectedColumns;
746}
747
748std::vector<std::string> GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedDefines &defines, TTree *tree,
749 RDataSource *ds, const std::string &context, bool vector2rvec)
750{
751 auto toCheckedArgType = [&](const std::string &c) {
752 RDFDetail::RDefineBase *define = defines.HasName(c) ? defines.GetColumns().at(c).get() : nullptr;
753 const auto colType = ColumnName2ColumnTypeName(c, tree, ds, define, vector2rvec);
754 if (colType.rfind("CLING_UNKNOWN_TYPE", 0) == 0) { // the interpreter does not know this type
755 const auto msg =
756 "The type of custom column \"" + c + "\" (" + colType.substr(19) +
757 ") is not known to the interpreter, but a just-in-time-compiled " + context +
758 " call requires this column. Make sure to create and load ROOT dictionaries for this column's class.";
759 throw std::runtime_error(msg);
760 }
761 return colType;
762 };
763 std::vector<std::string> colTypes;
764 colTypes.reserve(colNames.size());
765 std::transform(colNames.begin(), colNames.end(), std::back_inserter(colTypes), toCheckedArgType);
766 return colTypes;
767}
768
769/// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
770/// name of a column that must be defined via datasource. All elements of the returned vector are false if no
771/// data-source is present.
772std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
773{
774 const auto nColumns = requestedCols.size();
775 std::vector<bool> mustBeDefined(nColumns, false);
776 for (auto i = 0u; i < nColumns; ++i)
777 mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
778 return mustBeDefined;
779}
780
781} // namespace RDF
782} // namespace Internal
783} // namespace ROOT
#define c(i)
Definition RSha256.hxx:101
#define R__ASSERT(e)
Definition TError.h:120
char name[80]
Definition TGX11.cxx:110
int type
Definition TGX11.cxx:121
R__EXTERN TVirtualMutex * gROOTMutex
Definition TROOT.h:63
#define gROOT
Definition TROOT.h:406
#define R__LOCKGUARD(mutex)
#define free
Definition civetweb.c:1539
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
void ToJitExec(const std::string &) const
const std::map< std::string, std::vector< void * > > & GetDSValuePtrs() const
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame.
Encapsulates the columns defined by the user.
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
const RDefineBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2957
virtual const char * GetName() const
Return name of this collection.
A TFriendElement TF describes a TTree object TF in a file.
virtual TTree * GetTree()
Return pointer to friend TTree.
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned, this equals the full match + sub-pattern matches.
Definition TPRegexp.cxx:339
Basic string class.
Definition TString.h:136
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition TTree.cxx:5275
virtual TObjArray * GetListOfBranches()
Definition TTree.h:485
virtual TList * GetListOfFriends() const
Definition TTree.h:487
TText * text
std::vector< std::string > ColumnNames_t
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RDefineBase *define, bool vector2rvec)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:223
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string PrettyPrintAddr(const void *const addr)
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::string JitBuildAction(const ColumnNames_t &cols, std::shared_ptr< RDFDetail::RNodeBase > *prevNode, const std::type_info &helperArgType, const std::type_info &at, void *helperArgOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedDefines &customCols, RDataSource *ds, std::weak_ptr< RJittedAction > *jittedActionOnHeap)
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validDefines, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
bool IsInternalColumn(std::string_view colName)
Definition RDFUtils.cxx:347
void InterpreterDeclare(const std::string &code)
Definition RDFUtils.cxx:318
void CheckDefine(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
std::vector< std::string > GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedDefines &defines, TTree *tree, RDataSource *ds, const std::string &context, bool vector2rvec)
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
void BookFilterJit(const std::shared_ptr< RJittedFilter > &jittedFilter, std::shared_ptr< RDFDetail::RNodeBase > *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedDefines &customCols, TTree *tree, RDataSource *ds)
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RDFInternal::RBookedDefines &customCols, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:195
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
Definition tree.py:1