Logo ROOT  
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
12#include <ROOT/RDataFrame.hxx>
14#include <ROOT/RStringView.hxx>
15#include <ROOT/TSeq.hxx>
16#include <RtypesCore.h>
17#include <TDirectory.h>
18#include <TChain.h>
19#include <TClass.h>
20#include <TClassEdit.h>
21#include <TFriendElement.h>
22#include <TInterpreter.h>
23#include <TObject.h>
24#include <TPRegexp.h>
25#include <TString.h>
26#include <TTree.h>
27
28// pragma to disable the warnings raised by lexertk.hpp, which
29// make the compilation output very noisy
30#if defined(__GNUC__)
31#pragma GCC diagnostic push
32#pragma GCC diagnostic ignored "-Woverloaded-virtual"
33#pragma GCC diagnostic ignored "-Wshadow"
34#endif
35#include "lexertk.hpp"
36#if defined(__GNUC__)
37#pragma GCC diagnostic pop
38#endif
39
40#include <algorithm>
41#include <set>
42#include <stdexcept>
43#include <string>
44#include <sstream>
45#include <typeinfo>
46
47namespace ROOT {
48namespace Detail {
49namespace RDF {
50class RCustomColumnBase;
51class RFilterBase;
52class RLoopManager;
53class RRangeBase;
54} // namespace RDF
55} // namespace Detail
56
57namespace RDF {
58class RDataSource;
59} // namespace RDF
60
61} // namespace ROOT
62
63namespace {
64using ROOT::Detail::RDF::ColumnNames_t;
65
66/// A string expression such as those passed to Filter and Define, digested to a standardized form
67struct ParsedExpression {
68 /// The string expression with the dummy variable names in fVarNames in place of the original column names
69 std::string fExpr;
70 /// The list of valid column names that were used in the original string expression.
71 /// Duplicates are removed and column aliases (created with Alias calls) are resolved.
72 ColumnNames_t fUsedCols;
73 /// The list of variable names used in fExpr, with same ordering and size as fUsedCols
74 ColumnNames_t fVarNames;
75};
76
/// Return true if `str` compares equal to at least one element of `vec` (linear scan).
static bool IsStrInVec(const std::string &str, const std::vector<std::string> &vec)
{
   for (const auto &candidate : vec) {
      if (candidate == str)
         return true;
   }
   return false;
}
81
/// Return the column name that `col` is an alias of, or `col` itself if it is not a key of `aliasMap`.
/// The returned reference points either into `aliasMap` or to the `col` argument, so it must not
/// outlive them.
static const std::string &ResolveAlias(const std::string &col, const std::map<std::string, std::string> &aliasMap)
{
   const auto aliasIt = aliasMap.find(col);
   return aliasIt == aliasMap.end() ? col : aliasIt->second;
}
89
90// look at expression `expr` and return a list of column names used, including aliases
91static ColumnNames_t FindUsedColumns(const std::string &expr, const ColumnNames_t &treeBranchNames,
92 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
93 const std::map<std::string, std::string> &aliasMap)
94{
95 ColumnNames_t usedCols;
96
97 lexertk::generator tokens;
98 const auto tokensOk = tokens.process(expr);
99 if (!tokensOk) {
100 const auto msg = "Failed to tokenize expression:\n" + expr + "\n\nMake sure it is valid C++.";
101 throw std::runtime_error(msg);
102 }
103
104 // iterate over tokens in expression and fill usedCols, varNames and exprWithVars
105 const auto nTokens = tokens.size();
106 const auto kSymbol = lexertk::token::e_symbol;
107 for (auto i = 0u; i < nTokens; ++i) {
108 const auto &tok = tokens[i];
109 // lexertk classifies '&' as e_symbol for some reason
110 if (tok.type != kSymbol || tok.value == "&" || tok.value == "|") {
111 // token is not a potential variable name, skip it
112 continue;
113 }
114
115 ColumnNames_t potentialColNames({tok.value});
116
117 // if token is the start of a dot chain (a.b.c...), a.b, a.b.c etc. are also potential column names
118 auto dotChainKeepsGoing = [&](unsigned int _i) {
119 return _i + 2 <= nTokens && tokens[_i + 1].value == "." && tokens[_i + 2].type == kSymbol;
120 };
121 while (dotChainKeepsGoing(i)) {
122 potentialColNames.emplace_back(potentialColNames.back() + "." + tokens[i + 2].value);
123 i += 2; // consume the tokens we looked at
124 }
125
126 // find the longest potential column name that is an actual column name
127 // if it's a new match, also add it to usedCols and update varNames
128 // potential columns are sorted by length, so we search from the end
129 auto isRDFColumn = [&](const std::string &columnOrAlias) {
130 const auto &col = ResolveAlias(columnOrAlias, aliasMap);
131 if (IsStrInVec(col, customColNames) || IsStrInVec(col, treeBranchNames) || IsStrInVec(col, dataSourceColNames))
132 return true;
133 return false;
134 };
135 const auto longestRDFColMatch = std::find_if(potentialColNames.crbegin(), potentialColNames.crend(), isRDFColumn);
136
137 if (longestRDFColMatch != potentialColNames.crend() && !IsStrInVec(*longestRDFColMatch, usedCols)) {
138 // found a new RDF column in the expression (potentially an alias)
139 usedCols.emplace_back(*longestRDFColMatch);
140 }
141 }
142
143 return usedCols;
144}
145
146static ParsedExpression ParseRDFExpression(const std::string &expr, const ColumnNames_t &treeBranchNames,
147 const ColumnNames_t &customColNames, const ColumnNames_t &dataSourceColNames,
148 const std::map<std::string, std::string> &aliasMap)
149{
150 const auto usedColsAndAliases = FindUsedColumns(expr, treeBranchNames, customColNames, dataSourceColNames, aliasMap);
151
152 auto escapeDots = [](const std::string &s) {
153 TString ss(s);
154 TPRegexp dot("\\.");
155 dot.Substitute(ss, "\\.", "g");
156 return std::string(std::move(ss));
157 };
158
159 ColumnNames_t varNames;
160 ColumnNames_t usedCols;
161 TString exprWithVars(expr); // same as expr but column names will be substituted with the variable names in varNames
162 for (const auto &colOrAlias : usedColsAndAliases) {
163 const auto col = ResolveAlias(colOrAlias, aliasMap);
164 unsigned int varIdx; // index of the variable in varName corresponding to col
165 if (!IsStrInVec(col, usedCols)) {
166 usedCols.emplace_back(col);
167 varIdx = varNames.size();
168 varNames.emplace_back("var" + std::to_string(varIdx));
169 } else {
170 // colOrAlias must be an alias that resolves to a column we have already seen.
171 // Find back the corresponding varName
172 varIdx = std::distance(usedCols.begin(), std::find(usedCols.begin(), usedCols.end(), col));
173 }
174 TPRegexp replacer("\\b" + escapeDots(colOrAlias) + "\\b"); // watch out: need to replace colOrAlias, not col
175 replacer.Substitute(exprWithVars, varNames[varIdx], "g");
176 }
177
178 return ParsedExpression{std::string(std::move(exprWithVars)), std::move(usedCols), std::move(varNames)};
179}
180
/// Access the registry of Filter/Define lambda expressions that have already been jitted.
///
/// The registry is a function-local static, so the same map is returned by every call. It lets
/// callers check whether an expression was jitted before and retrieve the name of the variable it
/// was assigned to. Keys are the full text of the lambda expression, values are the name of the
/// corresponding jitted variable. For example, for
///     auto lambda1 = [] { return 42; };
/// the key would be "[] { return 42; }" and the value would be "lambda1".
static std::unordered_map<std::string, std::string> &GetJittedExprs()
{
   using JittedExprMap_t = std::unordered_map<std::string, std::string>;
   static JittedExprMap_t registry;
   return registry;
}
192
193static std::string
194BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
195{
196 R__ASSERT(vars.size() == varTypes.size());
197
198 TPRegexp re(R"(\breturn\b)");
199 const bool hasReturnStmt = re.Match(expr) == 1;
200
201 std::stringstream ss;
202 ss << "[](";
203 for (auto i = 0u; i < vars.size(); ++i) {
204 // We pass by reference to avoid expensive copies
205 // It can't be const reference in general, as users might want/need to call non-const methods on the values
206 ss << varTypes[i] << "& " << vars[i] << ", ";
207 }
208 if (!vars.empty())
209 ss.seekp(-2, ss.cur);
210
211 if (hasReturnStmt)
212 ss << "){";
213 else
214 ss << "){return ";
215 ss << expr << "\n;}";
216
217 return ss.str();
218}
219
220/// Declare a lambda expression to the interpreter in namespace __rdf, return the name of the jitted lambda.
221/// If the lambda expression is already in GetJittedExprs, return the name for the lambda that has already been jitted.
222static std::string DeclareLambda(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes)
223{
224 const auto lambdaExpr = BuildLambdaString(expr, vars, varTypes);
225 auto &exprMap = GetJittedExprs();
226 const auto exprIt = exprMap.find(lambdaExpr);
227 if (exprIt != exprMap.end()) {
228 // expression already there
229 const auto lambdaName = exprIt->second;
230 return lambdaName;
231 }
232
233 // new expression
234 const auto lambdaBaseName = "lambda" + std::to_string(exprMap.size());
235 const auto lambdaFullName = "__rdf::" + lambdaBaseName;
236
237 const auto toDeclare = "namespace __rdf {\nauto " + lambdaBaseName + " = " + lambdaExpr + ";\nusing " +
238 lambdaBaseName + "_ret_t = typename ROOT::TypeTraits::CallableTraits<decltype(" +
239 lambdaBaseName + ")>::ret_type;\n}";
241
242 // InterpreterDeclare could throw. If it doesn't, mark the lambda as already jitted
243 exprMap.insert({lambdaExpr, lambdaFullName});
244
245 return lambdaFullName;
246}
247
248/// Each jitted lambda comes with a lambda_ret_t type alias for its return type.
249/// Resolve that alias and return the true type as string.
250static std::string RetTypeOfLambda(const std::string &lambdaName)
251{
252 auto *ti = gInterpreter->TypedefInfo_Factory((lambdaName + "_ret_t").c_str());
253 const char *type = gInterpreter->TypedefInfo_TrueName(ti);
254 return type;
255}
256
257
258static void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
259 std::set<TTree *> &analysedTrees)
260{
261
262 if (!analysedTrees.insert(&t).second) {
263 return;
264 }
265
266 auto branches = t.GetListOfBranches();
267 if (branches) {
268 for (auto branchObj : *branches) {
269 auto name = branchObj->GetName();
270 if (bNamesReg.insert(name).second) {
271 bNames.emplace_back(name);
272 }
273 }
274 }
275
276 auto friendTrees = t.GetListOfFriends();
277
278 if (!friendTrees)
279 return;
280
281 for (auto friendTreeObj : *friendTrees) {
282 auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
283 GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees);
284 }
285}
286
287///////////////////////////////////////////////////////////////////////////////
288/// Get all the top-level branches names, including the ones of the friend trees
289static ColumnNames_t GetTopLevelBranchNames(TTree &t)
290{
291 std::set<std::string> bNamesSet;
292 ColumnNames_t bNames;
293 std::set<TTree *> analysedTrees;
294 GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
295 return bNames;
296}
297
/// Return true if `var` is non-empty, starts with an ASCII letter or an underscore, and contains
/// only ASCII letters, digits and underscores.
static bool IsValidCppVarName(const std::string &var)
{
   const auto isLetterOrUnderscore = [](char c) {
      return c == '_' || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
   };
   const auto isIdentifierChar = [&isLetterOrUnderscore](char c) {
      return isLetterOrUnderscore(c) || ('0' <= c && c <= '9');
   };

   // the first character cannot be a digit
   if (var.empty() || !isLetterOrUnderscore(var.front()))
      return false;

   // the remaining characters can be letters, underscores or digits
   return std::all_of(var.begin() + 1, var.end(), isIdentifierChar);
}
319
320} // anonymous namespace
321
322namespace ROOT {
323namespace Internal {
324namespace RDF {
325
326// The set here is used as a registry, the real list, which keeps the order, is
327// the one in the vector
328class RActionBase;
329
330HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols,
331 std::string_view treeName,
332 std::string_view fileName,
333 bool isLazy,
334 RLoopManager &loopManager,
335 std::unique_ptr<RDFInternal::RActionBase> actionPtr)
336{
337 // create new RDF
339 auto snapshotRDF = std::make_shared<ROOT::RDataFrame>(treeName, fileName, validCols);
340 auto snapshotRDFResPtr = MakeResultPtr(snapshotRDF, loopManager, std::move(actionPtr));
341
342 if (!isLazy) {
343 *snapshotRDFResPtr;
344 }
345 return snapshotRDFResPtr;
346}
347
348std::string DemangleTypeIdName(const std::type_info &typeInfo)
349{
350 int dummy(0);
351 return TClassEdit::DemangleTypeIdName(typeInfo, dummy);
352}
353
355 TTree *tree,
356 ROOT::RDF::RDataSource *dataSource,
357 std::string_view columnNameRegexp,
358 std::string_view callerName)
359{
360 const auto theRegexSize = columnNameRegexp.size();
361 std::string theRegex(columnNameRegexp);
362
363 const auto isEmptyRegex = 0 == theRegexSize;
364 // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
365 if (theRegexSize > 0 && theRegex[0] != '^')
366 theRegex = "^" + theRegex;
367 if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
368 theRegex = theRegex + "$";
369
370 ColumnNames_t selectedColumns;
371 selectedColumns.reserve(32);
372
373 // Since we support gcc48 and it does not provide in its stl std::regex,
374 // we need to use TPRegexp
375 TPRegexp regexp(theRegex);
376 for (auto &&branchName : customColumns.GetNames()) {
377 if ((isEmptyRegex || 0 != regexp.Match(branchName.c_str())) &&
378 !RDFInternal::IsInternalColumn(branchName)) {
379 selectedColumns.emplace_back(branchName);
380 }
381 }
382
383 if (tree) {
384 auto branchNames = GetTopLevelBranchNames(*tree);
385 for (auto &branchName : branchNames) {
386 if (isEmptyRegex || 0 != regexp.Match(branchName.c_str())) {
387 selectedColumns.emplace_back(branchName);
388 }
389 }
390 }
391
392 if (dataSource) {
393 auto &dsColNames = dataSource->GetColumnNames();
394 for (auto &dsColName : dsColNames) {
395 if ((isEmptyRegex || 0 != regexp.Match(dsColName.c_str())) &&
396 !RDFInternal::IsInternalColumn(dsColName)) {
397 selectedColumns.emplace_back(dsColName);
398 }
399 }
400 }
401
402 if (selectedColumns.empty()) {
403 std::string text(callerName);
404 if (columnNameRegexp.empty()) {
405 text = ": there is no column available to match.";
406 } else {
407 text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
408 }
409 throw std::runtime_error(text);
410 }
411 return selectedColumns;
412}
413
414void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
415 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &dataSourceColumns)
416{
417 const std::string definedColStr(definedCol);
418
419 if (!IsValidCppVarName(definedColStr)) {
420 const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
421 throw std::runtime_error(msg);
422 }
423
424 if (treePtr != nullptr) {
425 // check if definedCol is already present in TTree
426 const auto branch = treePtr->GetBranch(definedColStr.c_str());
427 if (branch != nullptr) {
428 const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
429 throw std::runtime_error(msg);
430 }
431 }
432 // check if definedCol has already been `Define`d in the functional graph
433 if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
434 const auto msg = "Redefinition of column \"" + definedColStr + "\"";
435 throw std::runtime_error(msg);
436 }
437
438 // Check if the definedCol is an alias
439 const auto aliasColNameIt = aliasMap.find(definedColStr);
440 if (aliasColNameIt != aliasMap.end()) {
441 const auto msg = "An alias with name " + definedColStr + " pointing to column " +
442 aliasColNameIt->second + " is already existing.";
443 throw std::runtime_error(msg);
444 }
445
446 // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
447 if (!dataSourceColumns.empty()) {
448 if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
449 const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
450 throw std::runtime_error(msg);
451 }
452 }
453}
454
/// Throw std::runtime_error if the number of template parameters differs from the number of column
/// names; the message reports both numbers.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   throw std::runtime_error("The number of template parameters specified is " + std::to_string(nTemplateParams) +
                            " while " + std::to_string(nColumnNames) + " columns have been specified.");
}
466
467/// Choose between local column names or default column names, throw in case of errors.
468const ColumnNames_t
469SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
470{
471 if (names.empty()) {
472 // use default column names
473 if (defaultNames.size() < nRequiredNames)
474 throw std::runtime_error(
475 std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
476 " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
477 // return first nRequiredNames default column names
478 return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
479 } else {
480 // use column names provided by the user to this particular transformation/action
481 if (names.size() != nRequiredNames) {
482 auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
483 " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
484 " provided:";
485 for (const auto &name : names)
486 msg += " \"" + name + "\",";
487 msg.back() = '.';
488 throw std::runtime_error(msg);
489 }
490 return names;
491 }
492}
493
494ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
495 const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
496{
497 ColumnNames_t unknownColumns;
498 for (auto &column : requiredCols) {
499 const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
500 if (isBranch)
501 continue;
502 const auto isCustomColumn = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
503 if (isCustomColumn)
504 continue;
505 const auto isDataSourceColumn =
506 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
507 if (isDataSourceColumn)
508 continue;
509 unknownColumns.emplace_back(column);
510 }
511 return unknownColumns;
512}
513
515{
516 const auto str = colName.data();
517 const auto goodPrefix = colName.size() > 3 && // has at least more characters than {r,t}df
518 ('r' == str[0] || 't' == str[0]) && // starts with r or t
519 0 == strncmp("df", str + 1, 2); // 2nd and 3rd letters are df
520 return goodPrefix && '_' == colName.back(); // also ends with '_'
521}
522
523std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
524{
525 return loopManager->GetFiltersNames();
526}
527
/// Format the address `addr` as a hexadecimal number with base prefix (e.g. "0x1234").
/// The pointer is converted to an integer before streaming, which is the Windows-friendly way to get
/// the "0x" notation.
std::string PrettyPrintAddr(const void *const addr)
{
   std::ostringstream out;
   out.setf(std::ios_base::hex, std::ios_base::basefield);
   out.setf(std::ios_base::showbase);
   out << reinterpret_cast<size_t>(addr);
   return out.str();
}
535
536void BookFilterJit(const std::shared_ptr<RJittedFilter> &jittedFilter,
537 std::shared_ptr<RDFDetail::RNodeBase> *prevNodeOnHeap, std::string_view name,
538 std::string_view expression, const std::map<std::string, std::string> &aliasMap,
539 const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree,
540 RDataSource *ds)
541{
542 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
543
544 const auto parsedExpr =
545 ParseRDFExpression(std::string(expression), branches, customCols.GetNames(), dsColumns, aliasMap);
546 const auto exprVarTypes =
547 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Filter", /*vector2rvec=*/true);
548 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
549 const auto type = RetTypeOfLambda(lambdaName);
550 if (type != "bool")
551 std::runtime_error("Filter: the following expression does not evaluate to bool:\n" + std::string(expression));
552
553 // columnsOnHeap is deleted by the jitted call to JitFilterHelper
555 const auto columnsOnHeapAddr = PrettyPrintAddr(columnsOnHeap);
556 const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
557
558 // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
559 // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
560 std::stringstream filterInvocation;
561 filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << lambdaName << ", {";
562 for (const auto &col : parsedExpr.fUsedCols)
563 filterInvocation << "\"" << col << "\", ";
564 if (!parsedExpr.fUsedCols.empty())
565 filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
566 // lifetime of pointees:
567 // - jittedFilter: heap-allocated weak_ptr to the actual jittedFilter that will be deleted by JitFilterHelper
568 // - prevNodeOnHeap: heap-allocated shared_ptr to the actual previous node that will be deleted by JitFilterHelper
569 // - columnsOnHeap: heap-allocated, will be deleted by JitFilterHelper
570 filterInvocation << "}, \"" << name << "\", "
571 << "reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedFilter>*>("
572 << PrettyPrintAddr(MakeWeakOnHeap(jittedFilter)) << "), "
573 << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
574 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << columnsOnHeapAddr << ")"
575 << ");\n";
576
577 auto lm = jittedFilter->GetLoopManagerUnchecked();
578 lm->ToJitExec(filterInvocation.str());
579}
580
581// Jit a Define call
582std::shared_ptr<RJittedCustomColumn> BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm,
583 RDataSource *ds, const RDFInternal::RBookedCustomColumns &customCols,
584 const ColumnNames_t &branches,
585 std::shared_ptr<RNodeBase> *upcastNodeOnHeap)
586{
587 const auto &aliasMap = lm.GetAliasMap();
588 auto *const tree = lm.GetTree();
589 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
590
591 const auto parsedExpr =
592 ParseRDFExpression(std::string(expression), branches, customCols.GetNames(), dsColumns, aliasMap);
593 const auto exprVarTypes =
594 GetValidatedArgTypes(parsedExpr.fUsedCols, customCols, tree, ds, "Define", /*vector2rvec=*/true);
595 const auto lambdaName = DeclareLambda(parsedExpr.fExpr, parsedExpr.fVarNames, exprVarTypes);
596 const auto type = RetTypeOfLambda(lambdaName);
597
598 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols);
599 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
600 auto jittedCustomColumn = std::make_shared<RDFDetail::RJittedCustomColumn>(name, type, lm.GetNSlots());
601
602 std::stringstream defineInvocation;
603 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << lambdaName << ", {";
604 for (const auto &col : parsedExpr.fUsedCols) {
605 defineInvocation << "\"" << col << "\", ";
606 }
607 if (!parsedExpr.fUsedCols.empty())
608 defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
609 // lifetime of pointees:
610 // - lm is the loop manager, and if that goes out of scope jitting does not happen at all (i.e. will always be valid)
611 // - jittedCustomColumn: heap-allocated weak_ptr that will be deleted by JitDefineHelper after usage
612 // - customColumnsAddr: heap-allocated, will be deleted by JitDefineHelper after usage
613 defineInvocation << "}, \"" << name << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>("
614 << PrettyPrintAddr(&lm)
615 << "), reinterpret_cast<std::weak_ptr<ROOT::Detail::RDF::RJittedCustomColumn>*>("
616 << PrettyPrintAddr(MakeWeakOnHeap(jittedCustomColumn))
617 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr
618 << "), reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
619 << PrettyPrintAddr(upcastNodeOnHeap) << "));\n";
620
621 lm.ToJitExec(defineInvocation.str());
622 return jittedCustomColumn;
623}
624
625// Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
626// (see comments in the body for actual jitted code)
627std::string JitBuildAction(const ColumnNames_t &bl, std::shared_ptr<RDFDetail::RNodeBase> *prevNode,
628 const std::type_info &art, const std::type_info &at, void *rOnHeap, TTree *tree,
629 const unsigned int nSlots, const RDFInternal::RBookedCustomColumns &customCols,
630 RDataSource *ds, std::weak_ptr<RJittedAction> *jittedActionOnHeap)
631{
632 // retrieve type of result of the action as a string
633 auto actionResultTypeClass = TClass::GetClass(art);
634 if (!actionResultTypeClass) {
635 std::string exceptionText = "An error occurred while inferring the result type of an operation.";
636 throw std::runtime_error(exceptionText.c_str());
637 }
638 const auto actionResultTypeName = actionResultTypeClass->GetName();
639
640 // retrieve type of action as a string
641 auto actionTypeClass = TClass::GetClass(at);
642 if (!actionTypeClass) {
643 std::string exceptionText = "An error occurred while inferring the action type of the operation.";
644 throw std::runtime_error(exceptionText.c_str());
645 }
646 const auto actionTypeName = actionTypeClass->GetName();
647
648 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols); // deleted in jitted CallBuildAction
649 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
650
651 // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
652 // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
653 std::stringstream createAction_str;
654 createAction_str << "ROOT::Internal::RDF::CallBuildAction<" << actionTypeName;
655 const auto columnTypeNames = GetValidatedArgTypes(bl, customCols, tree, ds, actionTypeName, /*vector2rvec=*/true);
656 for (auto &colType : columnTypeNames)
657 createAction_str << ", " << colType;
658 // on Windows, to prefix the hexadecimal value of a pointer with '0x',
659 // one need to write: std::hex << std::showbase << (size_t)pointer
660 createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
661 << PrettyPrintAddr(prevNode) << "), {";
662 for (auto i = 0u; i < bl.size(); ++i) {
663 if (i != 0u)
664 createAction_str << ", ";
665 createAction_str << '"' << bl[i] << '"';
666 }
667 createAction_str << "}, " << nSlots << ", reinterpret_cast<" << actionResultTypeName << "*>("
668 << PrettyPrintAddr(rOnHeap)
669 << "), reinterpret_cast<std::weak_ptr<ROOT::Internal::RDF::RJittedAction>*>("
670 << PrettyPrintAddr(jittedActionOnHeap)
671 << "), reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << "));";
672 return createAction_str.str();
673}
674
/// Return true if at least one of `strings` is empty; an empty list yields false.
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](const std::string_view &s) { return s.empty(); });
}
683
684std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
685{
686 return ptr;
687}
688
689/// Given the desired number of columns and the user-provided list of columns:
690/// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
691/// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
692/// * replace column names from aliases by the actual column name
693/// Return the list of selected column names.
694ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
695 const ColumnNames_t &validCustomColumns, RDataSource *ds)
696{
697 const auto &defaultColumns = lm.GetDefaultColumnNames();
698 auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
699 const auto &validBranchNames = lm.GetBranchNames();
700 const auto unknownColumns = FindUnknownColumns(selectedColumns, validBranchNames, validCustomColumns,
701 ds ? ds->GetColumnNames() : ColumnNames_t{});
702
703 if (!unknownColumns.empty()) {
704 // throw
705 std::stringstream unknowns;
706 std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
707 for (auto &unknownColumn : unknownColumns) {
708 unknowns << delim << unknownColumn;
709 delim = ',';
710 }
711 throw std::runtime_error("Unknown column" + unknowns.str());
712 }
713
714 // Now we need to check within the aliases if some of the yet unknown names can be recovered
715 auto &aliasMap = lm.GetAliasMap();
716 auto aliasMapEnd = aliasMap.end();
717
718 for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
719 const auto &colName = selectedColumns[idx];
720 const auto aliasColumnNameIt = aliasMap.find(colName);
721 if (aliasMapEnd != aliasColumnNameIt) {
722 selectedColumns[idx] = aliasColumnNameIt->second;
723 }
724 }
725
726 return selectedColumns;
727}
728
729std::vector<std::string> GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedCustomColumns &customColumns,
730 TTree *tree, RDataSource *ds, const std::string &context,
731 bool vector2rvec)
732{
733 auto toCheckedArgType = [&](const std::string &c) {
735 customColumns.HasName(c) ? customColumns.GetColumns().at(c).get() : nullptr;
736 const auto colType = ColumnName2ColumnTypeName(c, tree, ds, customCol, vector2rvec);
737 if (colType.rfind("CLING_UNKNOWN_TYPE", 0) == 0) { // the interpreter does not know this type
738 const auto msg =
739 "The type of custom column \"" + c + "\" (" + colType.substr(19) +
740 ") is not known to the interpreter, but a just-in-time-compiled " + context +
741 " call requires this column. Make sure to create and load ROOT dictionaries for this column's class.";
742 throw std::runtime_error(msg);
743 }
744 return colType;
745 };
746 std::vector<std::string> colTypes;
747 colTypes.reserve(colNames.size());
748 std::transform(colNames.begin(), colNames.end(), std::back_inserter(colTypes), toCheckedArgType);
749 return colTypes;
750}
751
752/// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
753/// name of a column that must be defined via datasource. All elements of the returned vector are false if no
754/// data-source is present.
755std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
756{
757 const auto nColumns = requestedCols.size();
758 std::vector<bool> mustBeDefined(nColumns, false);
759 for (auto i = 0u; i < nColumns; ++i)
760 mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
761 return mustBeDefined;
762}
763
764} // namespace RDF
765} // namespace Internal
766} // namespace ROOT
#define c(i)
Definition: RSha256.hxx:101
static RooMathCoreReg dummy
#define R__ASSERT(e)
Definition: TError.h:96
char name[80]
Definition: TGX11.cxx:109
int type
Definition: TGX11.cxx:120
#define gInterpreter
Definition: TInterpreter.h:556
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
void ToJitExec(const std::string &) const
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame.
unsigned int GetNSlots() const
Encapsulates the columns defined by the user.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
bool HasName(std::string_view name) const
Check if the provided name is tracked in the names list.
const RCustomColumnBasePtrMap_t & GetColumns() const
Returns the list of the pointers to the defined columns.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2948
Small helper to keep current directory context.
Definition: TDirectory.h:47
A TFriendElement TF describes a TTree object TF in a file.
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned, this equals the full match + sub-pattern matches.
Definition: TPRegexp.cxx:339
Basic string class.
Definition: TString.h:131
A TTree represents a columnar dataset.
Definition: TTree.h:78
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5209
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:482
virtual TList * GetListOfFriends() const
Definition: TTree.h:484
TText * text
basic_string_view< char > string_view
RResultPtr< T > MakeResultPtr(const std::shared_ptr< T > &r, RLoopManager &df, std::shared_ptr< ROOT::Internal::RDF::RActionBase > actionPtr)
Create a RResultPtr and set its pointer to the corresponding RAction This overload is invoked by non-...
Definition: RResultPtr.hxx:346
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
void BookFilterJit(const std::shared_ptr< RJittedFilter > &jittedFilter, std::shared_ptr< RDFDetail::RNodeBase > *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree, RDataSource *ds)
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, RDataSource *ds, RCustomColumnBase *customColumn, bool vector2rvec)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:211
std::vector< std::string > GetValidatedArgTypes(const ColumnNames_t &colNames, const RBookedCustomColumns &customColumns, TTree *tree, RDataSource *ds, const std::string &context, bool vector2rvec)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
ColumnNames_t ConvertRegexToColumns(const RDFInternal::RBookedCustomColumns &customColumns, TTree *tree, ROOT::RDF::RDataSource *dataSource, std::string_view columnNameRegexp, std::string_view callerName)
std::string PrettyPrintAddr(const void *const addr)
std::shared_ptr< RJittedCustomColumn > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches, std::shared_ptr< RNodeBase > *upcastNodeOnHeap)
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
HeadNode_t CreateSnapshotRDF(const ColumnNames_t &validCols, std::string_view treeName, std::string_view fileName, bool isLazy, RLoopManager &loopManager, std::unique_ptr< RDFInternal::RActionBase > actionPtr)
std::string JitBuildAction(const ColumnNames_t &bl, std::shared_ptr< RDFDetail::RNodeBase > *prevNode, const std::type_info &art, const std::type_info &at, void *rOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds, std::weak_ptr< RJittedAction > *jittedActionOnHeap)
bool IsInternalColumn(std::string_view colName)
void InterpreterDeclare(const std::string &code)
Definition: RDFUtils.cxx:302
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validCustomColumns, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in requestedCols ...
void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Definition: StringConv.hxx:21
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
static constexpr double s
Definition: tree.py:1