Logo ROOT   6.16/01
Reference Guide
RDFInterfaceUtils.cxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 02/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
13#include <ROOT/RStringView.hxx>
14#include <ROOT/TSeq.hxx>
15#include <RtypesCore.h>
16#include <TDirectory.h>
17#include <TChain.h>
18#include <TClass.h>
19#include <TClassEdit.h>
20#include <TFriendElement.h>
21#include <TInterpreter.h>
22#include <TObject.h>
23#include <TRegexp.h>
24#include <TString.h>
25#include <TTree.h>
26
27#include <iosfwd>
28#include <stdexcept>
29#include <string>
30#include <typeinfo>
31
// Forward declarations of classes that live in namespaces ROOT::Detail::RDF and ROOT::RDF.
32namespace ROOT {
33namespace Detail {
34namespace RDF {
35class RCustomColumnBase;
36class RFilterBase;
37class RLoopManager;
38class RRangeBase;
39} // namespace RDF
40} // namespace Detail
41
42namespace RDF {
43class RDataSource;
44} // namespace RDF
45
46} // namespace ROOT
47
48namespace ROOT {
49namespace Internal {
50namespace RDF {
51
52// The set here is used as a registry, the real list, which keeps the order, is
53// the one in the vector
54class RActionBase;
55
56
// Create a new data-frame that reads tree `fullTreeName` from `fileName` and return a result pointer to it.
// A fresh RLoopManager is built from `validCols`, wrapped in an RInterface<RLoopManager> and given a TChain to
// which `fileName` has been added. The returned object is produced by MakeResultPtr from that interface,
// `loopManager` and `actionPtr` (ownership of `actionPtr` is moved into the call).
// If `isLazy` is false the result pointer is dereferenced before returning -- presumably this is what triggers
// the event loop that writes the snapshot (TODO confirm against the result-pointer implementation).
57HeadNode_t CreateSnaphotRDF(const ColumnNames_t &validCols,
58 const std::string &fullTreeName,
59 const std::string &fileName,
60 bool isLazy,
61 RLoopManager &loopManager,
62 std::unique_ptr<RDFInternal::RActionBase> actionPtr)
63{
64 // create new RDF
66 // Now we mimic a constructor for the RDataFrame. We cannot invoke it here
67 // since this would introduce a cyclic headers dependency.
68
69 // Keep these two statements separated to work-around an ABI incompatibility
70 // between clang (and thus cling) and gcc in the way std::forward is handled.
71 // See https://sft.its.cern.ch/jira/browse/ROOT-9236 for more detail.
72 auto rlm_ptr = std::make_shared<RLoopManager>(nullptr, validCols);
73 auto snapshotRDF = std::make_shared<RInterface<RLoopManager>>(rlm_ptr);
74 auto chain = std::make_shared<TChain>(fullTreeName.c_str());
 // NOTE(review): fileName is already a std::string, the temporary copy made here looks unnecessary
75 chain->Add(std::string(fileName).c_str());
76 snapshotRDF->fProxiedPtr->SetTree(chain);
77 auto snapshotRDFResPtr = MakeResultPtr(snapshotRDF, loopManager, std::move(actionPtr));
78
79 if (!isLazy) {
 // the value obtained by dereferencing is discarded: only the side effect of the access is wanted
80 *snapshotRDFResPtr;
81 }
82 return snapshotRDFResPtr;
83}
84
85std::string DemangleTypeIdName(const std::type_info &typeInfo)
86{
87 int dummy(0);
88 return TClassEdit::DemangleTypeIdName(typeInfo, dummy);
89}
90
92 std::string_view callerName)
93{
 // Collect the names of the columns that match `columnNameRegexp`. Candidates are examined in this order:
 // custom columns of `node`, top-level branches of the tree (if there is one), data-source columns (if there
 // is a data-source). An empty regular expression matches every candidate.
 // Throws std::runtime_error if no column is selected.
94 const auto theRegexSize = columnNameRegexp.size();
95 std::string theRegex(columnNameRegexp);
96
97 const auto isEmptyRegex = 0 == theRegexSize;
98 // This is to avoid cases where branches called b1, b2, b3 are all matched by expression "b"
99 if (theRegexSize > 0 && theRegex[0] != '^')
100 theRegex = "^" + theRegex;
 // NOTE(review): theRegexSize is still the size of the original expression. If '^' was prepended just above,
 // index theRegexSize - 1 designates the character before the last one, so an expression that already ends
 // with '$' (and did not start with '^') gets a second '$' appended.
101 if (theRegexSize > 0 && theRegex[theRegexSize - 1] != '$')
102 theRegex = theRegex + "$";
103
104 ColumnNames_t selectedColumns;
105 selectedColumns.reserve(32);
106
107 // Since we support gcc48 and it does not provide in its stl std::regex,
108 // we need to use TRegexp
109 TRegexp regexp(theRegex);
 // receives the length of the match from TRegexp::Index, the value is never read
110 int dummy;
111 for (auto &&branchName : node.fCustomColumns.GetNames()) {
112 if ((isEmptyRegex || -1 != regexp.Index(branchName.c_str(), &dummy)) &&
114 selectedColumns.emplace_back(branchName);
115 }
116 }
117
118 auto tree = node.fLoopManager->GetTree();
119 if (tree) {
120 auto branchNames = RDFInternal::GetTopLevelBranchNames(*tree);
121 for (auto &branchName : branchNames) {
122 if (isEmptyRegex || -1 != regexp.Index(branchName, &dummy)) {
123 selectedColumns.emplace_back(branchName);
124 }
125 }
126 }
127
128 if (node.fDataSource) {
129 auto &dsColNames = node.fDataSource->GetColumnNames();
130 for (auto &dsColName : dsColNames) {
 // data-source columns recognised by IsInternalColumn are never selected
131 if ((isEmptyRegex || -1 != regexp.Index(dsColName.c_str(), &dummy)) &&
132 !RDFInternal::IsInternalColumn(dsColName)) {
133 selectedColumns.emplace_back(dsColName);
134 }
135 }
136 }
137
138 if (selectedColumns.empty()) {
 // NOTE(review): `text` is initialised with the caller name, but both branches below assign to it instead
 // of appending, so the caller name is lost and the message thrown starts with ':'.
139 std::string text(callerName);
140 if (columnNameRegexp.empty()) {
141 text = ": there is no column available to match.";
142 } else {
143 text = ": regex \"" + std::string(columnNameRegexp) + "\" did not match any column.";
144 }
145 throw std::runtime_error(text);
146 }
147 return selectedColumns;
148}
149
150void GetTopLevelBranchNamesImpl(TTree &t, std::set<std::string> &bNamesReg, ColumnNames_t &bNames,
151 std::set<TTree *> &analysedTrees)
152{
153
154 if (!analysedTrees.insert(&t).second) {
155 return;
156 }
157
158 auto branches = t.GetListOfBranches();
159 if (branches) {
160 for (auto branchObj : *branches) {
161 auto name = branchObj->GetName();
162 if (bNamesReg.insert(name).second) {
163 bNames.emplace_back(name);
164 }
165 }
166 }
167
168 auto friendTrees = t.GetListOfFriends();
169
170 if (!friendTrees)
171 return;
172
173 for (auto friendTreeObj : *friendTrees) {
174 auto friendTree = ((TFriendElement *)friendTreeObj)->GetTree();
175 GetTopLevelBranchNamesImpl(*friendTree, bNamesReg, bNames, analysedTrees);
176 }
177}
178
179///////////////////////////////////////////////////////////////////////////////
180/// Get all the top-level branches names, including the ones of the friend trees
182{
 // bNamesSet is only a registry used by the helper to detect duplicates; bNames is the ordered list of
 // names that is returned. analysedTrees lets the helper skip trees it has already visited.
183 std::set<std::string> bNamesSet;
184 ColumnNames_t bNames;
185 std::set<TTree *> analysedTrees;
186 GetTopLevelBranchNamesImpl(t, bNamesSet, bNames, analysedTrees);
187 return bNames;
188}
189
/// Tell whether `var` has the shape of a C++ identifier: it must be non-empty, begin with an ASCII letter or an
/// underscore and contain only ASCII letters, digits and underscores. Reserved keywords are not rejected.
bool IsValidCppVarName(const std::string &var)
{
   const auto isLetterOrUnderscore = [](char ch) {
      return ch == '_' || ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
   };
   const auto isDigit = [](char ch) { return '0' <= ch && ch <= '9'; };

   // an identifier is never empty and never starts with a digit
   if (var.empty() || !isLetterOrUnderscore(var.front()))
      return false;

   // reject the name as soon as a character that cannot appear in an identifier is found
   for (auto it = var.begin(); it != var.end(); ++it) {
      if (!isLetterOrUnderscore(*it) && !isDigit(*it))
         return false;
   }
   return true;
}
211
212void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols,
213 const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &dataSourceColumns)
214{
215 const std::string definedColStr(definedCol);
216
217 if (!IsValidCppVarName(definedColStr)) {
218 const auto msg = "Cannot define column \"" + definedColStr + "\": not a valid C++ variable name.";
219 throw std::runtime_error(msg);
220 }
221
222 if (treePtr != nullptr) {
223 // check if definedCol is already present in TTree
224 const auto branch = treePtr->GetBranch(definedColStr.c_str());
225 if (branch != nullptr) {
226 const auto msg = "branch \"" + definedColStr + "\" already present in TTree";
227 throw std::runtime_error(msg);
228 }
229 }
230 // check if definedCol has already been `Define`d in the functional graph
231 if (std::find(customCols.begin(), customCols.end(), definedCol) != customCols.end()) {
232 const auto msg = "Redefinition of column \"" + definedColStr + "\"";
233 throw std::runtime_error(msg);
234 }
235
236 // Check if the definedCol is an alias
237 const auto aliasColNameIt = aliasMap.find(definedColStr);
238 if (aliasColNameIt != aliasMap.end()) {
239 const auto msg = "An alias with name " + definedColStr + " pointing to column " +
240 aliasColNameIt->second + " is already existing.";
241 throw std::runtime_error(msg);
242 }
243
244 // check if definedCol is already present in the DataSource (but has not yet been `Define`d)
245 if (!dataSourceColumns.empty()) {
246 if (std::find(dataSourceColumns.begin(), dataSourceColumns.end(), definedCol) != dataSourceColumns.end()) {
247 const auto msg = "Redefinition of column \"" + definedColStr + "\" already present in the data-source";
248 throw std::runtime_error(msg);
249 }
250 }
251}
252
/// Make sure that as many template parameters as column names have been specified.
/// \throws std::runtime_error if `nTemplateParams` differs from `nColumnNames`.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
{
   if (nTemplateParams == nColumnNames)
      return;

   const auto message = "The number of template parameters specified is " + std::to_string(nTemplateParams) +
                        " while " + std::to_string(nColumnNames) + " columns have been specified.";
   throw std::runtime_error(message);
}
264
265/// Choose between local column names or default column names, throw in case of errors.
266const ColumnNames_t
267SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
268{
269 if (names.empty()) {
270 // use default column names
271 if (defaultNames.size() < nRequiredNames)
272 throw std::runtime_error(
273 std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
274 " required but none were provided and the default list has size " + std::to_string(defaultNames.size()));
275 // return first nRequiredNames default column names
276 return ColumnNames_t(defaultNames.begin(), defaultNames.begin() + nRequiredNames);
277 } else {
278 // use column names provided by the user to this particular transformation/action
279 if (names.size() != nRequiredNames) {
280 auto msg = std::to_string(nRequiredNames) + " column name" + (nRequiredNames == 1 ? " is" : "s are") +
281 " required but " + std::to_string(names.size()) + (names.size() == 1 ? " was" : " were") +
282 " provided:";
283 for (const auto &name : names)
284 msg += " \"" + name + "\",";
285 msg.back() = '.';
286 throw std::runtime_error(msg);
287 }
288 return names;
289 }
290}
291
292ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns,
293 const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
294{
295 ColumnNames_t unknownColumns;
296 for (auto &column : requiredCols) {
297 const auto isBranch = std::find(datasetColumns.begin(), datasetColumns.end(), column) != datasetColumns.end();
298 if (isBranch)
299 continue;
300 const auto isCustomColumn = std::find(definedCols.begin(), definedCols.end(), column) != definedCols.end();
301 if (isCustomColumn)
302 continue;
303 const auto isDataSourceColumn =
304 std::find(dataSourceColumns.begin(), dataSourceColumns.end(), column) != dataSourceColumns.end();
305 if (isDataSourceColumn)
306 continue;
307 unknownColumns.emplace_back(column);
308 }
309 return unknownColumns;
310}
311
313{
 // True when colName is longer than 3 characters, starts with "rdf" or "tdf" and ends with '_'.
 // The size check comes first, so str[0..2] are only read when the view holds at least 4 characters.
314 const auto str = colName.data();
315 const auto goodPrefix = colName.size() > 3 && // has at least more characters than {r,t}df
316 ('r' == str[0] || 't' == str[0]) && // starts with r or t
317 0 == strncmp("df", str + 1, 2); // 2nd and 3rd letters are df
318 return goodPrefix && '_' == colName.back(); // also ends with '_'
319}
320
// Return the filter names reported by `loopManager`; simply forwards to its GetFiltersNames() method.
// `loopManager` is dereferenced without a null check.
321std::vector<std::string> GetFilterNames(const std::shared_ptr<RLoopManager> &loopManager)
322{
323 return loopManager->GetFiltersNames();
324}
325
/// Replace all the occurrences of a string by another string.
/// \param[in,out] s String that is modified in place.
/// \param[in] what Text to look for. If it is empty, `s` is left untouched.
/// \param[in] withWhat Text that takes the place of each occurrence of `what`.
/// \return The number of replacements performed.
///
/// Text inserted by a replacement is never searched again, so the function also terminates when `withWhat`
/// contains `what`.
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
{
   // An empty pattern is found at every position: without this guard the loop below would never end
   if (what.empty())
      return 0U;

   auto numReplacements = 0U;
   for (auto idx = s.find(what); idx != std::string::npos; idx = s.find(what, idx)) {
      s.replace(idx, what.size(), withWhat);
      idx += withWhat.size(); // resume the search right after the inserted text
      ++numReplacements;
   }
   return numReplacements;
}
338
339// Match expression against names of branches passed as parameter
340// Return vector of names of the branches used in the expression
341std::vector<std::string> FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches,
342 const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns,
343 const std::map<std::string, std::string> &aliasMap)
344{
345 // To help matching the regex
346 const std::string paddedExpr = " " + std::string(expression) + " ";
347 static const std::string regexBit("[^a-zA-Z0-9_]");
348 Ssiz_t matchedLen;
349
350 std::vector<std::string> usedBranches;
351
352 // Check which custom columns match
353 for (auto &brName : customColumns) {
354 std::string bNameRegexContent = regexBit + brName + regexBit;
355 TRegexp bNameRegex(bNameRegexContent.c_str());
356 if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
357 usedBranches.emplace_back(brName);
358 }
359 }
360
361 // Check which tree branches match
362 for (auto &brName : branches) {
363 // Replace "." with "\." for a correct match of sub-branches/leaves
364 auto escapedBrName = brName;
365 Replace(escapedBrName, std::string("."), std::string("\\."));
366 std::string bNameRegexContent = regexBit + escapedBrName + regexBit;
367 TRegexp bNameRegex(bNameRegexContent.c_str());
368 if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
369 usedBranches.emplace_back(brName);
370 }
371 }
372
373 // Check which data-source columns match
374 for (auto &col : dsColumns) {
375 std::string bNameRegexContent = regexBit + col + regexBit;
376 TRegexp bNameRegex(bNameRegexContent.c_str());
377 if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
378 // if not already found among the other columns
379 if (std::find(usedBranches.begin(), usedBranches.end(), col) == usedBranches.end())
380 usedBranches.emplace_back(col);
381 }
382 }
383
384 // Check which aliases match
385 for (auto &alias_colName : aliasMap) {
386 auto &alias = alias_colName.first;
387 std::string bNameRegexContent = regexBit + alias + regexBit;
388 TRegexp bNameRegex(bNameRegexContent.c_str());
389 if (-1 != bNameRegex.Index(paddedExpr.c_str(), &matchedLen)) {
390 // if not already found among the other columns
391 if (std::find(usedBranches.begin(), usedBranches.end(), alias) == usedBranches.end())
392 usedBranches.emplace_back(alias);
393 }
394 }
395
396 return usedBranches;
397}
398
399// TODO we should also replace other invalid chars, like '[],' and spaces
400std::vector<std::string> ReplaceDots(const ColumnNames_t &colNames)
401{
402 std::vector<std::string> dotlessNames = colNames;
403 for (auto &c : dotlessNames) {
404 const bool hasDot = c.find_first_of('.') != std::string::npos;
405 if (hasDot) {
406 std::replace(c.begin(), c.end(), '.', '_');
407 c.insert(0u, "__tdf_arg_");
408 }
409 }
410 return dotlessNames;
411}
412
413// TODO comment well -- there is a lot going on in this function in terms of side-effects
// Return the type names of the columns in `colNames`, one element per column that is kept, in the same order.
// Side effects on the arguments:
// * `expr`: each occurrence of a column name that contains dots is replaced by the corresponding element of
//   `varNames`
// * `colNames` and `varNames`: the entries of columns that turn out not to be used by `expr` are erased from
//   both vectors, which are walked in lockstep (same index = same column)
// `colNames` and `varNames` are expected to have the same size -- this is not checked here.
414std::vector<std::string> ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames,
415 const std::map<std::string, std::string> &aliasMap, TTree *tree,
416 RDataSource *ds, std::string &expr, unsigned int namespaceID,
417 const RDFInternal::RBookedCustomColumns &customCols)
418{
419 std::vector<std::string> colTypes;
420 colTypes.reserve(colNames.size());
421 const auto aliasMapEnd = aliasMap.end();
422
 // No increment in the loop header: the iterators are advanced either by erase() or at the end of the body
423 for (auto c = colNames.begin(), v = varNames.begin(); c != colNames.end();) {
424 const auto &colName = *c;
425
426 if (colName.find('.') != std::string::npos) {
427 // If the column name contains dots, replace its name in the expression with the corresponding varName
428 auto numRepl = Replace(expr, colName, *v);
429 if (numRepl == 0) {
430 // Discard this column: we could not replace it, although we matched it previously
431 // This is because it is a substring of a column we already replaced in the expression
432 // e.g. "a.b" is a substring column of "a.b.c"
 // erase() returns the iterator to the next element; colName must not be used after this point
433 c = colNames.erase(c);
434 v = varNames.erase(v);
435 continue;
436 }
437 } else {
438 // Column name with no dots: check the name is still there
439 // it might have only been there as part of a column name with dots, e.g. "a" inside "a.b.c"
 // the expression is padded so that a name at its beginning or end is also delimited by non-word characters
440 const auto paddedExpr = " " + expr + " ";
441 static const std::string noWordChars("[^a-zA-Z0-9_]");
442 const auto colNameRxBody = noWordChars + colName + noWordChars;
443 TRegexp colNameRegex(colNameRxBody.c_str());
444 Ssiz_t matchedLen;
445 const auto colStillThere = colNameRegex.Index(paddedExpr.c_str(), &matchedLen) != -1;
446 if (!colStillThere) {
447 c = colNames.erase(c);
448 v = varNames.erase(v);
449 continue;
450 }
451 }
452
453 // Replace the colName with the real one in case colName it's an alias
454 // The real name is used to get the type, but the variable name will still be colName
455 const auto aliasMapIt = aliasMap.find(colName);
456 const auto &realColName = aliasMapEnd == aliasMapIt ? colName : aliasMapIt->second;
457 // The map is a const reference, so no operator[]
 // NOTE(review): the comment above looks stale, operator[] is applied to the result of GetColumns() below --
 // presumably GetColumns() returns the map by value, TODO confirm
458 const auto isCustomCol = customCols.HasName(realColName);
459 const auto customColID = isCustomCol ? customCols.GetColumns()[realColName]->GetID() : 0;
460 const auto colTypeName =
461 ColumnName2ColumnTypeName(realColName, namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
462 colTypes.emplace_back(colTypeName);
463 ++c, ++v;
464 }
465
466 return colTypes;
467}
468
469// Jit expression "in the vacuum", throw if cling exits with an error
470// This is to make sure that column names, types and expression string are proper C++
471void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames,
472 const std::vector<std::string> &colTypes, bool hasReturnStmt)
473{
474 R__ASSERT(colNames.size() == colTypes.size());
475
476 static unsigned int iNs = 0U;
477 std::stringstream dummyDecl;
478 dummyDecl << "namespace __tdf_" << std::to_string(iNs++) << "{ auto tdf_f = []() {";
479
480 for (auto col = colNames.begin(), type = colTypes.begin(); col != colNames.end(); ++col, ++type) {
481 dummyDecl << *type << " " << *col << ";\n";
482 }
483
484 // Now that branches are declared as variables, put the body of the
485 // lambda in dummyDecl and close scopes of f and namespace __tdf_N
486 if (hasReturnStmt)
487 dummyDecl << expression << "\n;};}";
488 else
489 dummyDecl << "return " << expression << "\n;};}";
490
491 // Try to declare the dummy lambda, error out if it does not compile
492 if (!gInterpreter->Declare(dummyDecl.str().c_str())) {
493 auto msg =
494 "Cannot interpret the following expression:\n" + std::string(expression) + "\n\nMake sure it is valid C++.";
495 throw std::runtime_error(msg);
496 }
497}
498
499std::string
500BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
501{
502 R__ASSERT(vars.size() == varTypes.size());
503
504 std::stringstream ss;
505 ss << "[](";
506 for (auto i = 0u; i < vars.size(); ++i) {
507 // We pass by reference to avoid expensive copies
508 // It can't be const reference in general, as users might want/need to call non-const methods on the values
509 ss << varTypes[i] << "& " << vars[i] << ", ";
510 }
511 if (!vars.empty())
512 ss.seekp(-2, ss.cur);
513
514 if (hasReturnStmt)
515 ss << "){\n" << expr << "\n}";
516 else
517 ss << "){return " << expr << "\n;}";
518
519 return ss.str();
520}
521
/// Return the numeric value of `addr` as hexadecimal text with base prefix, e.g. "0x1234".
/// The pointer is converted to an integer before printing, which keeps the notation the same on Windows.
/// A null pointer is printed as "0".
std::string PrettyPrintAddr(const void *const addr)
{
   const auto addrAsInteger = reinterpret_cast<size_t>(addr);

   std::stringstream hexFormatter;
   hexFormatter.setf(std::ios_base::hex, std::ios_base::basefield);
   hexFormatter.setf(std::ios_base::showbase);
   hexFormatter << addrAsInteger;
   return hexFormatter.str();
}
529
530// Jit a string filter expression and jit-and-call this->Filter with the appropriate arguments
531// Nothing is returned: the generated call to JitFilterHelper is handed to the loop manager through ToJit
// Steps: find the columns used by `expression`, derive variable names and type names for them, check that the
// expression is accepted by the interpreter, build the lambda and finally the code that invokes
// ROOT::Internal::RDF::JitFilterHelper. Pointers are embedded in the generated code as hexadecimal literals.
// NOTE(review): `branches`, `customCols`, `tree` and `columnsOnHeap` are used below but their declarations are
// not part of the lines visible here.
532
533void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name,
534 std::string_view expression, const std::map<std::string, std::string> &aliasMap,
536 RDataSource *ds, unsigned int namespaceID)
537{
 // empty list of data-source columns if there is no data-source
538 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
539
540 // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
541 auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
542 auto varNames = ReplaceDots(usedBranches);
 // modified in place by ColumnTypesAsString: dotted column names are replaced by the variable names
543 auto dotlessExpr = std::string(expression);
544 const auto usedColTypes =
545 ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
546
 // the word "return" must be delimited by non-word characters on both sides to be detected
547 TRegexp re("[^a-zA-Z0-9_]return[^a-zA-Z0-9_]");
548 Ssiz_t matchedLen;
549 const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
550
 // throws if the expression cannot be declared to the interpreter
551 TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
552
553 const auto filterLambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
554
555 const auto jittedFilterAddr = PrettyPrintAddr(jittedFilter);
556 const auto prevNodeAddr = PrettyPrintAddr(prevNodeOnHeap);
557
558 // columnsOnHeap is deleted by the jitted call to JitFilterHelper
560 const auto columnsOnHeapAddr = PrettyPrintAddr(columnsOnHeap);
561
562 // Produce code snippet that creates the filter and registers it with the corresponding RJittedFilter
563 // Windows requires std::hex << std::showbase << (size_t)pointer to produce notation "0x1234"
564 std::stringstream filterInvocation;
565 filterInvocation << "ROOT::Internal::RDF::JitFilterHelper(" << filterLambda << ", {";
566 for (const auto &brName : usedBranches) {
567 // Here we selectively replace the brName with the real column name if it's necessary.
568 const auto aliasMapIt = aliasMap.find(brName);
569 auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
570 filterInvocation << "\"" << realBrName << "\", ";
571 }
572 if (!usedBranches.empty())
573 filterInvocation.seekp(-2, filterInvocation.cur); // remove the last ",
574 filterInvocation << "}, \"" << name << "\", "
575 << "reinterpret_cast<ROOT::Detail::RDF::RJittedFilter*>(" << jittedFilterAddr << "), "
576 << "reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>(" << prevNodeAddr << "),"
577 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << columnsOnHeapAddr << ")"
578 << ");";
579
580 jittedFilter->GetLoopManagerUnchecked()->ToJit(filterInvocation.str());
581}
582
583// Jit a Define call
// Steps: find the columns used by `expression`, derive variable names and type names for them, check that the
// expression is accepted by the interpreter, declare the lambda and an alias for its return type in namespace
// __tdf<namespaceID>, then hand the code that invokes ROOT::Internal::RDF::JitDefineHelper to the loop manager.
// NOTE(review): `name`, `expression`, `lm`, `ds`, `customCols` and `branches` are used below but their
// declarations are not part of the lines visible here.
585 const std::shared_ptr<RJittedCustomColumn> &jittedCustomColumn,
587{
588 const auto &aliasMap = lm.GetAliasMap();
589 auto *const tree = lm.GetTree();
590 const auto namespaceID = lm.GetID();
 // empty list of data-source columns if there is no data-source
591 const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
592
593 // not const because `ColumnTypesAsStrings` might delete redundant matches and replace variable names
594 auto usedBranches = FindUsedColumnNames(expression, branches, customCols.GetNames(), dsColumns, aliasMap);
595 auto varNames = ReplaceDots(usedBranches);
 // modified in place by ColumnTypesAsString: dotted column names are replaced by the variable names
596 auto dotlessExpr = std::string(expression);
597 const auto usedColTypes =
598 ColumnTypesAsString(usedBranches, varNames, aliasMap, tree, ds, dotlessExpr, namespaceID, customCols);
599
 // the word "return" must be delimited by non-word characters on both sides to be detected
600 TRegexp re("[^a-zA-Z0-9_]return[^a-zA-Z0-9_]");
601 Ssiz_t matchedLen;
602 const bool hasReturnStmt = re.Index(dotlessExpr, &matchedLen) != -1;
603
 // throws if the expression cannot be declared to the interpreter
604 TryToJitExpression(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
605
606 const auto definelambda = BuildLambdaString(dotlessExpr, varNames, usedColTypes, hasReturnStmt);
607 const auto customColID = std::to_string(jittedCustomColumn->GetID());
608 const auto lambdaName = "eval_" + std::string(name) + customColID;
609 const auto ns = "__tdf" + std::to_string(namespaceID);
610
 // heap copy whose address is embedded in the generated code -- presumably released by the jitted
 // JitDefineHelper call, TODO confirm
611 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols);
612 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
613
614 // Declare the lambda variable and an alias for the type of the defined column in namespace __tdf
615 // This assumes that a given variable is Define'd once per RDataFrame -- we might want to relax this requirement
616 // to let python users execute a Define cell multiple times
617 const auto defineDeclaration =
618 "namespace " + ns + " { auto " + lambdaName + " = " + definelambda + ";\n" + "using " + std::string(name) +
619 customColID + "_type = typename ROOT::TypeTraits::CallableTraits<decltype(" + lambdaName + " )>::ret_type; }\n";
 // NOTE(review): the value returned by Declare is not checked here
620 gInterpreter->Declare(defineDeclaration.c_str());
621
622 std::stringstream defineInvocation;
623 defineInvocation << "ROOT::Internal::RDF::JitDefineHelper(" << definelambda << ", {";
 // NOTE(review): brName is a copy of each element here, a const reference would do
624 for (auto brName : usedBranches) {
625 // Here we selectively replace the brName with the real column name if it's necessary.
626 auto aliasMapIt = aliasMap.find(brName);
627 auto &realBrName = aliasMapIt == aliasMap.end() ? brName : aliasMapIt->second;
628 defineInvocation << "\"" << realBrName << "\", ";
629 }
630 if (!usedBranches.empty())
631 defineInvocation.seekp(-2, defineInvocation.cur); // remove the last ",
632 defineInvocation << "}, \"" << name << "\", reinterpret_cast<ROOT::Detail::RDF::RLoopManager*>("
633 << PrettyPrintAddr(&lm) << "), *reinterpret_cast<ROOT::Detail::RDF::RJittedCustomColumn*>("
634 << PrettyPrintAddr(jittedCustomColumn.get()) << "),"
635 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
636 << ");";
637
638 lm.ToJit(defineInvocation.str());
639}
640
641// Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
642// (see comments in the body for actual jitted code)
643std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at,
644 void *rOnHeap, TTree *tree, const unsigned int nSlots,
645 const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds,
646 std::shared_ptr<RJittedAction> *jittedActionOnHeap, unsigned int namespaceID)
647{
648 auto nBranches = bl.size();
649
650 // retrieve branch type names as strings
651 std::vector<std::string> columnTypeNames(nBranches);
652 for (auto i = 0u; i < nBranches; ++i) {
653 const auto isCustomCol = customCols.HasName(bl[i]);
654 const auto customColID = isCustomCol ? customCols.GetColumns()[bl[i]]->GetID() : 0;
655 const auto columnTypeName =
656 ColumnName2ColumnTypeName(bl[i], namespaceID, tree, ds, isCustomCol, /*vector2rvec=*/true, customColID);
657 if (columnTypeName.empty()) {
658 std::string exceptionText = "The type of column ";
659 exceptionText += bl[i];
660 exceptionText += " could not be guessed. Please specify one.";
661 throw std::runtime_error(exceptionText.c_str());
662 }
663 columnTypeNames[i] = columnTypeName;
664 }
665
666 // retrieve type of result of the action as a string
667 auto actionResultTypeClass = TClass::GetClass(art);
668 if (!actionResultTypeClass) {
669 std::string exceptionText = "An error occurred while inferring the result type of an operation.";
670 throw std::runtime_error(exceptionText.c_str());
671 }
672 const auto actionResultTypeName = actionResultTypeClass->GetName();
673
674 // retrieve type of action as a string
675 auto actionTypeClass = TClass::GetClass(at);
676 if (!actionTypeClass) {
677 std::string exceptionText = "An error occurred while inferring the action type of the operation.";
678 throw std::runtime_error(exceptionText.c_str());
679 }
680 const auto actionTypeName = actionTypeClass->GetName();
681
682 auto customColumnsCopy = new RDFInternal::RBookedCustomColumns(customCols); // deleted in jitted CallBuildAction
683 auto customColumnsAddr = PrettyPrintAddr(customColumnsCopy);
684
685 // Build a call to CallBuildAction with the appropriate argument. When run through the interpreter, this code will
686 // just-in-time create an RAction object and it will assign it to its corresponding RJittedAction.
687 std::stringstream createAction_str;
688 createAction_str << "ROOT::Internal::RDF::CallBuildAction"
689 << "<" << actionTypeName;
690 for (auto &colType : columnTypeNames)
691 createAction_str << ", " << colType;
692 // on Windows, to prefix the hexadecimal value of a pointer with '0x',
693 // one need to write: std::hex << std::showbase << (size_t)pointer
694 createAction_str << ">(reinterpret_cast<std::shared_ptr<ROOT::Detail::RDF::RNodeBase>*>("
695 << PrettyPrintAddr(prevNode) << "), {";
696 for (auto i = 0u; i < bl.size(); ++i) {
697 if (i != 0u)
698 createAction_str << ", ";
699 createAction_str << '"' << bl[i] << '"';
700 }
701 createAction_str << "}, " << std::dec << std::noshowbase << nSlots << ", reinterpret_cast<" << actionResultTypeName
702 << "*>(" << PrettyPrintAddr(rOnHeap) << ")"
703 << ", reinterpret_cast<std::shared_ptr<ROOT::Internal::RDF::RJittedAction>*>("
704 << PrettyPrintAddr(jittedActionOnHeap) << "),"
705 << "reinterpret_cast<ROOT::Internal::RDF::RBookedCustomColumns*>(" << customColumnsAddr << ")"
706 << ");";
707 return createAction_str.str();
708}
709
/// Return true if at least one element of `strings` is empty, false otherwise (also when `strings` has no
/// elements at all).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   // advance to the first empty element, if there is one
   auto it = strings.begin();
   while (it != strings.end() && !it->empty())
      ++it;
   return it != strings.end();
}
718
// Return `ptr` unchanged.
// NOTE(review): presumably callers pass a shared_ptr to a type derived from RNodeBase, so that the conversion to
// std::shared_ptr<RNodeBase> takes place when the argument is bound -- confirm against the callers.
719std::shared_ptr<RNodeBase> UpcastNode(std::shared_ptr<RNodeBase> ptr)
720{
721 return ptr;
722}
723
724/// Given the desired number of columns and the user-provided list of columns:
725/// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
726/// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
727/// Return the list of selected column names.
728ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
729 const ColumnNames_t &validCustomColumns, RDataSource *ds)
730{
731 const auto &defaultColumns = lm.GetDefaultColumnNames();
732 auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
733 const auto &validBranchNames = lm.GetBranchNames();
734 const auto unknownColumns = FindUnknownColumns(selectedColumns, validBranchNames, validCustomColumns,
735 ds ? ds->GetColumnNames() : ColumnNames_t{});
736
737 if (!unknownColumns.empty()) {
738 // throw
739 std::stringstream unknowns;
740 std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
741 for (auto &unknownColumn : unknownColumns) {
742 unknowns << delim << unknownColumn;
743 delim = ',';
744 }
745 throw std::runtime_error("Unknown column" + unknowns.str());
746 }
747
748 // Now we need to check within the aliases if some of the yet unknown names can be recovered
749 auto &aliasMap = lm.GetAliasMap();
750 auto aliasMapEnd = aliasMap.end();
751
752 for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
753 const auto &colName = selectedColumns[idx];
754 const auto aliasColumnNameIt = aliasMap.find(colName);
755 if (aliasMapEnd != aliasColumnNameIt) {
756 selectedColumns[idx] = aliasColumnNameIt->second;
757 }
758 }
759
760 return selectedColumns;
761}
762
763/// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
764/// name of a column that must be defined via datasource. All elements of the returned vector are false if no
765/// data-source is present.
766std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
767{
768 const auto nColumns = requestedCols.size();
769 std::vector<bool> mustBeDefined(nColumns, false);
770 for (auto i = 0u; i < nColumns; ++i)
771 mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
772 return mustBeDefined;
773}
774
775} // namespace RDF
776} // namespace Internal
777} // namespace ROOT
SVector< double, 2 > v
Definition: Dict.h:5
#define c(i)
Definition: RSha256.hxx:101
static RooMathCoreReg dummy
int Ssiz_t
Definition: RtypesCore.h:63
#define R__ASSERT(e)
Definition: TError.h:96
int type
Definition: TGX11.cxx:120
#define gInterpreter
Definition: TInterpreter.h:538
A wrapper around a concrete RFilter, which forwards all calls to it RJittedFilter is the type of the ...
The head node of a RDF computation graph.
const std::map< std::string, std::string > & GetAliasMap() const
void ToJit(const std::string &s)
const ColumnNames_t & GetBranchNames()
Return all valid TTree::Branch names (caching results for subsequent calls).
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the RDataFrame.
virtual RLoopManager * GetLoopManagerUnchecked()
Definition: RNodeBase.hxx:64
Encapsulates the columns defined by the user.
bool HasName(std::string name) const
Check if the provided name is tracked in the names list.
ColumnNames_t GetNames() const
Returns the list of the names of the defined columns.
RCustomColumnBasePtrMap_t GetColumns() const
Returns the list of the pointers to the defined columns.
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
The public interface to the RDataFrame federation of classes.
Definition: RInterface.hxx:87
RDataSource * fDataSource
Non-owning pointer to a data-source object. Null if no data-source. RLoopManager has ownership of the...
Definition: RInterface.hxx:113
RDFInternal::RBookedCustomColumns fCustomColumns
Contains the custom columns defined up to this node.
Definition: RInterface.hxx:116
RLoopManager * fLoopManager
Definition: RInterface.hxx:111
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2885
A TFriendElement TF describes a TTree object TF in a file.
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
A TTree object has a header with a name and a title.
Definition: TTree.h:71
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition: TTree.cxx:5051
virtual TObjArray * GetListOfBranches()
Definition: TTree.h:427
virtual TList * GetListOfFriends() const
Definition: TTree.h:429
TText * text
RResultPtr< T > MakeResultPtr(const std::shared_ptr< T > &r, RLoopManager &df, std::shared_ptr< ROOT::Internal::RDF::RActionBase > actionPtr)
Create a RResultPtr and set its pointer to the corresponding RAction This overload is invoked by non-...
Definition: RResultPtr.hxx:333
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
std::string BuildLambdaString(const std::string &expr, const ColumnNames_t &vars, const ColumnNames_t &varTypes, bool hasReturnStmt)
std::vector< std::string > ReplaceDots(const ColumnNames_t &colNames)
bool IsValidCppVarName(const std::string &var)
ColumnNames_t ConvertRegexToColumns(const ROOT::RDF::RNode &node, std::string_view columnNameRegexp, std::string_view callerName)
unsigned int Replace(std::string &s, const std::string what, const std::string withWhat)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string PrettyPrintAddr(const void *const addr)
ColumnNames_t GetTopLevelBranchNames(TTree &t)
Get all the top-level branches names, including the ones of the friend trees.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, const ColumnNames_t &datasetColumns, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
std::vector< std::string > FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches, const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns, const std::map< std::string, std::string > &aliasMap)
std::string JitBuildAction(const ColumnNames_t &bl, void *prevNode, const std::type_info &art, const std::type_info &at, void *rOnHeap, TTree *tree, const unsigned int nSlots, const RDFInternal::RBookedCustomColumns &customCols, RDataSource *ds, std::shared_ptr< RJittedAction > *jittedActionOnHeap, unsigned int namespaceID)
HeadNode_t CreateSnaphotRDF(const ColumnNames_t &validCols, const std::string &fullTreeName, const std::string &fileName, bool isLazy, RLoopManager &loopManager, std::unique_ptr< RDFInternal::RActionBase > actionPtr)
bool IsInternalColumn(std::string_view colName)
ColumnNames_t GetValidatedColumnNames(RLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validCustomColumns, RDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
void GetTopLevelBranchNamesImpl(TTree &t, std::set< std::string > &bNamesReg, ColumnNames_t &bNames, std::set< TTree * > &analysedTrees)
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
void TryToJitExpression(const std::string &expression, const ColumnNames_t &colNames, const std::vector< std::string > &colTypes, bool hasReturnStmt)
std::string ColumnName2ColumnTypeName(const std::string &colName, unsigned int namespaceID, TTree *tree, RDataSource *ds, bool isCustomColumn, bool vector2tvec, unsigned int customColID)
Return a string containing the type of the given branch.
Definition: RDFUtils.cxx:186
void BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const std::shared_ptr< RJittedCustomColumn > &jittedCustomColumn, const RDFInternal::RBookedCustomColumns &customCols, const ColumnNames_t &branches)
void BookFilterJit(RJittedFilter *jittedFilter, void *prevNodeOnHeap, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const RDFInternal::RBookedCustomColumns &customCols, TTree *tree, RDataSource *ds, unsigned int namespaceID)
std::vector< std::string > ColumnTypesAsString(ColumnNames_t &colNames, ColumnNames_t &varNames, const std::map< std::string, std::string > &aliasMap, TTree *tree, RDataSource *ds, std::string &expr, unsigned int namespaceID, const RDFInternal::RBookedCustomColumns &customCols)
void CheckCustomColumn(std::string_view definedCol, TTree *treePtr, const ColumnNames_t &customCols, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &dataSourceColumns)
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
ROOT::Detail::RDF::ColumnNames_t ColumnNames_t
Definition: RDataFrame.cxx:790
char * DemangleTypeIdName(const std::type_info &ti, int &errorCode)
Demangle in a portable way the type id name.
static constexpr double s
static constexpr double ns
basic_string_view< char > string_view
Definition: RStringView.hxx:35
Definition: tree.py:1