Logo ROOT   6.12/07
Reference Guide
TDFInterface.cxx
Go to the documentation of this file.
1 // Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2 
3 /*************************************************************************
4  * Copyright (C) 1995-2016, Rene Brun and Fons Rademakers. *
5  * All rights reserved. *
6  * *
7  * For the licensing terms see $ROOTSYS/LICENSE. *
8  * For the list of contributors see $ROOTSYS/README/CREDITS. *
9  *************************************************************************/
10 
11 #include "TClass.h"
12 #include "TRegexp.h"
13 
14 #include "ROOT/TDFInterface.hxx"
15 #include "ROOT/TSeq.hxx"
16 
17 #include <vector>
18 #include <string>
19 using namespace ROOT::Experimental::TDF;
20 using namespace ROOT::Internal::TDF;
21 using namespace ROOT::Detail::TDF;
22 
23 namespace ROOT {
24 namespace Experimental {
25 namespace TDF {
26 // Explicit instantiation definitions of TInterface for the two node types named below (labelled "extern templates" upstream; presumably paired with `extern template` declarations in the header -- TODO confirm)
27 template class TInterface<TLoopManager>;
28 template class TInterface<TFilterBase>;
29 }
30 }
31 
32 namespace Internal {
33 namespace TDF {
34 // Match expression against names of branches passed as parameter
35 // Return vector of names of the branches used in the expression
36 std::vector<std::string> FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches,
37  const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns,
38  const std::map<std::string, std::string> &aliasMap)
39 {
40  // To help matching the regex
41  const std::string paddedExpr = " " + std::string(expression) + " ";
42  int paddedExprLen = paddedExpr.size();
43  static const std::string regexBit("[^a-zA-Z0-9_]");
44 
45  std::vector<std::string> usedBranches;
46 
47  // Check which custom columns match
48  for (auto &brName : customColumns) {
49  std::string bNameRegexContent = regexBit + brName + regexBit;
50  TRegexp bNameRegex(bNameRegexContent.c_str());
51  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &paddedExprLen)) {
52  usedBranches.emplace_back(brName);
53  }
54  }
55 
56  // Check which tree branches match
57  for (auto &brName : branches) {
58  std::string bNameRegexContent = regexBit + brName + regexBit;
59  TRegexp bNameRegex(bNameRegexContent.c_str());
60  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &paddedExprLen)) {
61  usedBranches.emplace_back(brName);
62  }
63  }
64 
65  // Check which data-source columns match
66  for (auto &col : dsColumns) {
67  std::string bNameRegexContent = regexBit + col + regexBit;
68  TRegexp bNameRegex(bNameRegexContent.c_str());
69  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &paddedExprLen)) {
70  // if not already found among the other columns
71  if (std::find(usedBranches.begin(), usedBranches.end(), col) == usedBranches.end())
72  usedBranches.emplace_back(col);
73  }
74  }
75 
76  // Check which aliases match
77  for (auto &alias_colName : aliasMap) {
78  auto &alias = alias_colName.first;
79  std::string bNameRegexContent = regexBit + alias + regexBit;
80  TRegexp bNameRegex(bNameRegexContent.c_str());
81  if (-1 != bNameRegex.Index(paddedExpr.c_str(), &paddedExprLen)) {
82  // if not already found among the other columns
83  if (std::find(usedBranches.begin(), usedBranches.end(), alias) == usedBranches.end())
84  usedBranches.emplace_back(alias);
85  }
86  }
87 
88  return usedBranches;
89 }
90 
91 // Jit a string filter or a string temporary column, call this->Define or this->Filter as needed
92 // Return pointer to the new functional chain node returned by the call, cast to Long_t
93 Long_t JitTransformation(void *thisPtr, std::string_view methodName, std::string_view interfaceTypeName,
95  const std::map<std::string, std::string> &aliasMap, const ColumnNames_t &branches,
96  const std::vector<std::string> &customColumns,
97  const std::map<std::string, TmpBranchBasePtr_t> &tmpBookedBranches, TTree *tree,
98  std::string_view returnTypeName, TDataSource *ds)
99 {
100  const auto &dsColumns = ds ? ds->GetColumnNames() : ColumnNames_t{};
101  auto usedBranches = FindUsedColumnNames(expression, branches, customColumns, dsColumns, aliasMap);
102  auto exprNeedsVariables = !usedBranches.empty();
103 
104  // Move to the preparation of the jitting
105  // We put all of the jitted entities in function f in namespace __tdf_N, where N is a monotonically increasing index
106  // and then try to declare that function to make sure column names, types and expression are proper C++
107  std::vector<std::string> usedBranchesTypes;
108  static unsigned int iNs = 0U;
109  std::stringstream dummyDecl;
110  dummyDecl << "namespace __tdf_" << std::to_string(iNs++) << "{ auto __tdf_lambda = []() {";
111 
112  // Declare variables with the same name as the column used by this transformation
113  auto aliasMapEnd = aliasMap.end();
114  if (exprNeedsVariables) {
115  for (auto &brName : usedBranches) {
116  // Here we replace on the fly the brName with the real one in case brName it's an alias
117  // This is then used to get the type. The variable name will be brName;
118  auto aliasMapIt = aliasMap.find(brName);
119  auto &realBrName = aliasMapEnd == aliasMapIt ? brName : aliasMapIt->second;
120  // The map is a const reference, so no operator[]
121  auto tmpBrIt = tmpBookedBranches.find(realBrName);
122  auto tmpBr = tmpBrIt == tmpBookedBranches.end() ? nullptr : tmpBrIt->second.get();
123  auto brTypeName = ColumnName2ColumnTypeName(realBrName, tree, tmpBr, ds);
124  dummyDecl << brTypeName << " " << brName << ";\n";
125  usedBranchesTypes.emplace_back(brTypeName);
126  }
127  }
128 
129  TRegexp re("[^a-zA-Z0-9_]return[^a-zA-Z0-9_]");
130  int exprSize = expression.size();
131  bool hasReturnStmt = re.Index(std::string(expression), &exprSize) != -1;
132 
133  // Now that branches are declared as variables, put the body of the
134  // lambda in dummyDecl and close scopes of f and namespace __tdf_N
135  if (hasReturnStmt)
136  dummyDecl << expression << "\n;};}";
137  else
138  dummyDecl << "return " << expression << "\n;};}";
139 
140  // Try to declare the dummy lambda, error out if it does not compile
141  if (!gInterpreter->Declare(dummyDecl.str().c_str())) {
142  auto msg =
143  "Cannot interpret the following expression:\n" + std::string(expression) + "\n\nMake sure it is valid C++.";
144  throw std::runtime_error(msg);
145  }
146 
147  // Now we build the lambda and we invoke the method with it in the jitted world
148  std::stringstream ss;
149  ss << "[](";
150  for (unsigned int i = 0; i < usedBranchesTypes.size(); ++i) {
151  // We pass by reference to avoid expensive copies
152  // It can't be const reference in general, as users might want/need to call non-const methods on the values
153  // In the special case of arguments of type `TArrayBranch`, it *has* to be a const ref as we will pass in
154  // temporaries converted from TTreeReaderArrays.
155  if (usedBranchesTypes[i].find("ROOT::Experimental::TDF::TArrayBranch<") == 0u)
156  ss << "const ";
157  // Here we do not replace anything: the name of the parameters of the lambda does not need to be the real
158  // column name, it must be an alias to compile.
159  ss << usedBranchesTypes[i] << "& " << usedBranches[i] << ", ";
160  }
161  if (!usedBranchesTypes.empty())
162  ss.seekp(-2, ss.cur);
163 
164  if (hasReturnStmt)
165  ss << "){\n" << expression << "\n}";
166  else
167  ss << "){return " << expression << "\n;}";
168 
169  auto filterLambda = ss.str();
170 
171  // The TInterface type to convert the result to. For example, Filter returns a TInterface<TFilter<F,P>> but when
172  // returning it from a jitted call we need to convert it to TInterface<TFilterBase> as we are missing information
173  // on types F and P at compile time.
174  const auto targetTypeName = "ROOT::Experimental::TDF::TInterface<" + std::string(returnTypeName) + ">";
175 
176  // Here we have two cases: filter and column
177  ss.str("");
178  ss << targetTypeName << "(((" << interfaceTypeName << "*)" << thisPtr << ")->" << methodName << "(";
179  if (methodName == "Define") {
180  ss << "\"" << name << "\", ";
181  }
182  ss << filterLambda << ", {";
183  for (auto brName : usedBranches) {
184  // Here we selectively replace the brName with the real column name if it's necessary.
185  auto aliasMapIt = aliasMap.find(brName);
186  auto &realBrName = aliasMapEnd == aliasMapIt ? brName : aliasMapIt->second;
187  ss << "\"" << realBrName << "\", ";
188  }
189  if (exprNeedsVariables)
190  ss.seekp(-2, ss.cur); // remove the last ",
191  ss << "}";
192 
193  if (methodName == "Filter") {
194  ss << ", \"" << name << "\"";
195  }
196 
197  ss << "));";
198 
199  TInterpreter::EErrorCode interpErrCode;
200  auto retVal = gInterpreter->Calc(ss.str().c_str(), &interpErrCode);
201  if (TInterpreter::EErrorCode::kNoError != interpErrCode || !retVal) {
202  std::string msg = "Cannot interpret the invocation to " + std::string(methodName) + ": ";
203  msg += ss.str();
204  if (TInterpreter::EErrorCode::kNoError != interpErrCode) {
205  msg += "\nInterpreter error code is " + std::to_string(interpErrCode) + ".";
206  }
207  throw std::runtime_error(msg);
208  }
209  return retVal;
210 }
211 
212 // Jit and call something equivalent to "this->BuildAndBook<BranchTypes...>(params...)"
213 // (see comments in the body for actual jitted code)
214 std::string JitBuildAndBook(const ColumnNames_t &bl, const std::string &prevNodeTypename, void *prevNode,
215  const std::type_info &art, const std::type_info &at, const void *rOnHeap, TTree *tree,
216  const unsigned int nSlots, const std::map<std::string, TmpBranchBasePtr_t> &customColumns,
217  TDataSource *ds, const std::shared_ptr<TActionBase *> *const actionPtrPtr)
218 {
219  auto nBranches = bl.size();
220 
221  // retrieve pointers to temporary columns (null if the column is not temporary)
222  std::vector<TCustomColumnBase *> tmpBranchPtrs(nBranches, nullptr);
223  for (auto i = 0u; i < nBranches; ++i) {
224  auto tmpBranchIt = customColumns.find(bl[i]);
225  if (tmpBranchIt != customColumns.end())
226  tmpBranchPtrs[i] = tmpBranchIt->second.get();
227  }
228 
229  // retrieve branch type names as strings
230  std::vector<std::string> columnTypeNames(nBranches);
231  for (auto i = 0u; i < nBranches; ++i) {
232  const auto columnTypeName = ColumnName2ColumnTypeName(bl[i], tree, tmpBranchPtrs[i], ds);
233  if (columnTypeName.empty()) {
234  std::string exceptionText = "The type of column ";
235  exceptionText += bl[i];
236  exceptionText += " could not be guessed. Please specify one.";
237  throw std::runtime_error(exceptionText.c_str());
238  }
239  columnTypeNames[i] = columnTypeName;
240  }
241 
242  // retrieve type of result of the action as a string
243  auto actionResultTypeClass = TClass::GetClass(art);
244  if (!actionResultTypeClass) {
245  std::string exceptionText = "An error occurred while inferring the result type of an operation.";
246  throw std::runtime_error(exceptionText.c_str());
247  }
248  const auto actionResultTypeName = actionResultTypeClass->GetName();
249 
250  // retrieve type of action as a string
251  auto actionTypeClass = TClass::GetClass(at);
252  if (!actionTypeClass) {
253  std::string exceptionText = "An error occurred while inferring the action type of the operation.";
254  throw std::runtime_error(exceptionText.c_str());
255  }
256  const auto actionTypeName = actionTypeClass->GetName();
257 
258  // createAction_str will contain the following:
259  // ROOT::Internal::TDF::CallBuildAndBook<actionType, branchType1, branchType2...>(
260  // *reinterpret_cast<PrevNodeType*>(prevNode), { bl[0], bl[1], ... }, reinterpret_cast<actionResultType*>(rOnHeap),
261  // reinterpret_cast<shared_ptr<TActionBase*>*>(actionPtrPtr))
262  std::stringstream createAction_str;
263  createAction_str << "ROOT::Internal::TDF::CallBuildAndBook"
264  << "<" << actionTypeName;
265  for (auto &colType : columnTypeNames)
266  createAction_str << ", " << colType;
267  createAction_str << ">(*reinterpret_cast<" << prevNodeTypename << "*>(" << prevNode << "), {";
268  for (auto i = 0u; i < bl.size(); ++i) {
269  if (i != 0u)
270  createAction_str << ", ";
271  createAction_str << '"' << bl[i] << '"';
272  }
273  createAction_str << "}, " << nSlots << ", reinterpret_cast<" << actionResultTypeName << "*>(" << rOnHeap << ")"
274  << ", reinterpret_cast<const std::shared_ptr<ROOT::Internal::TDF::TActionBase*>*>(" << actionPtrPtr
275  << "));";
276  return createAction_str.str();
277 }
278 
/// Return true if at least one element of `strings` is empty, false otherwise (in particular for an empty list).
bool AtLeastOneEmptyString(const std::vector<std::string_view> strings)
{
   return std::any_of(strings.begin(), strings.end(), [](const std::string_view &str) { return str.empty(); });
}
287 
288 std::shared_ptr<TFilterBase> UpcastNode(const std::shared_ptr<TFilterBase> ptr)
289 {
290  return ptr;
291 }
292 
293 std::shared_ptr<TCustomColumnBase> UpcastNode(const std::shared_ptr<TCustomColumnBase> ptr)
294 {
295  return ptr;
296 }
297 
298 std::shared_ptr<TRangeBase> UpcastNode(const std::shared_ptr<TRangeBase> ptr)
299 {
300  return ptr;
301 }
302 
303 std::shared_ptr<TLoopManager> UpcastNode(const std::shared_ptr<TLoopManager> ptr)
304 {
305  return ptr;
306 }
307 
308 /// Given the desired number of columns and the user-provided list of columns:
309 /// * fallback to using the first nColumns default columns if needed (or throw if nColumns > nDefaultColumns)
310 /// * check that selected column names refer to valid branches, custom columns or datasource columns (throw if not)
311 /// Return the list of selected column names.
312 ColumnNames_t GetValidatedColumnNames(TLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns,
313  const ColumnNames_t &validCustomColumns, TDataSource *ds)
314 {
315  const auto &defaultColumns = lm.GetDefaultColumnNames();
316  auto selectedColumns = SelectColumns(nColumns, columns, defaultColumns);
317  const auto unknownColumns = FindUnknownColumns(selectedColumns, lm.GetTree(), validCustomColumns,
318  ds ? ds->GetColumnNames() : ColumnNames_t{});
319 
320  if (!unknownColumns.empty()) {
321  // throw
322  std::stringstream unknowns;
323  std::string delim = unknownColumns.size() > 1 ? "s: " : ": "; // singular/plural
324  for (auto &unknownColumn : unknownColumns) {
325  unknowns << delim << unknownColumn;
326  delim = ',';
327  }
328  throw std::runtime_error("Unknown column" + unknowns.str());
329  }
330 
331  // Now we need to check within the aliases if some of the yet unknown names can be recovered
332  auto &aliasMap = lm.GetAliasMap();
333  auto aliasMapEnd = aliasMap.end();
334 
335  for (auto idx : ROOT::TSeqU(selectedColumns.size())) {
336  const auto &colName = selectedColumns[idx];
337  const auto aliasColumnNameIt = aliasMap.find(colName);
338  if (aliasMapEnd != aliasColumnNameIt) {
339  selectedColumns[idx] = aliasColumnNameIt->second;
340  }
341  }
342 
343  return selectedColumns;
344 }
345 
346 /// Return a bitset each element of which indicates whether the corresponding element in `selectedColumns` is the
347 /// name of a column that must be defined via datasource. All elements of the returned vector are false if no
348 /// data-source is present.
349 std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedCols)
350 {
351  const auto nColumns = requestedCols.size();
352  std::vector<bool> mustBeDefined(nColumns, false);
353  for (auto i = 0u; i < nColumns; ++i)
354  mustBeDefined[i] = std::find(definedCols.begin(), definedCols.end(), requestedCols[i]) == definedCols.end();
355  return mustBeDefined;
356 }
357 
358 } // end ns TDF
359 } // end ns Internal
360 } // end ns ROOT
std::vector< bool > FindUndefinedDSColumns(const ColumnNames_t &requestedCols, const ColumnNames_t &definedDSCols)
Return a bitset each element of which indicates whether the corresponding element in selectedColumns ...
std::shared_ptr< TFilterBase > UpcastNode(const std::shared_ptr< TFilterBase > ptr)
basic_string_view< char > string_view
Definition: RStringView.h:35
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
ColumnNames_t FindUnknownColumns(const ColumnNames_t &requiredCols, TTree *tree, const ColumnNames_t &definedCols, const ColumnNames_t &dataSourceColumns)
Definition: TDFUtils.cxx:339
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
Regular expression class.
Definition: TRegexp.h:31
#define gInterpreter
Definition: TInterpreter.h:526
const std::map< std::string, std::string > & GetAliasMap() const
Definition: TDFNodes.hxx:200
std::string JitBuildAndBook(const ColumnNames_t &bl, const std::string &prevNodeTypename, void *prevNode, const std::type_info &art, const std::type_info &at, const void *r, TTree *tree, const unsigned int nSlots, const std::map< std::string, TmpBranchBasePtr_t > &customColumns, TDataSource *ds, const std::shared_ptr< TActionBase *> *const actionPtrPtr)
ColumnNames_t GetValidatedColumnNames(TLoopManager &lm, const unsigned int nColumns, const ColumnNames_t &columns, const ColumnNames_t &validCustomColumns, TDataSource *ds)
Given the desired number of columns and the user-provided list of columns:
const ColumnNames_t SelectColumns(unsigned int nRequiredNames, const ColumnNames_t &names, const ColumnNames_t &defaultNames)
Choose between local column names or default column names, throw in case of errors.
Definition: TDFUtils.cxx:314
TDataSource defines an API that TDataFrame can use to read arbitrary data formats.
Definition: TDataSource.hxx:51
const ColumnNames_t & GetDefaultColumnNames() const
Return the list of default columns – empty if none was provided when constructing the TDataFrame...
Definition: TDFNodes.cxx:413
long Long_t
Definition: RtypesCore.h:50
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
static constexpr double s
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2887
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *tree, TCustomColumnBase *tmpBranch, TDataSource *ds)
Return a string containing the type of the given branch.
Definition: TDFUtils.cxx:123
std::vector< std::string > FindUsedColumnNames(std::string_view expression, const ColumnNames_t &branches, const ColumnNames_t &customColumns, const ColumnNames_t &dsColumns, const std::map< std::string, std::string > &aliasMap)
Definition: tree.py:1
Long_t JitTransformation(void *thisPtr, std::string_view methodName, std::string_view interfaceTypeName, std::string_view name, std::string_view expression, const std::map< std::string, std::string > &aliasMap, const ColumnNames_t &branches, const std::vector< std::string > &customColumns, const std::map< std::string, TmpBranchBasePtr_t > &tmpBookedBranches, TTree *tree, std::string_view returnTypeName, TDataSource *ds)
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
char name[80]
Definition: TGX11.cxx:109
The public interface to the TDataFrame federation of classes.