Logo ROOT  
Reference Guide
Loading...
Searching...
No Matches
RInterface.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDF_TINTERFACE
12#define ROOT_RDF_TINTERFACE
13
14#include "ROOT/RDataSource.hxx"
20#include "ROOT/RDF/RDefine.hxx"
22#include "ROOT/RDF/RFilter.hxx"
27#include "ROOT/RDF/RRange.hxx"
29#include "ROOT/RDF/Utils.hxx"
32#include "ROOT/RResultPtr.hxx"
34#include <string_view>
35#include "ROOT/RVec.hxx"
36#include "ROOT/TypeTraits.hxx"
37#include "RtypesCore.h" // for ULong64_t
38#include "TDirectory.h"
39#include "TH1.h" // For Histo actions
40#include "TH2.h" // For Histo actions
41#include "TH3.h" // For Histo actions
42#include "THn.h"
43#include "THnSparse.h"
44#include "TProfile.h"
45#include "TProfile2D.h"
46#include "TStatistic.h"
47
48#include "RConfigure.h" // for R__HAS_ROOT7
49#ifdef R__HAS_ROOT7
51#include <ROOT/RHist.hxx>
52#include <ROOT/RHistEngine.hxx>
53#endif
54
55#include <algorithm>
56#include <cstddef>
57#include <initializer_list>
58#include <iterator> // std::back_inserter
59#include <limits>
60#include <memory>
61#include <set>
62#include <sstream>
63#include <stdexcept>
64#include <string>
65#include <type_traits> // is_same, enable_if
66#include <typeinfo>
67#include <unordered_set>
68#include <utility> // std::index_sequence
69#include <vector>
70#include <any>
71
72class TGraph;
73
74// Windows requires a forward decl of printValue to accept it as a valid friend function in RInterface
75namespace ROOT {
78void EnableImplicitMT(UInt_t numthreads);
79class RDataFrame;
80} // namespace ROOT
81namespace cling {
82std::string printValue(ROOT::RDataFrame *tdf);
83}
84
85namespace ROOT {
86namespace RDF {
87namespace RDFDetail = ROOT::Detail::RDF;
88namespace RDFInternal = ROOT::Internal::RDF;
89namespace TTraits = ROOT::TypeTraits;
90
91template <typename Proxied>
92class RInterface;
93
95} // namespace RDF
96
97namespace Internal {
98namespace RDF {
99class GraphCreatorHelper;
100void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
101void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end);
102void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec);
103std::vector<std::pair<std::uint64_t, std::uint64_t>> GetDatasetGlobalClusterBoundaries(const RNode &node);
105std::string GetDataSourceLabel(const ROOT::RDF::RNode &node);
106void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline);
107} // namespace RDF
108} // namespace Internal
109
110namespace RDF {
111
112// clang-format off
113/**
114 * \class ROOT::RDF::RInterface
115 * \ingroup dataframe
116 * \brief The public interface to the RDataFrame federation of classes.
117 * \tparam Proxied One of the "node" base types (e.g. RLoopManager, RFilterBase). The user never specifies this type manually.
118 *
119 * The documentation of each method features a one liner illustrating how to use the method, for example showing how
120 * the majority of the template parameters are automatically deduced requiring no or very little effort by the user.
121 */
122// clang-format on
123template <typename Proxied>
125 using RFilterBase = RDFDetail::RFilterBase;
126 using RRangeBase = RDFDetail::RRangeBase;
127 using RLoopManager = RDFDetail::RLoopManager;
128 friend std::string cling::printValue(::ROOT::RDataFrame *tdf); // For a nice printing at the prompt
130
131 template <typename T>
132 friend class RInterface;
133
135 friend void RDFInternal::ChangeEmptyEntryRange(const RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
138 friend std::vector<std::pair<std::uint64_t, std::uint64_t>>
140 friend std::string ROOT::Internal::RDF::GetDataSourceLabel(const RNode &node);
141 friend void ROOT::Internal::RDF::SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline);
142 std::shared_ptr<Proxied> fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface.
143
144public:
145 ////////////////////////////////////////////////////////////////////////////
146 /// \brief Copy-assignment operator for RInterface.
147 RInterface &operator=(const RInterface &) = default;
148
149 ////////////////////////////////////////////////////////////////////////////
150 /// \brief Copy-ctor for RInterface.
151 RInterface(const RInterface &) = default;
152
153 ////////////////////////////////////////////////////////////////////////////
154 /// \brief Move-ctor for RInterface.
155 RInterface(RInterface &&) = default;
156
157 ////////////////////////////////////////////////////////////////////////////
158 /// \brief Move-assignment operator for RInterface.
160
161 ////////////////////////////////////////////////////////////////////////////
162 /// \brief Build a RInterface from a RLoopManager.
163 /// This constructor is only available for RInterface<RLoopManager>.
164 template <typename T = Proxied, typename = std::enable_if_t<std::is_same<T, RLoopManager>::value, int>>
165 RInterface(const std::shared_ptr<RLoopManager> &proxied) : RInterfaceBase(proxied), fProxiedPtr(proxied)
166 {
167 }
168
169 /// \name Transformation
170 /// \{
171
172 ////////////////////////////////////////////////////////////////////////////
173 /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode.
174 /// Different RDataFrame methods return different C++ types. All nodes, however,
175 /// can be cast to this common type at the cost of a small performance penalty.
176 /// This allows, for example, storing RDataFrame nodes in a vector, or passing them
177 /// around via (non-template, C++11) helper functions.
178 /// Example usage:
179 /// ~~~{.cpp}
180 /// // a function that conditionally adds a Range to a RDataFrame node.
181 /// RNode MaybeAddRange(RNode df, bool mustAddRange)
182 /// {
183 /// return mustAddRange ? df.Range(1) : df;
184 /// }
185 /// // use as :
186 /// ROOT::RDataFrame df(10);
187 /// auto maybeRanged = MaybeAddRange(df, true);
188 /// ~~~
189 /// Note that it is not a problem to pass RNode's by value.
190 operator RNode() const
191 {
192 return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fColRegister);
193 }
194
195 ////////////////////////////////////////////////////////////////////////////
196 /// \brief Append a filter to the call graph.
197 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
198 /// signalling whether the event has passed the selection (true) or not (false).
199 /// \param[in] columns Names of the columns/branches in input to the filter function.
200 /// \param[in] name Optional name of this filter. See `Report`.
201 /// \return the filter node of the computation graph.
202 ///
203 /// Append a filter node at the point of the call graph corresponding to the
204 /// object this method is called on.
205 /// The callable `f` should not have side-effects (e.g. modification of an
206 /// external or static variable) to ensure correct results when implicit
207 /// multi-threading is active.
208 ///
209 /// RDataFrame only evaluates filters when necessary: if multiple filters
210 /// are chained one after another, they are executed in order and the first
211 /// one returning false causes the event to be discarded.
212 /// Even if multiple actions or transformations depend on the same filter,
213 /// it is executed once per entry. If its result is requested more than
214 /// once, the cached result is served.
215 ///
216 /// ### Example usage:
217 /// ~~~{.cpp}
218 /// // C++ callable (function, functor class, lambda...) that takes two parameters of the types of "x" and "y"
219 /// auto filtered = df.Filter(myCut, {"x", "y"});
220 ///
221 /// // String: it must contain valid C++ except that column names can be used instead of variable names
222 /// auto filtered = df.Filter("x*y > 0");
223 /// ~~~
224 ///
225 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
226 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
227 /// ~~~{.cpp}
228 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
229 /// ~~~
230 /// but instead this will:
231 /// ~~~{.cpp}
232 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
233 /// ~~~
234 template <typename F, std::enable_if_t<!std::is_convertible<F, std::string>::value, int> = 0>
235 RInterface<RDFDetail::RFilter<F, Proxied>> Filter(F f, const ColumnNames_t &columns = {}, std::string_view name = "")
236 {
237 RDFInternal::CheckFilter(f);
238 using ColTypes_t = typename TTraits::CallableTraits<F>::arg_types;
239 constexpr auto nColumns = ColTypes_t::list_size;
240 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
241 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
242
243 using F_t = RDFDetail::RFilter<F, Proxied>;
244
245 auto filterPtr = std::make_shared<F_t>(std::move(f), validColumnNames, fProxiedPtr, fColRegister, name);
246 return RInterface<F_t>(std::move(filterPtr), *fLoopManager, fColRegister);
247 }
248
249 ////////////////////////////////////////////////////////////////////////////
250 /// \brief Append a filter to the call graph.
251 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
252 /// signalling whether the event has passed the selection (true) or not (false).
253 /// \param[in] name Optional name of this filter. See `Report`.
254 /// \return the filter node of the computation graph.
255 ///
256 /// Refer to the first overload of this method for the full documentation.
257 template <typename F, std::enable_if_t<!std::is_convertible<F, std::string>::value, int> = 0>
259 {
260 // The sfinae is there in order to pick up the overloaded method which accepts two strings
261 // rather than this template method.
262 return Filter(f, {}, name);
263 }
264
265 ////////////////////////////////////////////////////////////////////////////
266 /// \brief Append a filter to the call graph.
267 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
268 /// signalling whether the event has passed the selection (true) or not (false).
269 /// \param[in] columns Names of the columns/branches in input to the filter function.
270 /// \return the filter node of the computation graph.
271 ///
272 /// Refer to the first overload of this method for the full documentation.
273 template <typename F>
274 RInterface<RDFDetail::RFilter<F, Proxied>> Filter(F f, const std::initializer_list<std::string> &columns)
275 {
276 return Filter(f, ColumnNames_t{columns});
277 }
278
279 ////////////////////////////////////////////////////////////////////////////
280 /// \brief Append a filter to the call graph.
281 /// \param[in] expression The filter expression in C++
282 /// \param[in] name Optional name of this filter. See `Report`.
283 /// \return the filter node of the computation graph.
284 ///
285 /// The expression is just-in-time compiled and used to filter entries. It must
286 /// be valid C++ syntax in which variable names are substituted with the names
287 /// of branches/columns.
288 ///
289 /// ### Example usage:
290 /// ~~~{.cpp}
291 /// auto filtered_df = df.Filter("myCollection.size() > 3");
292 /// auto filtered_name_df = df.Filter("myCollection.size() > 3", "Minimum collection size");
293 /// ~~~
294 ///
295 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
296 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
297 /// ~~~{.cpp}
298 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
299 /// ~~~
300 /// but instead this will:
301 /// ~~~{.cpp}
302 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
303 /// ~~~
304 RInterface<RDFDetail::RJittedFilter> Filter(std::string_view expression, std::string_view name = "")
305 {
306 const auto jittedFilter = RDFInternal::BookFilterJit(RDFInternal::UpcastNode(fProxiedPtr), name, expression,
307 fColRegister, nullptr, GetDataSource());
308
309 return RInterface<RDFDetail::RJittedFilter>(std::move(jittedFilter), *fLoopManager, fColRegister);
310 }
311
312 ////////////////////////////////////////////////////////////////////////////
313 /// \brief Discard entries with missing values
314 /// \param[in] column Column name whose entries with missing values should be discarded
315 /// \return The filter node of the computation graph
316 ///
317 /// This operation is useful in case an entry of the dataset is incomplete,
318 /// i.e. if one or more of the columns do not have valid values. If the value
319 /// of the input column is missing for an entry, the entire entry will be
320 /// discarded from the rest of this branch of the computation graph.
321 ///
322 /// Use cases include:
323 /// * When processing multiple files, one or more of them is missing a column
324 /// * In horizontal joining with entry matching, a certain dataset has no
325 /// match for the current entry.
326 ///
327 /// ### Example usage:
328 ///
329 /// \code{.py}
330 /// # Assume a dataset with columns [idx, x] matching another dataset with
331 /// # columns [idx, y]. For idx == 42, the right-hand dataset has no match
332 /// df = ROOT.RDataFrame(dataset)
333 /// df_nomissing = df.FilterAvailable("idx").Define("z", "x + y")
334 /// colz = df_nomissing.Take[int]("z")
335 /// \endcode
336 ///
337 /// \code{.cpp}
338 /// // Assume a dataset with columns [idx, x] matching another dataset with
339 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
340 /// ROOT::RDataFrame df{dataset};
341 /// auto df_nomissing = df.FilterAvailable("idx")
342 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
343 /// auto colz = df_nomissing.Take<int>("z");
344 /// \endcode
345 ///
346 /// \note See FilterMissing() if you want to keep only the entries with
347 /// missing values instead.
349 {
350 const auto columns = ColumnNames_t{column.data()};
351 // For now disable this functionality in case of an empty data source and
352 // the column name was not defined previously.
353 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
354 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
355 using F_t = RDFDetail::RFilterWithMissingValues<Proxied>;
356 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ true, fProxiedPtr, fColRegister, columns);
357 CheckAndFillDSColumns(columns, TTraits::TypeList<void>{});
358 return RInterface<F_t>(std::move(filterPtr), *fLoopManager, fColRegister);
359 }
360
361 ////////////////////////////////////////////////////////////////////////////
362 /// \brief Keep only the entries that have missing values.
363 /// \param[in] column Column name whose entries with missing values should be kept
364 /// \return The filter node of the computation graph
365 ///
366 /// This operation is useful in case an entry of the dataset is incomplete,
367 /// i.e. if one or more of the columns do not have valid values. It only
368 /// keeps the entries for which the value of the input column is missing.
369 ///
370 /// Use cases include:
371 /// * When processing multiple files, one or more of them is missing a column
372 /// * In horizontal joining with entry matching, a certain dataset has no
373 /// match for the current entry.
374 ///
375 /// ### Example usage:
376 ///
377 /// \code{.py}
378 /// # Assume a dataset made of two files vertically chained together, one has
379 /// # column "x" and the other has column "y"
380 /// df = ROOT.RDataFrame(dataset)
381 /// df_valid_col_x = df.FilterMissing("y")
382 /// df_valid_col_y = df.FilterMissing("x")
383 /// display_x = df_valid_col_x.Display(("x",))
384 /// display_y = df_valid_col_y.Display(("y",))
385 /// \endcode
386 ///
387 /// \code{.cpp}
388 /// // Assume a dataset made of two files vertically chained together, one has
389 /// // column "x" and the other has column "y"
390 /// ROOT::RDataFrame df{dataset};
391 /// auto df_valid_col_x = df.FilterMissing("y");
392 /// auto df_valid_col_y = df.FilterMissing("x");
393 /// auto display_x = df_valid_col_x.Display<int>({"x"});
394 /// auto display_y = df_valid_col_y.Display<int>({"y"});
395 /// \endcode
396 ///
397 /// \note See FilterAvailable() if you want to discard the entries in case
398 /// there is a missing value instead.
400 {
401 const auto columns = ColumnNames_t{column.data()};
402 // For now disable this functionality in case of an empty data source and
403 // the column name was not defined previously.
404 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
405 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
406 using F_t = RDFDetail::RFilterWithMissingValues<Proxied>;
407 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ false, fProxiedPtr, fColRegister, columns);
408 CheckAndFillDSColumns(columns, TTraits::TypeList<void>{});
409 return RInterface<F_t>(std::move(filterPtr), *fLoopManager, fColRegister);
410 }
411
412 // clang-format off
413 ////////////////////////////////////////////////////////////////////////////
414 /// \brief Define a new column.
415 /// \param[in] name The name of the defined column.
416 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column. This callable must be thread safe when used with multiple threads.
417 /// \param[in] columns Names of the columns/branches in input to the producer function.
418 /// \return the first node of the computation graph for which the new quantity is defined.
419 ///
420 /// Define a column that will be visible from all subsequent nodes
421 /// of the functional chain. The `expression` is only evaluated for entries that pass
422 /// all the preceding filters.
423 /// A new variable is created called `name`, accessible as if it was contained
424 /// in the dataset from subsequent transformations/actions.
425 ///
426 /// Use cases include:
427 /// * caching the results of complex calculations for easy and efficient multiple access
428 /// * extraction of quantities of interest from complex objects
429 ///
430 /// An exception is thrown if the name of the new column is already in use in this branch of the computation graph.
431 /// Note that the callable must be thread safe when called from multiple threads. Use DefineSlot() if needed.
432 ///
433 /// ### Example usage:
434 /// ~~~{.cpp}
435 /// // assuming a function with signature:
436 /// double myComplexCalculation(const RVec<float> &muon_pts);
437 /// // we can pass it directly to Define
438 /// auto df_with_define = df.Define("newColumn", myComplexCalculation, {"muon_pts"});
439 /// // alternatively, we can pass the body of the function as a string, as in Filter:
440 /// auto df_with_define = df.Define("newColumn", "x*x + y*y");
441 /// ~~~
442 ///
443 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
444 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
445 /// ~~~{.cpp}
446 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
447 /// ~~~
448 /// but instead this will:
449 /// ~~~{.cpp}
450 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
451 /// ~~~
452 template <typename F, typename std::enable_if_t<!std::is_convertible<F, std::string>::value, int> = 0>
453 RInterface<Proxied> Define(std::string_view name, F expression, const ColumnNames_t &columns = {})
454 {
455 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Define");
456 }
457 // clang-format on
458
459 // clang-format off
460 ////////////////////////////////////////////////////////////////////////////
461 /// \brief Define a new column with a value dependent on the processing slot.
462 /// \param[in] name The name of the defined column.
463 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
464 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding the slot number).
465 /// \return the first node of the computation graph for which the new quantity is defined.
466 ///
467 /// This alternative implementation of `Define` is meant as a helper to evaluate new column values in a thread-safe manner.
468 /// The expression must be a callable of signature R(unsigned int, T1, T2, ...) where `T1, T2...` are the types
469 /// of the columns that the expression takes as input. The first parameter is reserved for an unsigned integer
470 /// representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
471 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
472 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
473 ///
474 /// The following two calls are equivalent, although `DefineSlot` is slightly more performant:
475 /// ~~~{.cpp}
476 /// int function(unsigned int, double, double);
477 /// df.Define("x", function, {"rdfslot_", "column1", "column2"})
478 /// df.DefineSlot("x", function, {"column1", "column2"})
479 /// ~~~
480 ///
481 /// See Define() for more information.
482 template <typename F>
483 RInterface<Proxied> DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
484 {
485 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "DefineSlot");
486 }
487 // clang-format on
488
489 // clang-format off
490 ////////////////////////////////////////////////////////////////////////////
491 /// \brief Define a new column with a value dependent on the processing slot and the current entry.
492 /// \param[in] name The name of the defined column.
493 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
494 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
495 /// \return the first node of the computation graph for which the new quantity is defined.
496 ///
497 /// This alternative implementation of `Define` is meant as a helper in writing entry-specific, thread-safe custom
498 /// columns. The expression must be a callable of signature R(unsigned int, ULong64_t, T1, T2, ...) where `T1, T2...`
499 /// are the types of the columns that the expression takes as input. The first parameter is reserved for an unsigned
500 /// integer representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
501 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
502 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
503 /// The second parameter is reserved for a `ULong64_t` representing the current entry being processed by the current thread.
504 ///
505 /// The following two `Define`s are equivalent, although `DefineSlotEntry` is slightly more performant:
506 /// ~~~{.cpp}
507 /// int function(unsigned int, ULong64_t, double, double);
508 /// Define("x", function, {"rdfslot_", "rdfentry_", "column1", "column2"})
509 /// DefineSlotEntry("x", function, {"column1", "column2"})
510 /// ~~~
511 ///
512 /// See Define() for more information.
513 template <typename F>
514 RInterface<Proxied> DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
515 {
516 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::SlotAndEntry>(name, std::move(expression), columns,
517 "DefineSlotEntry");
518 }
519 // clang-format on
520
521 ////////////////////////////////////////////////////////////////////////////
522 /// \brief Define a new column.
523 /// \param[in] name The name of the defined column.
524 /// \param[in] expression An expression in C++ which represents the defined value
525 /// \return the first node of the computation graph for which the new quantity is defined.
526 ///
527 /// The expression is just-in-time compiled and used to produce the column entries.
528 /// It must be valid C++ syntax in which variable names are substituted with the names
529 /// of branches/columns.
530 ///
531 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
532 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
533 /// ~~~{.cpp}
534 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
535 /// ~~~
536 /// but instead this will:
537 /// ~~~{.cpp}
538 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
539 /// ~~~
540 ///
541 /// Refer to the first overload of this method for the full documentation.
542 RInterface<Proxied> Define(std::string_view name, std::string_view expression)
543 {
544 constexpr auto where = "Define";
546 // these checks must be done before jitting lest we throw exceptions in jitted code
549
550 auto jittedDefine = RDFInternal::BookDefineJit(name, expression, *fLoopManager, GetDataSource(), fColRegister);
551
553 newCols.AddDefine(std::move(jittedDefine));
554
555 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
556
557 return newInterface;
558 }
559
560 ////////////////////////////////////////////////////////////////////////////
561 /// \brief Overwrite the value and/or type of an existing column.
562 /// \param[in] name The name of the column to redefine.
563 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
564 /// \param[in] columns Names of the columns/branches in input to the expression.
565 /// \return the first node of the computation graph for which the quantity is redefined.
566 ///
567 /// The old value of the column can be used as an input for the expression.
568 ///
569 /// An exception is thrown in case the column to redefine does not already exist.
570 /// See Define() for more information.
571 template <typename F, std::enable_if_t<!std::is_convertible<F, std::string>::value, int> = 0>
572 RInterface<Proxied> Redefine(std::string_view name, F expression, const ColumnNames_t &columns = {})
573 {
574 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Redefine");
575 }
576
577 // clang-format off
578 ////////////////////////////////////////////////////////////////////////////
579 /// \brief Overwrite the value and/or type of an existing column.
580 /// \param[in] name The name of the column to redefine.
581 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
582 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot).
583 /// \return the first node of the computation graph for which the new quantity is defined.
584 ///
585 /// The old value of the column can be used as an input for the expression.
586 /// An exception is thrown in case the column to redefine does not already exist.
587 ///
588 /// See DefineSlot() for more information.
589 // clang-format on
590 template <typename F>
591 RInterface<Proxied> RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
592 {
593 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "RedefineSlot");
594 }
595
596 // clang-format off
597 ////////////////////////////////////////////////////////////////////////////
598 /// \brief Overwrite the value and/or type of an existing column.
599 /// \param[in] name The name of the column to redefine.
600 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
601 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
602 /// \return the first node of the computation graph for which the new quantity is defined.
603 ///
604 /// The old value of the column can be used as an input for the expression.
605 /// An exception is thrown in case the column to re-define does not already exist.
606 ///
607 /// See DefineSlotEntry() for more information.
608 // clang-format on
609 template <typename F>
610 RInterface<Proxied> RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
611 {
612 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::SlotAndEntry>(name, std::move(expression), columns,
613 "RedefineSlotEntry");
614 }
615
616 ////////////////////////////////////////////////////////////////////////////
617 /// \brief Overwrite the value and/or type of an existing column.
618 /// \param[in] name The name of the column to redefine.
619 /// \param[in] expression An expression in C++ which represents the defined value
620 /// \return the first node of the computation graph for which the new quantity is defined.
621 ///
622 /// The expression is just-in-time compiled and used to produce the column entries.
623 /// It must be valid C++ syntax in which variable names are substituted with the names
624 /// of branches/columns.
625 ///
626 /// The old value of the column can be used as an input for the expression.
627 /// An exception is thrown in case the column to re-define does not already exist.
628 ///
629 /// Aliases cannot be overridden. See the corresponding Define() overload for more information.
630 RInterface<Proxied> Redefine(std::string_view name, std::string_view expression)
631 {
632 constexpr auto where = "Redefine";
637
638 auto jittedDefine = RDFInternal::BookDefineJit(name, expression, *fLoopManager, GetDataSource(), fColRegister);
639
641 newCols.AddDefine(std::move(jittedDefine));
642
643 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
644
645 return newInterface;
646 }
647
648 ////////////////////////////////////////////////////////////////////////////
649 /// \brief In case the value in the given column is missing, provide a default value
650 /// \tparam T The type of the column
651 /// \param[in] column Column name where missing values should be replaced by the given default value
652 /// \param[in] defaultValue Value to provide instead of a missing value
653 /// \return The node of the graph that will provide a default value
654 ///
655 /// This operation is useful in case an entry of the dataset is incomplete,
656 /// i.e. if one or more of the columns do not have valid values. It does not
657 /// modify the values of the column, but in case any entry is missing, it
658 /// will provide the default value to downstream nodes instead.
659 ///
660 /// Use cases include:
661 /// * When processing multiple files, one or more of them is missing a column
662 /// * In horizontal joining with entry matching, a certain dataset has no
663 /// match for the current entry.
664 ///
665 /// ### Example usage:
666 ///
667 /// \code{.cpp}
668 /// // Assume a dataset with columns [idx, x] matching another dataset with
669 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
670 /// ROOT::RDataFrame df{dataset};
671 /// auto df_default = df.DefaultValueFor("y", 33)
672 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
673 /// auto colz = df_default.Take<int>("z");
674 /// \endcode
675 ///
676 /// \code{.py}
677 /// df = ROOT.RDataFrame(dataset)
678 /// df_default = df.DefaultValueFor("y", 33).Define("z", "x + y")
679 /// colz = df_default.Take[int]("z")
680 /// \endcode
681 template <typename T>
682 RInterface<Proxied> DefaultValueFor(std::string_view column, const T &defaultValue)
683 {
684 constexpr auto where{"DefaultValueFor"};
686 // For now disable this functionality in case of an empty data source and
687 // the column name was not defined previously.
688 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
691
692 // Declare return type to the interpreter, for future use by jitted actions
693 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(T));
694 if (retTypeName.empty()) {
695 // The type is not known to the interpreter.
696 // We must not error out here, but if/when this column is used in jitted code
697 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(T));
698 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
699 }
700
701 const auto validColumnNames = ColumnNames_t{column.data()};
702 auto newColumn = std::make_shared<ROOT::Internal::RDF::RDefaultValueFor<T>>(
703 column, retTypeName, defaultValue, validColumnNames, fColRegister, *fLoopManager);
704 CheckAndFillDSColumns(validColumnNames, TTraits::TypeList<T>{});
705
707 newCols.AddDefine(std::move(newColumn));
708
709 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
710
711 return newInterface;
712 }
713
714 // clang-format off
715 ////////////////////////////////////////////////////////////////////////////
716 /// \brief Define a new column that is updated when the input sample changes.
717 /// \param[in] name The name of the defined column.
718 /// \param[in] expression A C++ callable that computes the new value of the defined column.
719 /// \return the first node of the computation graph for which the new quantity is defined.
720 ///
721 /// The signature of the callable passed as second argument should be `T(unsigned int slot, const ROOT::RDF::RSampleInfo &id)`
722 /// where:
723 /// - `T` is the type of the defined column
724 /// - `slot` is a number in the range [0, nThreads) that is different for each processing thread. This can simplify
725 /// the definition of thread-safe callables if you are interested in using parallel capabilities of RDataFrame.
726 /// - `id` is an instance of a ROOT::RDF::RSampleInfo object which contains information about the sample which is
727 /// being processed (see the class docs for more information).
728 ///
729 /// DefinePerSample() is useful to e.g. define a quantity that depends on which TTree in which TFile is being
730 /// processed or to inject a callback into the event loop that is only called when the processing of a new sample
731 /// starts rather than at every entry.
732 ///
733 /// The callable will be invoked once per input TTree or once per multi-thread task, whichever is more often.
734 ///
735 /// ### Example usage:
736 /// ~~~{.cpp}
737 /// ROOT::RDataFrame df{"mytree", {"sample1.root","sample2.root"}};
738 /// df.DefinePerSample("weightbysample",
739 /// [](unsigned int slot, const ROOT::RDF::RSampleInfo &id)
740 /// { return id.Contains("sample1") ? 1.0f : 2.0f; });
741 /// ~~~
742 // clang-format on
743 // TODO we could SFINAE on F's signature to provide friendlier compilation errors in case of signature mismatch
744 template <typename F, typename RetType_t = typename TTraits::CallableTraits<F>::ret_type>
745 RInterface<Proxied> DefinePerSample(std::string_view name, F expression)
746 {
747 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
750
751 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType_t));
752 if (retTypeName.empty()) {
753 // The type is not known to the interpreter.
754 // We must not error out here, but if/when this column is used in jitted code
755 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType_t));
756 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
757 }
758
759 auto newColumn =
760 std::make_shared<RDFDetail::RDefinePerSample<F>>(name, retTypeName, std::move(expression), *fLoopManager);
761
763 newCols.AddDefine(std::move(newColumn));
764 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
765 return newInterface;
766 }
767
768 // clang-format off
769 ////////////////////////////////////////////////////////////////////////////
770 /// \brief Define a new column that is updated when the input sample changes.
771 /// \param[in] name The name of the defined column.
772 /// \param[in] expression A valid C++ expression as a string, which will be used to compute the defined value.
773 /// \return the first node of the computation graph for which the new quantity is defined.
774 ///
775 /// The expression is just-in-time compiled and used to produce the column entries.
776 /// It must be valid C++ syntax and the usage of the special variable names `rdfslot_` and `rdfsampleinfo_` is
777 /// permitted, where these variables will take the same values as the `slot` and `id` parameters described at the
778 /// DefinePerSample(std::string_view name, F expression) overload. See the documentation of that overload for more information.
779 ///
780 /// ### Example usage:
781 /// ~~~{.py}
782 /// df = ROOT.RDataFrame('mytree', ['sample1.root','sample2.root'])
783 /// df.DefinePerSample('weightbysample', 'rdfsampleinfo_.Contains("sample1") ? 1.0f : 2.0f')
784 /// ~~~
785 ///
786 /// \note
787 /// If you have declared some C++ function to the interpreter, the correct syntax to call that function with this
788 /// overload of DefinePerSample is by calling it explicitly with the special names `rdfslot_` and `rdfsampleinfo_` as
789 /// input parameters. This is for example the correct way to call this overload when working in PyROOT:
790 /// ~~~{.py}
791 /// ROOT.gInterpreter.Declare(
792 /// """
793 /// float weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
794 /// return id.Contains("sample1") ? 1.0f : 2.0f;
795 /// }
796 /// """)
797 /// df = ROOT.RDataFrame("mytree", ["sample1.root","sample2.root"])
798 /// df.DefinePerSample("weightsbysample", "weights(rdfslot_, rdfsampleinfo_)")
799 /// ~~~
800 ///
801 /// \note
802 /// Differently from what happens in Define(), the string expression passed to DefinePerSample cannot contain
803 /// column names other than those mentioned above: the expression is evaluated once before the processing of the
804 /// sample even starts, so column values are not accessible.
805 // clang-format on
806 RInterface<Proxied> DefinePerSample(std::string_view name, std::string_view expression)
807 {
808 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
809 // these checks must be done before jitting lest we throw exceptions in jitted code
812
813 auto jittedDefine = RDFInternal::BookDefinePerSampleJit(name, expression, *fLoopManager, fColRegister);
814
816 newCols.AddDefine(std::move(jittedDefine));
817
818 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
819
820 return newInterface;
821 }
822
823 /// \brief Register systematic variations for a single existing column using custom variation tags.
824 /// \param[in] colName name of the column for which varied values are provided.
825 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
826 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
827 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
828 /// \param[in] inputColumns the names of the columns to be passed to the callable.
829 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
830 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
831 ///
832 /// Vary provides a natural and flexible syntax to define systematic variations that automatically propagate to
833 /// Filters, Defines and results. RDataFrame usage of columns with attached variations does not change, but for
834 /// results that depend on any varied quantity, a map/dictionary of varied results can be produced with
835 /// ROOT::RDF::Experimental::VariationsFor (see the example below).
836 ///
837 /// The dictionary will contain a "nominal" value (accessed with the "nominal" key) for the unchanged result, and
838 /// values for each of the systematic variations that affected the result (via upstream Filters or via direct or
839 /// indirect dependencies of the column values on some registered variations). The keys will be a composition of
840 /// variation names and tags, e.g. "pt:up" and "pt:down" for the example below.
841 ///
842 /// In the following example we add up/down variations of pt and fill a histogram with a quantity that depends on pt.
843 /// We automatically obtain three histograms in output ("nominal", "pt:up" and "pt:down"):
844 /// ~~~{.cpp}
845 /// auto nominal_hx =
846 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, {"down", "up"})
847 /// .Filter("pt > k")
848 /// .Define("x", someFunc, {"pt"})
849 /// .Histo1D("x");
850 ///
851 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
852 /// hx["nominal"].Draw();
853 /// hx["pt:down"].Draw("SAME");
854 /// hx["pt:up"].Draw("SAME");
855 /// ~~~
856 /// RDataFrame computes all variations as part of a single loop over the data.
857 /// In particular, this means that I/O and computation of values shared
858 /// among variations only happen once for all variations. Thus, the event loop
859 /// run-time typically scales much better than linearly with the number of
860 /// variations.
861 ///
862 /// RDataFrame lazily computes the varied values required to produce the
863 /// outputs of \ref ROOT::RDF::Experimental::VariationsFor "VariationsFor()". If \ref
864 /// ROOT::RDF::Experimental::VariationsFor "VariationsFor()" was not called for a result, the computations are only
865 /// run for the nominal case.
866 ///
867 /// See other overloads for examples when variations are added for multiple existing columns,
868 /// or when the tags are auto-generated instead of being directly defined.
869 template <typename F>
870 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
871 const std::vector<std::string> &variationTags, std::string_view variationName = "")
872 {
873 std::vector<std::string> colNames{{std::string(colName)}}; // VaryImpl is handed a list of names: wrap the single column
874 const std::string theVariationName{variationName.empty() ? colName : variationName}; // default to the column name
875
876 return VaryImpl<true>(std::move(colNames), std::forward<F>(expression), inputColumns, variationTags,
877 theVariationName); // NOTE(review): <true> presumably selects the single-column path -- confirm in VaryImpl
878 }
879
880 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
881 /// \param[in] colName name of the column for which varied values are provided.
882 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
883 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
884 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
885 /// \param[in] inputColumns the names of the columns to be passed to the callable.
886 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
887 /// `"1"`, etc.
888 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
889 /// colName is used if none is provided.
890 ///
891 /// This overload of Vary takes an nVariations parameter instead of a list of tag names.
892 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
893 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
894 ///
895 /// Example usage:
896 /// ~~~{.cpp}
897 /// auto nominal_hx =
898 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, 2)
899 /// .Histo1D("x");
900 ///
901 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
902 /// hx["nominal"].Draw();
903 /// hx["pt:0"].Draw("SAME");
904 /// hx["pt:1"].Draw("SAME");
905 /// ~~~
906 ///
907 /// \note See also This Vary() overload for more information.
908 template <typename F>
909 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
910 std::size_t nVariations, std::string_view variationName = "")
911 {
912 R__ASSERT(nVariations > 0 && "Must have at least one variation."); // asking for zero variations is a usage error
913
914 std::vector<std::string> variationTags; // auto-generated tags "0", "1", ..., one per variation
915 variationTags.reserve(nVariations);
916 for (std::size_t i = 0u; i < nVariations; ++i)
917 variationTags.emplace_back(std::to_string(i));
918
919 const std::string theVariationName{variationName.empty() ? colName : variationName}; // default to the column name
920
921 return Vary(colName, std::forward<F>(expression), inputColumns, std::move(variationTags), theVariationName); // delegate to the explicit-tags overload
922 }
923
924 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
925 /// \param[in] colNames set of names of the columns for which varied values are provided.
926 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
927 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
928 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
929 /// \param[in] inputColumns the names of the columns to be passed to the callable.
930 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
931 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`
932 ///
933 /// This overload of Vary takes a list of column names as first argument and
934 /// requires that the expression returns an RVec of RVecs of values: one inner RVec for the variations of each
935 /// affected column. The `variationTags` are defined as `{"down", "up"}`.
936 ///
937 /// Example usage:
938 /// ~~~{.cpp}
939 /// // produce variations "ptAndEta:down" and "ptAndEta:up"
940 /// auto nominal_hx =
941 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
942 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
943 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
944 /// {"down", "up"}, // variation tags
945 /// "ptAndEta") // variation name
946 /// .Histo1D("pt", "eta");
947 ///
948 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
949 /// hx["nominal"].Draw();
950 /// hx["ptAndEta:down"].Draw("SAME");
951 /// hx["ptAndEta:up"].Draw("SAME");
952 /// ~~~
953 ///
954 /// \note See also This Vary() overload for more information.
955
956 template <typename F>
957 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
958 const std::vector<std::string> &variationTags, std::string_view variationName)
959 {
960 return VaryImpl<false>(colNames, std::forward<F>(expression), inputColumns, variationTags, variationName); // NOTE(review): <false> presumably selects the multi-column path -- confirm in VaryImpl
961 }
962
963 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
964 /// \param[in] colNames set of names of the columns for which varied values are provided.
965 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
966 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
967 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
968 /// \param[in] inputColumns the names of the columns to be passed to the callable.
969 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
970 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
971 /// colName is used if none is provided.
972 ///
973 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
974 /// is avoided.
975 ///
976 /// \note See also This Vary() overload for more information.
977 template <typename F>
979 Vary(std::initializer_list<std::string> colNames, F &&expression, const ColumnNames_t &inputColumns,
980 const std::vector<std::string> &variationTags, std::string_view variationName)
981 {
982 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, variationTags, variationName);
983 }
984
985 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
986 /// \param[in] colNames set of names of the columns for which varied values are provided.
987 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
988 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
989 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
990 /// \param[in] inputColumns the names of the columns to be passed to the callable.
991 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
992 /// `"1"`, etc.
993 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
994 /// colName is used if none is provided.
995 ///
996 /// This overload of Vary takes a list of column names as first argument.
997 /// It takes an `nVariations` parameter instead of a list of tag names (`variationTags`). Tag names
998 /// will be auto-generated as the sequence 0...``nVariations-1``.
999 ///
1000 /// Example usage:
1001 /// ~~~{.cpp}
1002 /// auto nominal_hx =
1003 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
1004 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
1005 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
1006 /// 2, // auto-generated variation tags
1007 /// "ptAndEta") // variation name
1008 /// .Histo1D("pt", "eta");
1009 ///
1010 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1011 /// hx["nominal"].Draw();
1012 /// hx["ptAndEta:0"].Draw("SAME");
1013 /// hx["ptAndEta:1"].Draw("SAME");
1014 /// ~~~
1015 ///
1016 /// \note See also This Vary() overload for more information.
1017 template <typename F>
1018 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
1019 std::size_t nVariations, std::string_view variationName)
1020 {
1021 R__ASSERT(nVariations > 0 && "Must have at least one variation."); // asking for zero variations is a usage error
1022
1023 std::vector<std::string> variationTags; // auto-generated tags "0", "1", ..., one per variation
1024 variationTags.reserve(nVariations);
1025 for (std::size_t i = 0u; i < nVariations; ++i)
1026 variationTags.emplace_back(std::to_string(i));
1027
1028 return Vary(colNames, std::forward<F>(expression), inputColumns, std::move(variationTags), variationName); // delegate to the explicit-tags overload
1029 }
1030
1031 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
1032 /// \param[in] colNames set of names of the columns for which varied values are provided.
1033 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
1034 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
1035 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
1036 /// \param[in] inputColumns the names of the columns to be passed to the callable.
1037 /// \param[in] inputColumns the names of the columns to be passed to the callable.
1038 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1039 /// `"1"`, etc.
1040 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1041 /// colName is used if none is provided.
1042 ///
1043 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1044 /// is avoided.
1045 ///
1046 /// \note See also This Vary() overload for more information.
1047 template <typename F>
1048 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, F &&expression,
1049 const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
1050 {
1051 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, nVariations, variationName); // build the vector explicitly so the vector<string> overload is chosen
1052 }
1053
1054 /// \brief Register systematic variations for a single existing column using custom variation tags.
1055 /// \param[in] colName name of the column for which varied values are provided.
1056 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1057 /// values for the specified column.
1058 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1059 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1060 /// colName is used if none is provided.
1061 ///
1062 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1063 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1064 /// defined as `{"down", "up"}`.
1065 /// ~~~{.cpp}
1066 /// auto nominal_hx =
1067 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", {"down", "up"})
1068 /// .Filter("pt > k")
1069 /// .Define("x", someFunc, {"pt"})
1070 /// .Histo1D("x");
1071 ///
1072 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1073 /// hx["nominal"].Draw();
1074 /// hx["pt:down"].Draw("SAME");
1075 /// hx["pt:up"].Draw("SAME");
1076 /// ~~~
1077 ///
1078 /// ## Short-hand expression syntax
1079 ///
1080 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1081 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1082 /// the following is equivalent to the example above:
1083 ///
1084 /// ~~~{.cpp}
1085 /// auto nominal_hx =
1086 /// df.Vary("pt", "{pt*0.9, pt*1.1}", {"down", "up"})
1087 /// // Same as above
1088 /// ~~~
1089 ///
1090 /// \note See also This Vary() overload for more information.
1091 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression,
1092 const std::vector<std::string> &variationTags, std::string_view variationName = "")
1093 {
1094 std::vector<std::string> colNames{{std::string(colName)}}; // JittedVaryImpl is handed a list of names: wrap the single column
1095 const std::string theVariationName{variationName.empty() ? colName : variationName}; // default to the column name
1096
1097 return JittedVaryImpl(colNames, expression, variationTags, theVariationName, /*isSingleColumn=*/true);
1098 }
1099
1100 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
1101 /// \param[in] colName name of the column for which varied values are provided.
1102 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1103 /// values for the specified column.
1104 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1105 /// `"1"`, etc.
1106 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1107 /// colName is used if none is provided.
1108 ///
1109 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1110 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1111 /// auto-generated.
1112 /// ~~~{.cpp}
1113 /// auto nominal_hx =
1114 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", 2)
1115 /// .Histo1D("pt");
1116 ///
1117 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1118 /// hx["nominal"].Draw();
1119 /// hx["pt:0"].Draw("SAME");
1120 /// hx["pt:1"].Draw("SAME");
1121 /// ~~~
1122 ///
1123 /// ## Short-hand expression syntax
1124 ///
1125 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1126 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1127 /// the following is equivalent to the example above:
1128 ///
1129 /// ~~~{.cpp}
1130 /// auto nominal_hx =
1131 /// df.Vary("pt", "{pt*0.9, pt*1.1}", 2)
1132 /// // Same as above
1133 /// ~~~
1134 ///
1135 /// \note See also This Vary() overload for more information.
1136 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression, std::size_t nVariations,
1137 std::string_view variationName = "")
1138 {
1139 std::vector<std::string> variationTags; // auto-generated tags "0", "1", ..., one per variation
1140 variationTags.reserve(nVariations); // NOTE(review): no nVariations > 0 check here, unlike the callable-based overload -- confirm intended
1141 for (std::size_t i = 0u; i < nVariations; ++i)
1142 variationTags.emplace_back(std::to_string(i));
1143
1144 return Vary(colName, expression, std::move(variationTags), variationName); // the explicit-tags overload applies the default variation name
1145 }
1146
1147 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1148 /// \param[in] colNames set of names of the columns for which varied values are provided.
1149 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1150 /// values for the specified columns.
1151 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1152 /// `"1"`, etc.
1153 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1154 ///
1155 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1156 /// compiled. It takes an nVariations parameter instead of a list of tag names.
1157 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
1158 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
1159 /// The example below shows how Vary() is used while dealing with multiple columns.
1160 ///
1161 /// ~~~{.cpp}
1162 /// auto nominal_hx =
1163 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1164 /// .Histo1D("x", "y");
1165 ///
1166 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1167 /// hx["nominal"].Draw();
1168 /// hx["xy:0"].Draw("SAME");
1169 /// hx["xy:1"].Draw("SAME");
1170 /// ~~~
1171 ///
1172 /// ## Short-hand expression syntax
1173 ///
1174 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1175 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1176 /// the following is equivalent to the example above:
1177 ///
1178 /// ~~~{.cpp}
1179 /// auto nominal_hx =
1180 /// df.Vary({"x", "y"}, "{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1181 /// // Same as above
1182 /// ~~~
1183 ///
1184 /// or also:
1185 ///
1186 /// ~~~{.cpp}
1187 /// auto nominal_hx =
1188 /// df.Vary({"x", "y"}, R"(
1189 /// {
1190 /// {x*0.9, x*1.1}, // x variations
1191 /// {y*0.9, y*1.1} // y variations
1192 /// }
1193 /// )", 2, "xy")
1194 /// // Same as above
1195 /// ~~~
1196 ///
1197 /// \note See also This Vary() overload for more information.
1198 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1199 std::size_t nVariations, std::string_view variationName)
1200 {
1201 std::vector<std::string> variationTags; // auto-generated tags "0", "1", ..., one per variation
1202 variationTags.reserve(nVariations); // NOTE(review): no nVariations > 0 check here, unlike the callable-based overload -- confirm intended
1203 for (std::size_t i = 0u; i < nVariations; ++i)
1204 variationTags.emplace_back(std::to_string(i));
1205
1206 return Vary(colNames, expression, std::move(variationTags), variationName); // delegate to the explicit-tags overload
1207 }
1208
1209 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1210 /// \param[in] colNames set of names of the columns for which varied values are provided.
1211 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1212 /// values for the specified column.
1213 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1214 /// `"1"`, etc.
1215 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1216 /// colName is used if none is provided.
1217 ///
1218 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1219 /// is avoided.
1220 ///
1221 /// \note See also This Vary() overload for more information.
1222 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, std::string_view expression,
1223 std::size_t nVariations, std::string_view variationName)
1224 {
1225 return Vary(std::vector<std::string>(colNames), expression, nVariations, variationName); // build the vector explicitly so the vector<string> overload is chosen
1226 }
1227
1228 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
1229 /// \param[in] colNames set of names of the columns for which varied values are provided.
1230 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1231 /// values for the specified columns.
1232 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1233 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1234 ///
1235 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1236 /// compiled. The example below shows how Vary() is used while dealing with multiple columns. The tags are defined as
1237 /// `{"down", "up"}`.
1238 /// ~~~{.cpp}
1239 /// auto nominal_hx =
1240 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1241 /// .Histo1D("x", "y");
1242 ///
1243 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1244 /// hx["nominal"].Draw();
1245 /// hx["xy:down"].Draw("SAME");
1246 /// hx["xy:up"].Draw("SAME");
1247 /// ~~~
1248 ///
1249 /// ## Short-hand expression syntax
1250 ///
1251 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1252 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1253 /// the following is equivalent to the example above:
1254 ///
1255 /// ~~~{.cpp}
1256 /// auto nominal_hx =
1257 /// df.Vary({"x", "y"}, "{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1258 /// // Same as above
1259 /// ~~~
1260 ///
1261 /// or also:
1262 ///
1263 /// ~~~{.cpp}
1264 /// auto nominal_hx =
1265 /// df.Vary({"x", "y"}, R"(
1266 /// {
1267 /// {x*0.9, x*1.1}, // x variations
1268 /// {y*0.9, y*1.1} // y variations
1269 /// }
1270 /// )", {"down", "up"}, "xy")
1271 /// // Same as above
1272 /// ~~~
1273 ///
1274 /// \note See also This Vary() overload for more information.
1275 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1276 const std::vector<std::string> &variationTags, std::string_view variationName)
1277 {
1278 return JittedVaryImpl(colNames, expression, variationTags, variationName, /*isSingleColumn=*/false); // multi-column entry point: all arguments are passed through unchanged
1279 }
1280
1281 ////////////////////////////////////////////////////////////////////////////
1282 /// \brief Allow to refer to a column with a different name.
1283 /// \param[in] alias name of the column alias
1284 /// \param[in] columnName of the column to be aliased
1285 /// \return the first node of the computation graph for which the alias is available.
1286 ///
1287 /// Aliasing an alias is supported.
1288 ///
1289 /// ### Example usage:
1290 /// ~~~{.cpp}
1291 /// auto df_with_alias = df.Alias("simple_name", "very_long&complex_name!!!");
1292 /// ~~~
1293 RInterface<Proxied> Alias(std::string_view alias, std::string_view columnName)
1294 {
1295 // The symmetry with Define is clear. We want to:
1296 // - Create globally the alias and return this very node, unchanged
1297 // - Make aliases accessible based on chains and not globally
1298
1299 // Helper to find out if a name is a column
1300 auto &dsColumnNames = GetDataSource() ? GetDataSource()->GetColumnNames() : ColumnNames_t{};
1301
1302 constexpr auto where = "Alias";
1304 // If the alias name is a column name, there is a problem
1305 RDFInternal::CheckForRedefinition(where, alias, fColRegister, dsColumnNames);
1306
1307 const auto validColumnName = GetValidatedColumnNames(1, {std::string(columnName)})[0];
1308
1310 newCols.AddAlias(alias, validColumnName);
1311
1312 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
1313
1314 return newInterface;
1315 }
1316
1317 template <typename... ColumnTypes>
1318 [[deprecated("Snapshot is not any more a template. You can safely remove the template parameters.")]]
1320 Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList,
1321 const RSnapshotOptions &options = RSnapshotOptions())
1322 {
1323 return Snapshot(treename, filename, columnList, options);
1324 }
1325
1326 ////////////////////////////////////////////////////////////////////////////
1327 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1328 /// \param[in] treename The name of the output TTree or RNTuple.
1329 /// \param[in] filename The name of the output TFile.
1330 /// \param[in] columnList The list of names of the columns/branches/fields to be written.
1331 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
1332 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1333 ///
1334 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1335 /// The types of the columns are automatically inferred and do not need to be specified.
1336 ///
1337 /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.')
1338 /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot.
1339 /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also
1340 /// written out and it appears before the array in the columnList.
1341 ///
1342 /// By default, in case of TTree, TChain or RNTuple inputs, Snapshot will try to write out all top-level branches.
1343 /// For other types of inputs, all columns returned by GetColumnNames() will be written out. Systematic variations of
1344 /// columns will be included if the corresponding flag is set in RSnapshotOptions. See \ref snapshot-with-variations
1345 /// "Snapshot with Variations" for more details. If friend trees or chains are present, by default all friend
1346 /// top-level branches that have names that do not collide with names of branches in the main TTree/TChain will be
1347 /// written out. Since v6.24, Snapshot will also write out friend branches with the same names of branches in the
1348 /// main TTree/TChain with names of the form
1349 /// `<friendname>_<branchname>` in order to differentiate them from the branches in the main tree/chain.
1350 ///
1351 /// ### Writing to a sub-directory
1352 ///
1353 /// Snapshot supports writing the TTree or RNTuple in a sub-directory inside the TFile. It is sufficient to specify
1354 /// the directory path as part of the TTree or RNTuple name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree
1355 /// `t` in the sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed).
1356 ///
1357 /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
1358 /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled
1359 /// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in
1360 /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
1361 /// error out if such a "shuffled" TTree is used in a friendship.
1362 ///
1363 /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the
1364 /// requested output TTree or RNTuple to the file, with all the branches requested to preserve the dataset schema.
1365 ///
1366 /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns
1367 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1368 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1369 /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`.
1370 ///
1371 /// ### Example invocations:
1372 ///
1373 /// ~~~{.cpp}
1374 /// // No need to specify column types, they are automatically deduced thanks
1375 /// // to information coming from the data source
1376 /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"});
1377 /// ~~~
1378 ///
1379 /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in
1380 /// `RSnapshotOptions`:
1381 /// ~~~{.cpp}
1382 /// RSnapshotOptions opts;
1383 /// opts.fLazy = true;
1384 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
1385 /// ~~~
1386 ///
1387 /// To snapshot to the RNTuple data format, the `fOutputFormat` option in `RSnapshotOptions` needs to be set
1388 /// accordingly:
1389 /// ~~~{.cpp}
1390 /// RSnapshotOptions opts;
1391 /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
1392 /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts);
1393 /// ~~~
1394 ///
1395 /// Snapshot systematic variations resulting from a Vary() call (see details \ref snapshot-with-variations "here"):
1396 /// ~~~{.cpp}
1397 /// RSnapshotOptions opts;
1398 /// opts.fIncludeVariations = true;
1399 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
1400 /// ~~~
1401 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
1402 const ColumnNames_t &columnList,
1403 const RSnapshotOptions &options = RSnapshotOptions())
1404 {
1405 // like columnList but with `#var` columns removed
1406 auto colListNoPoundSizes = RDFInternal::FilterArraySizeColNames(columnList, "Snapshot");
1407 // like columnListWithoutSizeColumns but with aliases resolved
1408 auto colListNoAliases = GetValidatedColumnNames(colListNoPoundSizes.size(), colListNoPoundSizes);
1410 // like validCols but with missing size branches required by array branches added in the right positions
1411 const auto pairOfColumnLists =
1412 RDFInternal::AddSizeBranches(GetDataSource(), std::move(colListNoAliases), std::move(colListNoPoundSizes));
1413 const auto &colListNoAliasesWithSizeBranches = pairOfColumnLists.first;
1414 const auto &colListWithAliasesAndSizeBranches = pairOfColumnLists.second;
1415
1416 const auto fullTreeName = treename;
1417 const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName);
1418 treename = parsedTreePath.fTreeName;
1419 const auto &dirname = parsedTreePath.fDirName;
1420
1422
1424
1425 auto retrieveTypeID = [](const std::string &colName, const std::string &colTypeName,
1426 bool isRNTuple = false) -> const std::type_info * {
1427 try {
1428 return &ROOT::Internal::RDF::TypeName2TypeID(colTypeName);
1429 } catch (const std::runtime_error &err) {
1430 if (isRNTuple)
1432
1433 if (std::string(err.what()).find("Cannot extract type_info of type") != std::string::npos) {
1434 // We could not find RTTI for this column, thus we cannot write it out at the moment.
1435 std::string trueTypeName{colTypeName};
1436 if (colTypeName.rfind("CLING_UNKNOWN_TYPE", 0) == 0)
1437 trueTypeName = colTypeName.substr(19);
1438 std::string msg{"No runtime type information is available for column \"" + colName +
1439 "\" with type name \"" + trueTypeName +
1440 "\". Thus, it cannot be written to disk with Snapshot. Make sure to generate and load "
1441 "ROOT dictionaries for the type of this column."};
1442
1443 throw std::runtime_error(msg);
1444 } else {
1445 throw;
1446 }
1447 }
1448 };
1449
1451
1452 if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) {
1453 // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one
1454 // without a data source for now, and set it once the actual data source can be created (i.e., after
1455 // writing the RNTuple).
1456 auto newRDF = std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(colListNoPoundSizes));
1457
1458 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1459 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1460 options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false});
1461
1462 auto &&nColumns = colListNoAliasesWithSizeBranches.size();
1463 const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches);
1464
1465 const auto nSlots = fLoopManager->GetNSlots();
1466 std::vector<const std::type_info *> colTypeIDs;
1467 colTypeIDs.reserve(nColumns);
1468 for (decltype(nColumns) i{}; i < nColumns; i++) {
1469 const auto &colName = validColumnNames[i];
1470 const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName(
1471 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
1472 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName, /*isRNTuple*/ true);
1473 colTypeIDs.push_back(colTypeID);
1474 }
1475 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
1476 // source
1477 CheckAndFillDSColumns(validColumnNames, colTypeIDs);
1478
1479 auto action =
1480 RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs);
1481 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
1482 } else {
1483 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" &&
1484 options.fOutputFormat == ESnapshotOutputFormat::kDefault) {
1485 Warning("Snapshot",
1486 "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you "
1487 "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in "
1488 "RSnapshotOptions. Note that this current default behaviour might change in the future.");
1489 }
1490
1491 // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset
1492 // has actually been created and written to TFile, i.e. at the end of the Snapshot execution.
1493 auto newRDF = std::make_shared<RInterface<RLoopManager>>(
1494 std::make_shared<RLoopManager>(colListNoAliasesWithSizeBranches));
1495
1496 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
1497 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
1498 options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations});
1499
1500 auto &&nColumns = colListNoAliasesWithSizeBranches.size();
1501 const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches);
1502
1503 const auto nSlots = fLoopManager->GetNSlots();
1504 std::vector<const std::type_info *> colTypeIDs;
1505 colTypeIDs.reserve(nColumns);
1506 for (decltype(nColumns) i{}; i < nColumns; i++) {
1507 const auto &colName = validColumnNames[i];
1508 const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName(
1509 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
1510 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName);
1511 colTypeIDs.push_back(colTypeID);
1512 }
1513 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
1514 // source
1515 CheckAndFillDSColumns(validColumnNames, colTypeIDs);
1516
1517 auto action =
1518 RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs);
1519 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
1520 }
1521
1522 if (!options.fLazy)
1523 *resPtr;
1524 return resPtr;
1525 }
1526
1527 // clang-format off
1528 ////////////////////////////////////////////////////////////////////////////
1529 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1530 /// \param[in] treename The name of the output TTree or RNTuple.
1531 /// \param[in] filename The name of the output TFile.
1532 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1533 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple
1534 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1535 ///
1536 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1537 /// The types of the columns are automatically inferred and do not need to be specified.
1538 ///
1539 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
1540 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
1541 std::string_view columnNameRegexp = "",
1542 const RSnapshotOptions &options = RSnapshotOptions())
1543 {
1544 const auto definedColumns = fColRegister.GenerateColumnNames();
1545
1547 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those
1548 ColumnNames_t dsColumnsWithoutSizeColumns;
1549 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1550 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
1551 ColumnNames_t columnNames;
1552 columnNames.reserve(definedColumns.size() + dsColumnsWithoutSizeColumns.size());
1553 columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end());
1554 columnNames.insert(columnNames.end(), dsColumnsWithoutSizeColumns.begin(), dsColumnsWithoutSizeColumns.end());
1555
1556 // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd.
1557 // RemoveDuplicates should preserve ordering of the columns: it might be meaningful.
1558 RDFInternal::RemoveDuplicates(columnNames);
1559
1560 std::vector<std::string> selectedColumns;
1561 try {
1562 selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
1563 }
1564 catch (const std::runtime_error &e){
1565 // No columns were found, try again but consider all input data source columns
1566 if (auto ds = GetDataSource())
1567 selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot");
1568 else
1569 throw e;
1570 }
1571
1572 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
1573 RDFInternal::RemoveRNTupleSubfields(selectedColumns);
1574 }
1575
1576 return Snapshot(treename, filename, selectedColumns, options);
1577 }
1578 // clang-format on
1579
1580 // clang-format off
1581 ////////////////////////////////////////////////////////////////////////////
1582 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
1583 /// \param[in] treename The name of the output TTree or RNTuple.
1584 /// \param[in] filename The name of the output TFile.
1585 /// \param[in] columnList The list of names of the columns/branches to be written.
1586 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
1587 /// \return a `RDataFrame` that wraps the snapshotted dataset.
1588 ///
1589 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
1590 /// The types of the columns are automatically inferred and do not need to be specified.
1591 ///
1592 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
1593 RResultPtr<RInterface<RLoopManager>> Snapshot(std::string_view treename, std::string_view filename,
1594 std::initializer_list<std::string> columnList,
1595 const RSnapshotOptions &options = RSnapshotOptions())
1596 {
1597 ColumnNames_t selectedColumns(columnList);
1598 return Snapshot(treename, filename, selectedColumns, options);
1599 }
1600 // clang-format on
1601
1602 ////////////////////////////////////////////////////////////////////////////
1603 /// \brief Save selected columns in memory.
1604 /// \tparam ColumnTypes variadic list of branch/column types.
1605 /// \param[in] columnList columns to be cached in memory.
1606 /// \return a `RDataFrame` that wraps the cached dataset.
1607 ///
1608 /// This action returns a new `RDataFrame` object, completely detached from
1609 /// the originating `RDataFrame`. The new dataframe only contains the cached
1610 /// columns and stores their content in memory for fast, zero-copy subsequent access.
1611 ///
1612 /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that
1613 /// fits in memory and that will be accessed many times.
1614 ///
1615 /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns
1616 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
1617 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
1618 /// Alias(): `df.Alias("nbar", "#bar").Cache<std::size_t>({"nbar"})`.
1619 ///
1620 /// ### Example usage:
1621 ///
1622 /// **Types and columns specified:**
1623 /// ~~~{.cpp}
1624 /// auto cache_some_cols_df = df.Cache<double, MyClass, int>({"col0", "col1", "col2"});
1625 /// ~~~
1626 ///
1627 /// **Types inferred and columns specified (this invocation relies on jitting):**
1628 /// ~~~{.cpp}
1629 /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"});
1630 /// ~~~
1631 ///
1632 /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):**
1633 /// ~~~{.cpp}
1634 /// auto cache_all_cols_df = df.Cache(myRegexp);
1635 /// ~~~
1636 template <typename... ColumnTypes>
1638 {
1639 auto staticSeq = std::make_index_sequence<sizeof...(ColumnTypes)>();
1640 return CacheImpl<ColumnTypes...>(columnList, staticSeq);
1641 }
1642
1643 ////////////////////////////////////////////////////////////////////////////
1644 /// \brief Save selected columns in memory.
1645 /// \param[in] columnList columns to be cached in memory
1646 /// \return a `RDataFrame` that wraps the cached dataset.
1647 ///
1648 /// See the previous overloads for more information.
1650 {
1651 // Early return: if the list of columns is empty, just return an empty RDF
1652 // If we proceed, the jitted call will not compile!
1653 if (columnList.empty()) {
1654 auto nEntries = *this->Count();
1655 RInterface<RLoopManager> emptyRDF(std::make_shared<RLoopManager>(nEntries));
1656 return emptyRDF;
1657 }
1658
1659 std::stringstream cacheCall;
1660 auto upcastNode = RDFInternal::UpcastNode(fProxiedPtr);
1661 RInterface<TTraits::TakeFirstParameter_t<decltype(upcastNode)>> upcastInterface(fProxiedPtr, *fLoopManager,
1662 fColRegister);
1663 // build a string equivalent to
1664 // "(RInterface<nodetype*>*)(this)->Cache<Ts...>(*(ColumnNames_t*)(&columnList))"
1665 RInterface<RLoopManager> resRDF(std::make_shared<ROOT::Detail::RDF::RLoopManager>(0));
1666 cacheCall << "*reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RLoopManager>*>("
1668 << ") = reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RNodeBase>*>("
1669 << RDFInternal::PrettyPrintAddr(&upcastInterface) << ")->Cache<";
1670
1671 const auto columnListWithoutSizeColumns = RDFInternal::FilterArraySizeColNames(columnList, "Cache");
1672
1673 const auto validColumnNames =
1674 GetValidatedColumnNames(columnListWithoutSizeColumns.size(), columnListWithoutSizeColumns);
1675 const auto colTypes =
1676 GetValidatedArgTypes(validColumnNames, fColRegister, nullptr, GetDataSource(), "Cache", /*vector2RVec=*/false);
1677 for (const auto &colType : colTypes)
1678 cacheCall << colType << ", ";
1679 if (!columnListWithoutSizeColumns.empty())
1680 cacheCall.seekp(-2, cacheCall.cur); // remove the last ",
1681 cacheCall << ">(*reinterpret_cast<std::vector<std::string>*>(" // vector<string> should be ColumnNames_t
1682 << RDFInternal::PrettyPrintAddr(&columnListWithoutSizeColumns) << "));";
1683
1684 // book the code to jit with the RLoopManager and trigger the event loop
1685 fLoopManager->ToJitExec(cacheCall.str());
1686 fLoopManager->Jit();
1687
1688 return resRDF;
1689 }
1690
1691 ////////////////////////////////////////////////////////////////////////////
1692 /// \brief Save selected columns in memory.
1693 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
1694 /// \return a `RDataFrame` that wraps the cached dataset.
1695 ///
1696 /// The existing columns are matched against the regular expression. If the string provided
1697 /// is empty, all columns are selected. See the previous overloads for more information.
1698 RInterface<RLoopManager> Cache(std::string_view columnNameRegexp = "")
1699 {
1700 const auto definedColumns = fColRegister.GenerateColumnNames();
1701 const auto dsColumns = GetDataSource() ? GetDataSource()->GetColumnNames() : ColumnNames_t{};
1702 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those
1703 ColumnNames_t dsColumnsWithoutSizeColumns;
1704 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
1705 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
1706 ColumnNames_t columnNames;
1707 columnNames.reserve(definedColumns.size() + dsColumns.size());
1708 columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end());
1709 columnNames.insert(columnNames.end(), dsColumns.begin(), dsColumns.end());
1710 const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Cache");
1711 return Cache(selectedColumns);
1712 }
1713
1714 ////////////////////////////////////////////////////////////////////////////
1715 /// \brief Save selected columns in memory.
1716 /// \param[in] columnList columns to be cached in memory.
1717 /// \return a `RDataFrame` that wraps the cached dataset.
1718 ///
1719 /// See the previous overloads for more information.
1720 RInterface<RLoopManager> Cache(std::initializer_list<std::string> columnList)
1721 {
1722 ColumnNames_t selectedColumns(columnList);
1723 return Cache(selectedColumns);
1724 }
1725
1726 // clang-format off
1727 ////////////////////////////////////////////////////////////////////////////
1728 /// \brief Creates a node that filters entries based on range: [begin, end).
1729 /// \param[in] begin Initial entry number considered for this range.
1730 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1731 /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0.
1732 /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries.
1733 ///
1734 /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset.
1735 /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported.
1736 ///
1737 /// ### Example usage:
1738 /// ~~~{.cpp}
1739 /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries
1740 /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards
1741 /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3
1742 /// ~~~
1743 // clang-format on
1744 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int begin, unsigned int end, unsigned int stride = 1)
1745 {
1746 // check invariants
1747 if (stride == 0 || (end != 0 && end < begin))
1748 throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin.");
1749 CheckIMTDisabled("Range");
1750
1751 using Range_t = RDFDetail::RRange<Proxied>;
1752 auto rangePtr = std::make_shared<Range_t>(begin, end, stride, fProxiedPtr);
1753 RInterface<RDFDetail::RRange<Proxied>> newInterface(std::move(rangePtr), *fLoopManager, fColRegister);
1754 return newInterface;
1755 }
1756
1757 // clang-format off
1758 ////////////////////////////////////////////////////////////////////////////
1759 /// \brief Creates a node that filters entries based on range.
1760 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1761 /// \return a node of the computation graph for which the range is defined.
1762 ///
1763 /// See the other Range overload for a detailed description.
1764 // clang-format on
1765 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int end) { return Range(0, end, 1); }
1766
1767 // clang-format off
1768 ////////////////////////////////////////////////////////////////////////////
1769 /// \brief Execute a user-defined function on each entry (*instant action*).
1770 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1771 /// \param[in] columns Names of the columns/branches in input to the user function.
1772 ///
1773 /// The callable `f` is invoked once per entry. This is an *instant action*:
1774 /// upon invocation, an event loop as well as execution of all scheduled actions
1775 /// is triggered.
1776 /// Users are responsible for the thread-safety of this callable when executing
1777 /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT).
1778 ///
1779 /// ### Example usage:
1780 /// ~~~{.cpp}
1781 /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"});
1782 /// ~~~
1783 // clang-format on
1784 template <typename F>
1785 void Foreach(F f, const ColumnNames_t &columns = {})
1786 {
1787 using arg_types = typename TTraits::CallableTraits<decltype(f)>::arg_types_nodecay;
1788 using ret_type = typename TTraits::CallableTraits<decltype(f)>::ret_type;
1789 ForeachSlot(RDFInternal::AddSlotParameter<ret_type>(f, arg_types()), columns);
1790 }
1791
1792 // clang-format off
1793 ////////////////////////////////////////////////////////////////////////////
1794 /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*).
1795 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
1796 /// \param[in] columns Names of the columns/branches in input to the user function.
1797 ///
1798 /// Same as `Foreach`, but the user-defined function takes an extra
1799 /// `unsigned int` as its first parameter, the *processing slot index*.
1800 /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`,
1801 /// for each thread of execution.
1802 /// This is meant as a helper in writing thread-safe `Foreach`
1803 /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`.
1804 /// The user-defined processing callable is able to follow different
1805 /// *streams of processing* indexed by the first parameter.
1806 /// `ForeachSlot` works just as well with single-thread execution: in that
1807 /// case `slot` will always be `0`.
1808 ///
1809 /// ### Example usage:
1810 /// ~~~{.cpp}
1811 /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"});
1812 /// ~~~
1813 // clang-format on
1814 template <typename F>
1815 void ForeachSlot(F f, const ColumnNames_t &columns = {})
1816 {
1818 constexpr auto nColumns = ColTypes_t::list_size;
1819
1820 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
1821 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
1822
1823 using Helper_t = RDFInternal::ForeachSlotHelper<F>;
1825
1826 auto action = std::make_unique<Action_t>(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister);
1827
1828 fLoopManager->Run();
1829 }
1830
1831 // clang-format off
1832 ////////////////////////////////////////////////////////////////////////////
1833 /// \brief Execute a user-defined reduce operation on the values of a column.
1834 /// \tparam F The type of the reduce callable. Automatically deduced.
1835 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1836 /// \param[in] f A callable with signature `T(T,T)`
1837 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1838 /// \return the reduced quantity wrapped in a ROOT::RDF::RResultPtr.
1839 ///
1840 /// A reduction takes two values of a column and merges them into one (e.g.
1841 /// by summing them, taking the maximum, etc). This action performs the
1842 /// specified reduction operation on all processed column values, returning
1843 /// a single value of the same type. The callable f must satisfy the general
1844 /// requirements of a *processing function* besides having signature `T(T,T)`
1845 /// where `T` is the type of column columnName.
1846 ///
1847 /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a
1848 /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific
1849 /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this
1850 /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce`
1851 /// overload.
1852 ///
1853 /// ### Example usage:
1854 /// ~~~{.cpp}
1855 /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol");
1856 /// ~~~
1857 ///
1858 /// This action is *lazy*: upon invocation of this method the calculation is
1859 /// booked but not executed. Also see RResultPtr.
1860 // clang-format on
1861 template <typename F, typename T = typename TTraits::CallableTraits<F>::ret_type>
1862 RResultPtr<T> Reduce(F f, std::string_view columnName = "")
1863 {
1864 static_assert(
1865 std::is_default_constructible<T>::value,
1866 "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)");
1867 return Reduce(std::move(f), columnName, T());
1868 }
1869
1870 ////////////////////////////////////////////////////////////////////////////
1871 /// \brief Execute a user-defined reduce operation on the values of a column.
1872 /// \tparam F The type of the reduce callable. Automatically deduced.
1873 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
1874 /// \param[in] f A callable with signature `T(T,T)`
1875 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
1876 /// \param[in] redIdentity The reduced object of each thread is initialized to this value.
1877 /// \return the reduced quantity wrapped in a RResultPtr.
1878 ///
1879 /// ### Example usage:
1880 /// ~~~{.cpp}
1881 /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42);
1882 /// ~~~
1883 /// See the description of the first Reduce overload for more information.
1884 template <typename F, typename T = typename TTraits::CallableTraits<F>::ret_type>
1885 RResultPtr<T> Reduce(F f, std::string_view columnName, const T &redIdentity)
1886 {
1887 return Aggregate(f, f, columnName, redIdentity);
1888 }
1889
1890 ////////////////////////////////////////////////////////////////////////////
1891 /// \brief Return the number of entries processed (*lazy action*).
1892 /// \return the number of entries wrapped in a RResultPtr.
1893 ///
1894 /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`).
1895 /// This action is *lazy*: upon invocation of this method the calculation is
1896 /// booked but not executed. Also see RResultPtr.
1897 ///
1898 /// ### Example usage:
1899 /// ~~~{.cpp}
1900 /// auto nEntriesAfterCuts = myFilteredDf.Count();
1901 /// ~~~
1902 ///
// NOTE(review): listing line 1903 (the declaration of this method) is absent from this view; restore it from upstream.
1904 {
1905 const auto nSlots = fLoopManager->GetNSlots();
// Counter, initialized to zero, that is handed both to the helper and to the returned result pointer.
1906 auto cSPtr = std::make_shared<ULong64_t>(0);
1907 using Helper_t = RDFInternal::CountHelper;
// NOTE(review): listing line 1908 is absent from this view; presumably it declares the Action_t alias used below.
1909 auto action = std::make_unique<Action_t>(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr,
// NOTE(review): listing line 1910 is absent from this view; presumably it holds the last argument(s) and closes the call.
1911 return MakeResultPtr(cSPtr, *fLoopManager, std::move(action));
1912 }
1913
1914 ////////////////////////////////////////////////////////////////////////////
1915 /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default).
1916 /// \tparam T The type of the column.
1917 /// \tparam COLL The type of collection used to store the values.
1918 /// \param[in] column The name of the column to collect the values of.
1919 /// \return the content of the selected column wrapped in a RResultPtr.
1920 ///
1921 /// The collection type to be specified for C-style array columns is `RVec<T>`:
1922 /// in this case the returned collection is a `std::vector<RVec<T>>`.
1923 /// ### Example usage:
1924 /// ~~~{.cpp}
1925 /// // In this case intCol is a std::vector<int>
1926 /// auto intCol = rdf.Take<int>("integerColumn");
1927 /// // Same content as above but in this case taken as a RVec<int>
1928 /// auto intColAsRVec = rdf.Take<int, RVec<int>>("integerColumn");
1929 /// // In this case intCol is a std::vector<RVec<int>>, a collection of collections
1930 /// auto cArrayIntCol = rdf.Take<RVec<int>>("cArrayInt");
1931 /// ~~~
1932 /// This action is *lazy*: upon invocation of this method the calculation is
1933 /// booked but not executed. Also see RResultPtr.
1934 template <typename T, typename COLL = std::vector<T>>
1935 RResultPtr<COLL> Take(std::string_view column = "")
1936 {
// An empty name is turned into an empty column list before validation.
1937 const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)});
1938
1939 const auto validColumnNames = GetValidatedColumnNames(1, columns);
1940 CheckAndFillDSColumns(validColumnNames, TTraits::TypeList<T>());
1941
1942 using Helper_t = RDFInternal::TakeHelper<T, T, COLL>;
// NOTE(review): listing line 1943 is absent from this view; presumably it declares the Action_t alias used below.
// The collection is handed both to the helper and to the returned result pointer.
1944 auto valuesPtr = std::make_shared<COLL>();
1945 const auto nSlots = fLoopManager->GetNSlots();
1946
1947 auto action =
1948 std::make_unique<Action_t>(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister);
1949 return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action));
1950 }
1951
1952 ////////////////////////////////////////////////////////////////////////////
1953 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1954 /// \tparam V The type of the column used to fill the histogram.
1955 /// \param[in] model The returned histogram will be constructed using this as a model.
1956 /// \param[in] vName The name of the column that will fill the histogram.
1957 /// \return the monodimensional histogram wrapped in a RResultPtr.
1958 ///
1959 /// Columns can be of a container type (e.g. `std::vector<double>`), in which case the histogram
1960 /// is filled with each one of the elements of the container. In case multiple columns of container type
1961 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1962 /// possibly different lengths between events).
1963 /// This action is *lazy*: upon invocation of this method the calculation is
1964 /// booked but not executed. Also see RResultPtr.
1965 ///
1966 /// ### Example usage:
1967 /// ~~~{.cpp}
1968 /// // Deduce column type (this invocation needs jitting internally)
1969 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1970 /// // Explicit column type
1971 /// auto myHist2 = myDf.Histo1D<float>({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1972 /// ~~~
1973 ///
1974 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1975 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1976 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1977 template <typename V = RDFDetail::RInferredType>
1978 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "")
1979 {
// An empty name is turned into an empty column list before validation.
1980 const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)});
1981
1982 const auto validatedColumns = GetValidatedColumnNames(1, userColumns);
1983
1984 std::shared_ptr<::TH1D> h(nullptr);
1985 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
1986 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1987 h = model.GetHistogram();
1988 }
1989
// A model whose x axis has identical lower and upper limits is made extendable on all axes.
1990 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1991 h->SetCanExtend(::TH1::kAllAxes);
// NOTE(review): listing line 1992 is absent from this view; presumably it is the return statement that books the action.
1993 }
1994
1995 ////////////////////////////////////////////////////////////////////////////
1996 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1997 /// \tparam V The type of the column used to fill the histogram.
1998 /// \param[in] vName The name of the column that will fill the histogram.
1999 /// \return the monodimensional histogram wrapped in a RResultPtr.
2000 ///
2001 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
2002 /// The "name" and "title" strings are built starting from the input column name.
2003 /// See the description of the first Histo1D() overload for more details.
2004 ///
2005 /// ### Example usage:
2006 /// ~~~{.cpp}
2007 /// // Deduce column type (this invocation needs jitting internally)
2008 /// auto myHist1 = myDf.Histo1D("myColumn");
2009 /// // Explicit column type
2010 /// auto myHist2 = myDf.Histo1D<float>("myColumn");
2011 /// ~~~
2012 template <typename V = RDFDetail::RInferredType>
2013 RResultPtr<::TH1D> Histo1D(std::string_view vName)
2014 {
2015 const auto h_name = std::string(vName);
2016 const auto h_title = h_name + ";" + h_name + ";count";
2017 return Histo1D<V>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName);
2018 }
2019
2020 ////////////////////////////////////////////////////////////////////////////
2021 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
2022 /// \tparam V The type of the column used to fill the histogram.
2023 /// \tparam W The type of the column used as weights.
2024 /// \param[in] model The returned histogram will be constructed using this as a model.
2025 /// \param[in] vName The name of the column that will fill the histogram.
2026 /// \param[in] wName The name of the column that will provide the weights.
2027 /// \return the monodimensional histogram wrapped in a RResultPtr.
2028 ///
2029 /// See the description of the first Histo1D() overload for more details.
2030 ///
2031 /// ### Example usage:
2032 /// ~~~{.cpp}
2033 /// // Deduce column type (this invocation needs jitting internally)
2034 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
2035 /// // Explicit column type
2036 /// auto myHist2 = myDf.Histo1D<float, int>({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
2037 /// ~~~
2038 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2039 RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
2040 {
2041 const std::vector<std::string_view> columnViews = {vName, wName};
// If either name is empty, an empty column list is used instead of the two names.
2042 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2043 ? ColumnNames_t()
2044 : ColumnNames_t(columnViews.begin(), columnViews.end());
2045 std::shared_ptr<::TH1D> h(nullptr);
2046 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
2047 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2048 h = model.GetHistogram();
2049 }
2050
// A model whose x axis has identical lower and upper limits is made extendable on all axes.
2051 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
2052 h->SetCanExtend(::TH1::kAllAxes);
// NOTE(review): listing line 2053 is absent from this view; presumably it is the return statement that books the action.
2054 }
2055
2056 ////////////////////////////////////////////////////////////////////////////
2057 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
2058 /// \tparam V The type of the column used to fill the histogram.
2059 /// \tparam W The type of the column used as weights.
2060 /// \param[in] vName The name of the column that will fill the histogram.
2061 /// \param[in] wName The name of the column that will provide the weights.
2062 /// \return the monodimensional histogram wrapped in a RResultPtr.
2063 ///
2064 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
2065 /// The "name" and "title" strings are built starting from the input column names.
2066 /// See the description of the first Histo1D() overload for more details.
2067 ///
2068 /// ### Example usage:
2069 /// ~~~{.cpp}
2070 /// // Deduce column types (this invocation needs jitting internally)
2071 /// auto myHist1 = myDf.Histo1D("myValue", "myweight");
2072 /// // Explicit column types
2073 /// auto myHist2 = myDf.Histo1D<float, int>("myValue", "myweight");
2074 /// ~~~
2075 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2076 RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName)
2077 {
2078 // We build name and title based on the value and weight column names
2079 std::string str_vName{vName};
2080 std::string str_wName{wName};
2081 const auto h_name = str_vName + "_weighted_" + str_wName;
2082 const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName;
2083 return Histo1D<V, W>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName);
2084 }
2085
2086 ////////////////////////////////////////////////////////////////////////////
2087 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
2088 /// \tparam V The type of the column used to fill the histogram.
2089 /// \tparam W The type of the column used as weights.
2090 /// \param[in] model The returned histogram will be constructed using this as a model.
2091 /// \return the monodimensional histogram wrapped in a RResultPtr.
2092 ///
2093 /// This overload will use the first two default columns as column names.
2094 /// See the description of the first Histo1D() overload for more details.
2095 template <typename V, typename W>
2096 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.})
2097 {
2098 return Histo1D<V, W>(model, "", "");
2099 }
2100
2101 ////////////////////////////////////////////////////////////////////////////
2102 /// \brief Fill and return a two-dimensional histogram (*lazy action*).
2103 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
2104 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
2105 /// \param[in] model The returned histogram will be constructed using this as a model.
2106 /// \param[in] v1Name The name of the column that will fill the x axis.
2107 /// \param[in] v2Name The name of the column that will fill the y axis.
2108 /// \return the bidimensional histogram wrapped in a RResultPtr.
2109 ///
2110 /// Columns can be of a container type (e.g. std::vector<double>), in which case the histogram
2111 /// is filled with each one of the elements of the container. In case multiple columns of container type
2112 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
2113 /// possibly different lengths between events).
2114 /// This action is *lazy*: upon invocation of this method the calculation is
2115 /// booked but not executed. Also see RResultPtr.
2116 ///
2117 /// ### Example usage:
2118 /// ~~~{.cpp}
2119 /// // Deduce column types (this invocation needs jitting internally)
2120 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
2121 /// // Explicit column types
2122 /// auto myHist2 = myDf.Histo2D<float, float>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
2123 /// ~~~
2124 ///
2125 ///
2126 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
2127 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2128 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2129 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
2130 RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2131 {
2132 std::shared_ptr<::TH2D> h(nullptr);
2133 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
2134 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2135 h = model.GetHistogram();
2136 }
// Unlike the one-dimensional case, a model without axis limits is rejected with an exception.
2137 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
2138 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
2139 }
2140 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// If any name is empty, an empty column list is used instead of the given names.
2141 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2142 ? ColumnNames_t()
2143 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 2144 is absent from this view; presumably it is the return statement that books the action.
2145 }
2146
2147 ////////////////////////////////////////////////////////////////////////////
2148 /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*).
2149 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
2150 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
2151 /// \tparam W The type of the column used for the weights of the histogram.
2152 /// \param[in] model The returned histogram will be constructed using this as a model.
2153 /// \param[in] v1Name The name of the column that will fill the x axis.
2154 /// \param[in] v2Name The name of the column that will fill the y axis.
2155 /// \param[in] wName The name of the column that will provide the weights.
2156 /// \return the bidimensional histogram wrapped in a RResultPtr.
2157 ///
2158 /// This action is *lazy*: upon invocation of this method the calculation is
2159 /// booked but not executed. Also see RResultPtr.
2160 ///
2161 /// ### Example usage:
2162 /// ~~~{.cpp}
2163 /// // Deduce column types (this invocation needs jitting internally)
2164 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2165 /// // Explicit column types
2166 /// auto myHist2 = myDf.Histo2D<float, float, double>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
2167 /// ~~~
2168 ///
2169 /// See the documentation of the first Histo2D() overload for more details.
2170 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2171 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 2172 is absent from this view; presumably it carries the return type of this method.
2173 Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2174 {
2175 std::shared_ptr<::TH2D> h(nullptr);
2176 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
2177 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2178 h = model.GetHistogram();
2179 }
// A model without axis limits is rejected with an exception.
2180 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
2181 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
2182 }
2183 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// If any name is empty, an empty column list is used instead of the given names.
2184 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2185 ? ColumnNames_t()
2186 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 2187 is absent from this view; presumably it is the return statement that books the action.
2188 }
2189
// Convenience overload: forwards `model` to the weighted Histo2D overload with empty names for the
// x, y and weight columns.
2190 template <typename V1, typename V2, typename W>
// NOTE(review): listing line 2191 (return type, name and parameter list of this method) is absent from this view.
2192 {
2193 return Histo2D<V1, V2, W>(model, "", "", "");
2194 }
2195
2196 ////////////////////////////////////////////////////////////////////////////
2197 /// \brief Fill and return a three-dimensional histogram (*lazy action*).
2198 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2199 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2200 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2201 /// \param[in] model The returned histogram will be constructed using this as a model.
2202 /// \param[in] v1Name The name of the column that will fill the x axis.
2203 /// \param[in] v2Name The name of the column that will fill the y axis.
2204 /// \param[in] v3Name The name of the column that will fill the z axis.
2205 /// \return the tridimensional histogram wrapped in a RResultPtr.
2206 ///
2207 /// This action is *lazy*: upon invocation of this method the calculation is
2208 /// booked but not executed. Also see RResultPtr.
2209 ///
2210 /// ### Example usage:
2211 /// ~~~{.cpp}
2212 /// // Deduce column types (this invocation needs jitting internally)
2213 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2214 /// "myValueX", "myValueY", "myValueZ");
2215 /// // Explicit column types
2216 /// auto myHist2 = myDf.Histo3D<double, double, float>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2217 /// "myValueX", "myValueY", "myValueZ");
2218 /// ~~~
2219 /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D
2220 /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in
2221 /// the RDataFrame description.
2222 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
2223 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2224 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2225 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2226 typename V3 = RDFDetail::RInferredType>
2227 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "",
2228 std::string_view v3Name = "")
2229 {
2230 std::shared_ptr<::TH3D> h(nullptr);
2231 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
2232 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2233 h = model.GetHistogram();
2234 }
// A model without axis limits is rejected with an exception.
2235 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2236 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2237 }
2238 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// If any name is empty, an empty column list is used instead of the given names.
2239 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2240 ? ColumnNames_t()
2241 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 2242 is absent from this view; presumably it is the return statement that books the action.
2243 }
2244
2245 ////////////////////////////////////////////////////////////////////////////
2246 /// \brief Fill and return a weighted three-dimensional histogram (*lazy action*).
2247 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2248 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2249 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2250 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
2251 /// \param[in] model The returned histogram will be constructed using this as a model.
2252 /// \param[in] v1Name The name of the column that will fill the x axis.
2253 /// \param[in] v2Name The name of the column that will fill the y axis.
2254 /// \param[in] v3Name The name of the column that will fill the z axis.
2255 /// \param[in] wName The name of the column that will provide the weights.
2256 /// \return the tridimensional histogram wrapped in a RResultPtr.
2257 ///
2258 /// This action is *lazy*: upon invocation of this method the calculation is
2259 /// booked but not executed. Also see RResultPtr.
2260 ///
2261 /// ### Example usage:
2262 /// ~~~{.cpp}
2263 /// // Deduce column types (this invocation needs jitting internally)
2264 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2265 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2266 /// // Explicit column types
2267 /// using d_t = double;
2268 /// auto myHist2 = myDf.Histo3D<d_t, d_t, float, d_t>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
2269 /// "myValueX", "myValueY", "myValueZ", "myWeight");
2270 /// ~~~
2271 ///
2272 ///
2273 /// See the documentation of the first Histo3D() overload for more details.
2274 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2275 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2276 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name,
2277 std::string_view v3Name, std::string_view wName)
2278 {
2279 std::shared_ptr<::TH3D> h(nullptr);
2280 {
// The guard lives only while the histogram is created from the model
// (presumably to silence messages below kError emitted during construction; confirm).
2281 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2282 h = model.GetHistogram();
2283 }
// A model without axis limits is rejected with an exception.
2284 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
2285 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
2286 }
2287 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
// If any name is empty, an empty column list is used instead of the given names.
2288 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2289 ? ColumnNames_t()
2290 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 2291 is absent from this view; presumably it is the return statement that books the action.
2292 }
2293
// Convenience overload: forwards `model` to the weighted Histo3D overload with empty names for the
// x, y, z and weight columns.
2294 template <typename V1, typename V2, typename V3, typename W>
// NOTE(review): listing line 2295 (return type, name and parameter list of this method) is absent from this view.
2296 {
2297 return Histo3D<V1, V2, V3, W>(model, "", "", "", "");
2298 }
2299
2300 ////////////////////////////////////////////////////////////////////////////
2301 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2302 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
2303 /// present.
2304 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
2305 /// object.
2306 /// \param[in] model The returned histogram will be constructed using this as a model.
2307 /// \param[in] columnList
2308 /// A list containing the names of the columns that will be passed when calling `Fill`.
2309 /// \param[in] wName The name of the column that will provide the weights.
2310 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2311 ///
2312 /// This action is *lazy*: upon invocation of this method the calculation is
2313 /// booked but not executed. See RResultPtr documentation.
2314 ///
2315 /// ### Example usage:
2316 /// ~~~{.cpp}
2317 /// auto myFilledObj = myDf.HistoND<float, float, float, float>({"name","title", 4,
2318 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2319 /// {"col0", "col1", "col2", "col3"});
2320 /// ~~~
2321 ///
2322 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2323 /// argument `wName`: `HistoND(model, cols, weightCol)`.
2324 ///
2325 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
2326 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2327 {
2328 std::shared_ptr<::THnD> h(nullptr);
2329 {
2330 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2331 h = model.GetHistogram();
2332 const auto hDims = h->GetNdimensions();
2333 decltype(hDims) nCols = columnList.size();
2334
2335 if (!wName.empty() && nCols == hDims + 1)
2336 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2337 "input columns contains one column more than the number of dimensions of the "
2338 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
2339
2340 if (nCols == hDims + 1)
2341 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
2342 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
2343
2344 if (!wName.empty() || nCols == hDims + 1)
2345 h->Sumw2();
2346
2347 if (nCols != hDims + 1 && nCols != hDims)
2348 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2349 }
2350
2351 if (!wName.empty()) {
2352 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2353 // passed arguments is one more the number of dimensions of the histogram.
2354 ColumnNames_t userColumns = columnList;
2355 userColumns.push_back(std::string{wName});
2356 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(userColumns, h, h,
2357 fProxiedPtr);
2358 }
2359 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(columnList, h, h,
2360 fProxiedPtr);
2361 }
2362
2363 ////////////////////////////////////////////////////////////////////////////
2364 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
2365 /// \param[in] model The returned histogram will be constructed using this as a model.
2366 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2367 /// \param[in] wName The name of the column that will provide the weights.
2368 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2369 ///
2370 /// This action is *lazy*: upon invocation of this method the calculation is
2371 /// booked but not executed. Also see RResultPtr.
2372 ///
2373 /// ### Example usage:
2374 /// ~~~{.cpp}
2375 /// auto myFilledObj = myDf.HistoND({"name","title", 4,
2376 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2377 /// {"col0", "col1", "col2", "col3"});
2378 /// ~~~
2379 ///
2380 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2381 /// argument `wName`: `HistoND(model, cols, weightCol)`.
2382 ///
2383 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2384 {
2385 std::shared_ptr<::THnD> h(nullptr);
2386 {
2387 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2388 h = model.GetHistogram();
2389 const auto hDims = h->GetNdimensions();
2390 decltype(hDims) nCols = columnList.size();
2391
// Weights given twice: once as the extra last column and once via wName.
2392 if (!wName.empty() && nCols == hDims + 1)
2393 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2394 "input columns contains one column more than the number of dimensions of the "
2395 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
2396
// NOTE(review): this warning is issued while `iel` is still alive; if RIgnoreErrorLevelRAII raises the
// ignore level to kError, the deprecation message is presumably suppressed. Confirm and, if so, narrow
// the guarded scope to the GetHistogram() call as the Histo1D/2D/3D methods do.
2397 if (nCols == hDims + 1)
2398 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
2399 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
2400
// Sumw2 is enabled whenever weights are involved, in either form.
2401 if (!wName.empty() || nCols == hDims + 1)
2402 h->Sumw2();
2403
2404 if (nCols != hDims + 1 && nCols != hDims)
2405 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2406 }
2407
2408 if (!wName.empty()) {
2409 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2410 // passed arguments is one more the number of dimensions of the histogram.
2411 ColumnNames_t userColumns = columnList;
2412 userColumns.push_back(std::string{wName});
// NOTE(review): listing line 2413 is absent from this view; presumably it starts the return statement completed below.
2414 userColumns.size());
2415 }
// NOTE(review): listing line 2416 is absent from this view; presumably it starts the return statement completed below.
2417 columnList.size());
2418 }
2419
2420 ////////////////////////////////////////////////////////////////////////////
2421 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
2422 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
2423 /// present.
2424 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
2425 /// object.
2426 /// \param[in] model The returned histogram will be constructed using this as a model.
2427 /// \param[in] columnList
2428 /// A list containing the names of the columns that will be passed when calling `Fill`.
2429 /// \param[in] wName The name of the column that will provide the weights.
2430 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2431 ///
2432 /// This action is *lazy*: upon invocation of this method the calculation is
2433 /// booked but not executed. See RResultPtr documentation.
2434 ///
2435 /// ### Example usage:
2436 /// ~~~{.cpp}
2437 /// auto myFilledObj = myDf.HistoNSparseD<float, float, float, float>({"name","title", 4,
2438 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2439 /// {"col0", "col1", "col2", "col3"});
2440 /// ~~~
2441 ///
2442 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2443 /// argument `wName`: `HistoNSparseD(model, cols, weightCol)`.
2444 ///
2445 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
// NOTE(review): listing line 2446 is absent from this view; presumably it carries the return type of this method.
2447 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
2448 {
2449 std::shared_ptr<::THnSparseD> h(nullptr);
2450 {
2451 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2452 h = model.GetHistogram();
2453 const auto hDims = h->GetNdimensions();
2454 decltype(hDims) nCols = columnList.size();
2455
// Weights given twice: once as the extra last column and once via wName.
2456 if (!wName.empty() && nCols == hDims + 1)
2457 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2458 "input columns contains one column more than the number of dimensions of the "
2459 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
2460
// NOTE(review): this warning is issued while `iel` is still alive; if RIgnoreErrorLevelRAII raises the
// ignore level to kError, the deprecation message is presumably suppressed. Confirm and, if so, narrow
// the guarded scope to the GetHistogram() call as the Histo1D/2D/3D methods do.
2461 if (nCols == hDims + 1)
2462 Warning("HistoNSparseD",
2463 "Passing the column with the weights as the last column in the list is deprecated. "
2464 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
2465
// Sumw2 is enabled whenever weights are involved, in either form.
2466 if (!wName.empty() || nCols == hDims + 1)
2467 h->Sumw2();
2468
2469 if (nCols != hDims + 1 && nCols != hDims)
2470 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2471 }
2472
2473 if (!wName.empty()) {
2474 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2475 // passed arguments is one more the number of dimensions of the histogram.
2476 ColumnNames_t userColumns = columnList;
2477 userColumns.push_back(std::string{wName});
2478 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(userColumns, h, h,
2479 fProxiedPtr);
2480 }
2481 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(columnList, h, h,
2482 fProxiedPtr);
2483 }
2484
2485 ////////////////////////////////////////////////////////////////////////////
2486 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
2487 /// \param[in] model The returned histogram will be constructed using this as a model.
2488 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2489 /// \param[in] wName The name of the column that will provide the weights.
2490 /// \return the N-dimensional histogram wrapped in a RResultPtr.
2491 ///
2492 /// This action is *lazy*: upon invocation of this method the calculation is
2493 /// booked but not executed. Also see RResultPtr.
2494 ///
2495 /// ### Example usage:
2496 /// ~~~{.cpp}
2497 /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4,
2498 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
2499 /// {"col0", "col1", "col2", "col3"});
2500 /// ~~~
2501 ///
2502 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
2503 /// argument `wName`: `HistoNSparseD(model, cols, weightCol)`.
2504 ///
2506 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
// NOTE(review): listing line 2505 (presumably the return type) is absent from this extract.
2507 {
2508 std::shared_ptr<::THnSparseD> h(nullptr);
2509 {
// NOTE(review): presumably raises the ignore level to kError for the lifetime of `iel`, i.e. until the end of
// this inner scope; confirm against RIgnoreErrorLevelRAII. If so, the deprecation Warning below is issued
// while the guard is still active and may never be shown.
2510 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2511 h = model.GetHistogram();
2512 const auto hDims = h->GetNdimensions();
// nCols gets the same type as hDims, so the comparisons below are between like types.
2513 decltype(hDims) nCols = columnList.size();
2514
// Ambiguous call: a weight column was given via wName and the list also holds one extra column.
2515 if (!wName.empty() && nCols == hDims + 1)
2516 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
2517 "input columns contains one column more than the number of dimensions of the "
2518 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
2519
// Legacy form: the weight column is the last entry of columnList. Still accepted, but deprecated.
2520 if (nCols == hDims + 1)
2521 Warning("HistoNSparseD",
2522 "Passing the column with the weights as the last column in the list is deprecated. "
2523 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
2524
// Weights are in play in either form (explicit wName or trailing column): call Sumw2() on the histogram.
2525 if (!wName.empty() || nCols == hDims + 1)
2526 h->Sumw2();
2527
// Only hDims columns (unweighted, or weight via wName) or hDims + 1 columns (legacy weighted) are valid.
2528 if (nCols != hDims + 1 && nCols != hDims)
2529 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2530 }
2531
2532 if (!wName.empty()) {
2533 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2534 // passed arguments is one more than the number of dimensions of the histogram.
// Work on a copy so that the caller's columnList is left untouched.
2535 ColumnNames_t userColumns = columnList;
2536 userColumns.push_back(std::string{wName});
// NOTE(review): listing line 2537 (the opening of this return statement) is absent from this extract.
// The number of columns, weight included, is passed as the trailing argument.
2538 userColumns, h, h, fProxiedPtr, userColumns.size());
2539 }
// NOTE(review): listing line 2540 (the opening of this return statement) is absent from this extract.
2541 columnList, h, h, fProxiedPtr, columnList.size());
2542 }
2543
2544#ifdef R__HAS_ROOT7
2545 ////////////////////////////////////////////////////////////////////////////
2546 /// \brief Fill and return a one-dimensional RHist (*lazy action*).
2547 /// \tparam BinContentType The bin content type of the returned RHist.
2548 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2549 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2550 /// \param[in] vName The name of the column that will fill the histogram.
2551 /// \return the histogram wrapped in a RResultPtr.
2552 ///
2553 /// This action is *lazy*: upon invocation of this method the calculation is
2554 /// booked but not executed. Also see RResultPtr.
2555 ///
2556 /// ### Example usage:
2557 /// ~~~{.cpp}
2558 /// auto myHist = myDf.Hist(10, {5, 15}, "col0");
2559 /// ~~~
2560 template <typename BinContentType = double, typename V = RDFDetail::RInferredType>
// NOTE(review): listing line 2561 (presumably the return type) is absent from this extract.
2562 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName)
2563 {
// Create the histogram to be filled from the bin count and the axis interval.
2564 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2565
// Wrap the single column name in a list, as expected by the overload called below.
2566 const ColumnNames_t columnList = {std::string(vName)};
2567
// Delegate to the Hist overload that takes the histogram and the column list.
2568 return Hist<V>(h, columnList);
2569 }
2570
2571 ////////////////////////////////////////////////////////////////////////////
2572 /// \brief Fill and return an RHist (*lazy action*).
2573 /// \tparam BinContentType The bin content type of the returned RHist.
2574 /// \param[in] axes The returned histogram will be constructed using these axes.
2575 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2576 /// \return the histogram wrapped in a RResultPtr.
2577 ///
2578 /// This action is *lazy*: upon invocation of this method the calculation is
2579 /// booked but not executed. Also see RResultPtr.
2580 ///
2581 /// ### Example usage:
2582 /// ~~~{.cpp}
2583 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2584 /// auto myHist = myDf.Hist({axis}, {"col0"});
2585 /// ~~~
2586 template <typename BinContentType = double, typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes>
// NOTE(review): listing line 2587 (presumably the return type) is absent from this extract.
2588 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList)
2589 {
// Exactly one column per axis is required; the error message reports both counts.
2590 if (axes.size() != columnList.size()) {
2591 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2592 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2593 throw std::invalid_argument(msg);
2594 }
2595
// axes was taken by value, so it can be moved into the new histogram.
2596 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2597
// Delegate to the Hist overload that takes the histogram and the column list.
2598 return Hist<ColumnType, ColumnTypes...>(h, columnList);
2599 }
2600
2601 ////////////////////////////////////////////////////////////////////////////
2602 /// \brief Fill the provided RHist (*lazy action*).
2603 /// \param[in] h The histogram that should be filled.
2604 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2605 /// \return the histogram wrapped in a RResultPtr.
2606 ///
2607 /// This action is *lazy*: upon invocation of this method the calculation is
2608 /// booked but not executed. Also see RResultPtr.
2609 ///
2610 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2611 /// allowed during concurrent filling.
2612 ///
2613 /// ### Example usage:
2614 /// ~~~{.cpp}
2615 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2616 /// auto myHist = myDf.Hist(h, {"col0"});
2617 /// ~~~
2618 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing lines 2619-2620 (the function signature) and 2622 are absent from this extract.
// The body below uses `h` (the histogram to fill) and `columnList`.
2621 {
2623
// Exactly one column per histogram dimension is required; the error message reports both counts.
2624 if (h->GetNDimensions() != columnList.size()) {
2625 std::string msg = "Wrong number of columns for the passed histogram: ";
2626 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2627 throw std::invalid_argument(msg);
2628 }
2629
// Book the fill action; h is passed twice and the number of columns is the trailing argument.
2630 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2631 columnList.size());
2632 }
2633
2634 ////////////////////////////////////////////////////////////////////////////
2635 /// \brief Fill and return a one-dimensional RHist with weights (*lazy action*).
2636 /// \tparam BinContentType The bin content type of the returned RHist.
2637 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2638 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2639 /// \param[in] vName The name of the column that will fill the histogram.
2640 /// \param[in] wName The name of the column that will provide the weights.
2641 /// \return the histogram wrapped in a RResultPtr.
2642 ///
2643 /// This action is *lazy*: upon invocation of this method the calculation is
2644 /// booked but not executed. Also see RResultPtr.
2645 ///
2646 /// ### Example usage:
2647 /// ~~~{.cpp}
2648 /// auto myHist = myDf.Hist(10, {5, 15}, "col0", "colW");
2649 /// ~~~
2650 template <typename BinContentType = ROOT::Experimental::RBinWithError, typename V = RDFDetail::RInferredType,
2651 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 2652 (presumably the return type) is absent from this extract.
2653 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName, std::string_view wName)
2654 {
// Create the histogram to be filled from the bin count and the axis interval.
2655 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2656
// Only the value column goes into the list; the weight column name is forwarded separately below.
2657 const ColumnNames_t columnList = {std::string(vName)};
2658
// Delegate to the weighted Hist overload that takes the histogram, the column list and the weight column.
2659 return Hist<V, W>(h, columnList, wName);
2660 }
2661
2662 ////////////////////////////////////////////////////////////////////////////
2663 /// \brief Fill and return an RHist with weights (*lazy action*).
2664 /// \tparam BinContentType The bin content type of the returned RHist.
2665 /// \param[in] axes The returned histogram will be constructed using these axes.
2666 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2667 /// \param[in] wName The name of the column that will provide the weights.
2668 /// \return the histogram wrapped in a RResultPtr.
2669 ///
2670 /// This action is *lazy*: upon invocation of this method the calculation is
2671 /// booked but not executed. Also see RResultPtr.
2672 ///
2673 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2674 ///
2675 /// ### Example usage:
2676 /// ~~~{.cpp}
2677 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2678 /// auto myHist = myDf.Hist({axis}, {"col0"}, "colW");
2679 /// ~~~
2680 template <typename BinContentType = ROOT::Experimental::RBinWithError,
2681 typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes>
// NOTE(review): listing line 2682 (presumably the return type) is absent from this extract.
2683 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList, std::string_view wName)
2684 {
// NOTE(review): listing line 2685 is absent; the string below looks like the message of a compile-time check
// that rejects integral bin content types, whose opening line is missing.
2686 "weighted filling is not supported for integral bin content types");
2687
// One column per axis is required; the weight column is not part of columnList.
2688 if (axes.size() != columnList.size()) {
2689 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2690 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2691 throw std::invalid_argument(msg);
2692 }
2693
// axes was taken by value, so it can be moved into the new histogram.
2694 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2695
// Delegate to the weighted Hist overload that takes the histogram, the column list and the weight column.
2696 return Hist<ColumnType, ColumnTypes...>(h, columnList, wName);
2697 }
2698
2699 ////////////////////////////////////////////////////////////////////////////
2700 /// \brief Fill the provided RHist with weights (*lazy action*).
2701 /// \param[in] h The histogram that should be filled.
2702 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2703 /// \param[in] wName The name of the column that will provide the weights.
2704 /// \return the histogram wrapped in a RResultPtr.
2705 ///
2706 /// This action is *lazy*: upon invocation of this method the calculation is
2707 /// booked but not executed. Also see RResultPtr.
2708 ///
2709 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2710 ///
2711 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2712 /// allowed during concurrent filling.
2713 ///
2714 /// ### Example usage:
2715 /// ~~~{.cpp}
2716 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2717 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2718 /// ~~~
2719 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing lines 2720-2721 (the start of the function signature), 2724 and 2727 are absent from
// this extract. The body below uses `h` (the histogram to fill), `columnList` and `wName`.
2722 std::string_view wName)
2723 {
// NOTE(review): the string below looks like the message of a compile-time check that rejects integral bin
// content types; its opening line (2724) is missing.
2725 "weighted filling is not supported for integral bin content types");
2726
2728
// The dimension check is made against columnList before the weight column is appended.
2729 if (h->GetNDimensions() != columnList.size()) {
2730 std::string msg = "Wrong number of columns for the passed histogram: ";
2731 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2732 throw std::invalid_argument(msg);
2733 }
2734
2735 // Add the weight column to the list of argument columns to pass it through the infrastructure.
2736 ColumnNames_t columnListWithWeights(columnList);
2737 columnListWithWeights.push_back(std::string(wName));
2738
// Book the weighted fill action; the trailing argument is the column count including the weight column.
2739 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
2740 columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size());
2741 }
2742
2743 ////////////////////////////////////////////////////////////////////////////
2744 /// \brief Fill the provided RHistEngine (*lazy action*).
2745 /// \param[in] h The histogram that should be filled.
2746 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2747 /// \return the histogram wrapped in a RResultPtr.
2748 ///
2749 /// This action is *lazy*: upon invocation of this method the calculation is
2750 /// booked but not executed. Also see RResultPtr.
2751 ///
2752 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2753 /// allowed during concurrent filling.
2754 ///
2755 /// ### Example usage:
2756 /// ~~~{.cpp}
2757 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2758 /// auto myHist = myDf.Hist(h, {"col0"});
2759 /// ~~~
2760 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing lines 2761-2762 (the function signature) and 2764 are absent from this extract.
// The body below uses `h` (the histogram engine to fill) and `columnList`.
2763 {
2765
// Exactly one column per histogram dimension is required; the error message reports both counts.
2766 if (h->GetNDimensions() != columnList.size()) {
2767 std::string msg = "Wrong number of columns for the passed histogram: ";
2768 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2769 throw std::invalid_argument(msg);
2770 }
2771
// Book the fill action; h is passed twice and the number of columns is the trailing argument.
2772 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2773 columnList.size());
2774 }
2775
2776 ////////////////////////////////////////////////////////////////////////////
2777 /// \brief Fill the provided RHistEngine with weights (*lazy action*).
2778 /// \param[in] h The histogram that should be filled.
2779 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2780 /// \param[in] wName The name of the column that will provide the weights.
2781 /// \return the histogram wrapped in a RResultPtr.
2782 ///
2783 /// This action is *lazy*: upon invocation of this method the calculation is
2784 /// booked but not executed. Also see RResultPtr.
2785 ///
2786 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2787 ///
2788 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2789 /// allowed during concurrent filling.
2790 ///
2791 /// ### Example usage:
2792 /// ~~~{.cpp}
2793 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2794 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2795 /// ~~~
2796 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
// NOTE(review): listing lines 2797-2798 (the start of the function signature), 2801 and 2804 are absent from
// this extract. The body below uses `h` (the histogram engine to fill), `columnList` and `wName`.
2799 std::string_view wName)
2800 {
// NOTE(review): the string below looks like the message of a compile-time check that rejects integral bin
// content types; its opening line (2801) is missing.
2802 "weighted filling is not supported for integral bin content types");
2803
2805
// The dimension check is made against columnList before the weight column is appended.
2806 if (h->GetNDimensions() != columnList.size()) {
2807 std::string msg = "Wrong number of columns for the passed histogram: ";
2808 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2809 throw std::invalid_argument(msg);
2810 }
2811
2812 // Add the weight column to the list of argument columns to pass it through the infrastructure.
2813 ColumnNames_t columnListWithWeights(columnList);
2814 columnListWithWeights.push_back(std::string(wName));
2815
// Book the weighted fill action; the trailing argument is the column count including the weight column.
2816 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
2817 columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size());
2818 }
2819#endif
2820
2821 ////////////////////////////////////////////////////////////////////////////
2822 /// \brief Fill and return a TGraph object (*lazy action*).
2823 /// \tparam X The type of the column used to fill the x axis.
2824 /// \tparam Y The type of the column used to fill the y axis.
2825 /// \param[in] x The name of the column that will fill the x axis.
2826 /// \param[in] y The name of the column that will fill the y axis.
2827 /// \return the TGraph wrapped in a RResultPtr.
2828 ///
2829 /// Columns can be of a container type (e.g. std::vector<double>), in which case the TGraph
2830 /// is filled with each one of the elements of the container.
2831 /// If multithreading is enabled, the order in which points are inserted is undefined.
2832 /// If the graph has to be drawn, the user is advised to sort it on x before drawing.
2833 /// The TGraph is given a name and a title based on the input column names.
2834 ///
2835 /// This action is *lazy*: upon invocation of this method the calculation is
2836 /// booked but not executed. Also see RResultPtr.
2837 ///
2838 /// ### Example usage:
2839 /// ~~~{.cpp}
2840 /// // Deduce column types (this invocation needs jitting internally)
2841 /// auto myGraph1 = myDf.Graph("xValues", "yValues");
2842 /// // Explicit column types
2843 /// auto myGraph2 = myDf.Graph<int, float>("xValues", "yValues");
2844 /// ~~~
2845 ///
2846 /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory
2847 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2848 /// if the returned graph goes out of scope before the end of the program, ROOT might display a blank canvas).
2849 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType>
2850 RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "")
2851 {
2852 auto graph = std::make_shared<::TGraph>();
2853 const std::vector<std::string_view> columnViews = {x, y};
// Either both names are used or none: if the helper reports an empty name, an empty list is handed on instead.
// NOTE(review): presumably GetValidatedColumnNames then falls back to default columns; confirm.
2854 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2855 ? ColumnNames_t()
2856 : ColumnNames_t(columnViews.begin(), columnViews.end());
2857
// Ask for exactly two validated column names; index 0 is used as x and index 1 as y below.
2858 const auto validatedColumns = GetValidatedColumnNames(2, userColumns);
2859
2860 // We build a default name and title based on the input columns
2861 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2862 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2863 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
// The axis titles are the column names themselves.
2864 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2865 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2866
// Book the fill action on the validated columns; graph is passed twice.
2867 return CreateAction<RDFInternal::ActionTags::Graph, X, Y>(validatedColumns, graph, graph, fProxiedPtr);
2868 }
2869
2870 ////////////////////////////////////////////////////////////////////////////
2871 /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*).
2872 /// \param[in] x The name of the column that will fill the x axis.
2873 /// \param[in] y The name of the column that will fill the y axis.
2874 /// \param[in] exl The name of the column of X low errors
2875 /// \param[in] exh The name of the column of X high errors
2876 /// \param[in] eyl The name of the column of Y low errors
2877 /// \param[in] eyh The name of the column of Y high errors
2878 /// \return the TGraphAsymmErrors wrapped in a RResultPtr.
2879 ///
2880 /// Columns can be of a container type (e.g. std::vector<double>), in which case the graph
2881 /// is filled with each one of the elements of the container.
2882 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2883 ///
2884 /// This action is *lazy*: upon invocation of this method the calculation is
2885 /// booked but not executed. Also see RResultPtr.
2886 ///
2887 /// ### Example usage:
2888 /// ~~~{.cpp}
2889 /// // Deduce column types (this invocation needs jitting internally)
2890 /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2891 /// // Explicit column types
2892 /// using f = float;
2893 /// auto myGAE2 = myDf.GraphAsymmErrors<f, f, f, f, f, f>("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2894 /// ~~~
2895 ///
2896 /// `GraphAsymmErrors` should also be used for the cases in which values associated only with
2897 /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero.
2898 /// In such cases, user should do the following:
2899 /// ~~~{.cpp}
2900 /// // Create a column of zeros in RDataFrame
2901 /// auto rdf_withzeros = rdf.Define("zero", "0");
2902 /// // or alternatively:
2903 /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;});
2904 /// // Create the graph with y errors only
2905 /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh");
2906 /// ~~~
2907 ///
2908 /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory
2909 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2910 /// if the returned graph goes out of scope before the end of the program, ROOT might display a blank canvas).
2911 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType,
2912 typename EXL = RDFDetail::RInferredType, typename EXH = RDFDetail::RInferredType,
2913 typename EYL = RDFDetail::RInferredType, typename EYH = RDFDetail::RInferredType>
// NOTE(review): listing line 2914 (presumably the return type) is absent from this extract.
2915 GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "",
2916 std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "")
2917 {
2918 auto graph = std::make_shared<::TGraphAsymmErrors>();
2919 const std::vector<std::string_view> columnViews = {x, y, exl, exh, eyl, eyh};
// Either all six names are used or none: if the helper reports an empty name, an empty list is handed on instead.
// NOTE(review): presumably GetValidatedColumnNames then falls back to default columns; confirm.
2920 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2921 ? ColumnNames_t()
2922 : ColumnNames_t(columnViews.begin(), columnViews.end());
2923
// Ask for exactly six validated column names; index 0 is used as x and index 1 as y below.
2924 const auto validatedColumns = GetValidatedColumnNames(6, userColumns);
2925
2926 // We build a default name and title based on the input columns
2927 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2928 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2929 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
// The axis titles are the x and y column names; the error columns do not contribute to name or titles.
2930 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2931 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2932
// NOTE(review): listing line 2933 (the opening of the return statement) is absent from this extract.
2934 graph, fProxiedPtr);
2935 }
2936
2937 ////////////////////////////////////////////////////////////////////////////
2938 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2939 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2940 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2941 /// \param[in] model The model to be considered to build the new return value.
2942 /// \param[in] v1Name The name of the column that will fill the x axis.
2943 /// \param[in] v2Name The name of the column that will fill the y axis.
2944 /// \return the monodimensional profile wrapped in a RResultPtr.
2945 ///
2946 /// This action is *lazy*: upon invocation of this method the calculation is
2947 /// booked but not executed. Also see RResultPtr.
2948 ///
2949 /// ### Example usage:
2950 /// ~~~{.cpp}
2951 /// // Deduce column types (this invocation needs jitting internally)
2952 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2953 /// // Explicit column types
2954 /// auto myProf2 = myDf.Profile1D<int, float>({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2955 /// ~~~
2956 ///
2957 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2958 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2959 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2960 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
// NOTE(review): listing line 2961 (presumably the return type) is absent from this extract.
2962 Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2963 {
2964 std::shared_ptr<::TProfile> h(nullptr);
2965 {
// NOTE(review): presumably suppresses messages below kError while the model builds the profile; confirm
// against RIgnoreErrorLevelRAII. The guard is destroyed at the end of this inner scope.
2966 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2967 h = model.GetProfile();
2968 }
2969
// Profiles for which HasAxisLimits returns false are rejected up front.
2970 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2971 throw std::runtime_error("Profiles with no axes limits are not supported yet.");
2972 }
2973 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
// Either both names are used or none: if the helper reports an empty name, an empty list is handed on instead.
2974 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
2975 ? ColumnNames_t()
2976 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 2977 (the return statement) is absent from this extract.
2978 }
2979
2980 ////////////////////////////////////////////////////////////////////////////
2981 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2982 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2983 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2984 /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present.
2985 /// \param[in] model The model to be considered to build the new return value.
2986 /// \param[in] v1Name The name of the column that will fill the x axis.
2987 /// \param[in] v2Name The name of the column that will fill the y axis.
2988 /// \param[in] wName The name of the column that will provide the weights.
2989 /// \return the monodimensional profile wrapped in a RResultPtr.
2990 ///
2991 /// This action is *lazy*: upon invocation of this method the calculation is
2992 /// booked but not executed. Also see RResultPtr.
2993 ///
2994 /// ### Example usage:
2995 /// ~~~{.cpp}
2996 /// // Deduce column types (this invocation needs jitting internally)
2997 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight");
2998 /// // Explicit column types
2999 /// auto myProf2 = myDf.Profile1D<int, float, double>({"profName", "profTitle", 64u, -4., 4.},
3000 /// "xValues", "yValues", "weight");
3001 /// ~~~
3002 ///
3003 /// See the first Profile1D() overload for more details.
3004 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
3005 typename W = RDFDetail::RInferredType>
// NOTE(review): listing line 3006 (presumably the return type) is absent from this extract.
3007 Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
3008 {
3009 std::shared_ptr<::TProfile> h(nullptr);
3010 {
// NOTE(review): presumably suppresses messages below kError while the model builds the profile; confirm
// against RIgnoreErrorLevelRAII. The guard is destroyed at the end of this inner scope.
3011 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
3012 h = model.GetProfile();
3013 }
3014
// Profiles for which HasAxisLimits returns false are rejected up front.
3015 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
3016 throw std::runtime_error("Profile histograms with no axes limits are not supported yet.");
3017 }
// The weight column travels in the same list as the value columns, as its last entry.
3018 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
// Either all three names are used or none: if the helper reports an empty name, an empty list is handed on.
3019 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
3020 ? ColumnNames_t()
3021 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 3022 (the return statement) is absent from this extract.
3023 }
3024
3025 ////////////////////////////////////////////////////////////////////////////
3026 /// \brief Fill and return a one-dimensional profile (*lazy action*).
3027 /// See the first Profile1D() overload for more details.
3028 template <typename V1, typename V2, typename W>
// NOTE(review): listing line 3029 (the function signature) is absent from this extract; the body uses `model`.
3030 {
// Forward to the four-argument Profile1D overload with all column names left empty, keeping the explicit
// column types V1, V2 and W.
3031 return Profile1D<V1, V2, W>(model, "", "", "");
3032 }
3033
3034 ////////////////////////////////////////////////////////////////////////////
3035 /// \brief Fill and return a two-dimensional profile (*lazy action*).
3036 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
3037 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
3038 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
3039 /// \param[in] model The returned profile will be constructed using this as a model.
3040 /// \param[in] v1Name The name of the column that will fill the x axis.
3041 /// \param[in] v2Name The name of the column that will fill the y axis.
3042 /// \param[in] v3Name The name of the column that will fill the z axis.
3043 /// \return the bidimensional profile wrapped in a RResultPtr.
3044 ///
3045 /// This action is *lazy*: upon invocation of this method the calculation is
3046 /// booked but not executed. Also see RResultPtr.
3047 ///
3048 /// ### Example usage:
3049 /// ~~~{.cpp}
3050 /// // Deduce column types (this invocation needs jitting internally)
3051 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3052 /// "xValues", "yValues", "zValues");
3053 /// // Explicit column types
3054 /// auto myProf2 = myDf.Profile2D<int, float, double>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3055 /// "xValues", "yValues", "zValues");
3056 /// ~~~
3057 ///
3058 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
3059 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
3060 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
3061 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
3062 typename V3 = RDFDetail::RInferredType>
3063 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "",
3064 std::string_view v2Name = "", std::string_view v3Name = "")
3065 {
3066 std::shared_ptr<::TProfile2D> h(nullptr);
3067 {
// NOTE(review): presumably suppresses messages below kError while the model builds the profile; confirm
// against RIgnoreErrorLevelRAII. The guard is destroyed at the end of this inner scope.
3068 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
3069 h = model.GetProfile();
3070 }
3071
// Profiles for which HasAxisLimits returns false are rejected up front.
3072 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
3073 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
3074 }
3075 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
// Either all three names are used or none: if the helper reports an empty name, an empty list is handed on.
3076 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
3077 ? ColumnNames_t()
3078 : ColumnNames_t(columnViews.begin(), columnViews.end());
// NOTE(review): listing line 3079 (the return statement) is absent from this extract.
3080 }
3081
3082 ////////////////////////////////////////////////////////////////////////////
3083 /// \brief Fill and return a two-dimensional profile (*lazy action*).
3084 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
3085 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
3086 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
3087 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
3088 /// \param[in] model The returned histogram will be constructed using this as a model.
3089 /// \param[in] v1Name The name of the column that will fill the x axis.
3090 /// \param[in] v2Name The name of the column that will fill the y axis.
3091 /// \param[in] v3Name The name of the column that will fill the z axis.
3092 /// \param[in] wName The name of the column that will provide the weights.
3093 /// \return the bidimensional profile wrapped in a RResultPtr.
3094 ///
3095 /// This action is *lazy*: upon invocation of this method the calculation is
3096 /// booked but not executed. Also see RResultPtr.
3097 ///
3098 /// ### Example usage:
3099 /// ~~~{.cpp}
3100 /// // Deduce column types (this invocation needs jitting internally)
3101 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3102 /// "xValues", "yValues", "zValues", "weight");
3103 /// // Explicit column types
3104 /// auto myProf2 = myDf.Profile2D<int, float, double, int>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
3105 /// "xValues", "yValues", "zValues", "weight");
3106 /// ~~~
3107 ///
3108 /// See the first Profile2D() overload for more details.
3109 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
3110 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
3111 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name,
3112 std::string_view v3Name, std::string_view wName)
3113 {
3114 std::shared_ptr<::TProfile2D> h(nullptr);
3115 {
3116 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
3117 h = model.GetProfile();
3118 }
3119
3120 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
3121 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
3122 }
3123 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
3124 const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews)
3125 ? ColumnNames_t()
3126 : ColumnNames_t(columnViews.begin(), columnViews.end());
3128 }
3129
3130 /// \brief Fill and return a two-dimensional profile (*lazy action*).
3131 /// See the first Profile2D() overload for more details.
3132 template <typename V1, typename V2, typename V3, typename W>
3134 {
3135 return Profile2D<V1, V2, V3, W>(model, "", "", "", "");
3136 }
3137
3138 ////////////////////////////////////////////////////////////////////////////
3139 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*).
3140 ///
3141 /// Type T must provide at least:
3142 /// - a copy-constructor
3143 /// - a `Fill` method that accepts as many arguments and with same types as the column names passed as columnList
3144 /// (these types can also be passed as template parameters to this method)
3145 /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector<T *>&)` that merges the
3146 /// objects passed as argument into the object on which `Merge` was called (an analogous of TH1::Merge). Note that
3147 /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in
3148 /// the TCollection*).
3149 ///
3150 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred together with OtherColumns if not present.
3151 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object.
3152 /// \tparam T The type of the object to fill. Automatically deduced.
3153 /// \param[in] model The model to be considered to build the new return value.
3154 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
3155 /// \return the filled object wrapped in a RResultPtr.
3156 ///
3157 /// The user gives up ownership of the model object.
3158 /// The list of column names to be used for filling must always be specified.
3159 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed.
3160 /// Also see RResultPtr.
3161 ///
3162 /// ### Example usage:
3163 /// ~~~{.cpp}
3164 /// MyClass obj;
3165 /// // Deduce column types (this invocation needs jitting internally, and in this case
3166 /// // MyClass needs to be known to the interpreter)
3167 /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"});
3168 /// // explicit column types
3169 /// auto myFilledObj = myDf.Fill<float, float>(obj, {"col0", "col1"});
3170 /// ~~~
3171 ///
3172 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename T>
3173 RResultPtr<std::decay_t<T>> Fill(T &&model, const ColumnNames_t &columnList)
3174 {
3175 auto h = std::make_shared<std::decay_t<T>>(std::forward<T>(model));
3176 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
3177 throw std::runtime_error("The absence of axes limits is not supported yet.");
3178 }
3179 return CreateAction<RDFInternal::ActionTags::Fill, FirstColumn, OtherColumns...>(columnList, h, h, fProxiedPtr,
3180 columnList.size());
3181 }
3182
3183 ////////////////////////////////////////////////////////////////////////////
3184 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
3185 ///
3186 /// \tparam V The type of the value column
3187 /// \param[in] value The name of the column with the values to fill the statistics with.
3188 /// \return the filled TStatistic object wrapped in a RResultPtr.
3189 ///
3190 /// ### Example usage:
3191 /// ~~~{.cpp}
3192 /// // Deduce column type (this invocation needs jitting internally)
3193 /// auto stats0 = myDf.Stats("values");
3194 /// // Explicit column type
3195 /// auto stats1 = myDf.Stats<float>("values");
3196 /// ~~~
3197 ///
3198 template <typename V = RDFDetail::RInferredType>
3199 RResultPtr<TStatistic> Stats(std::string_view value = "")
3200 {
3201 ColumnNames_t columns;
3202 if (!value.empty()) {
3203 columns.emplace_back(std::string(value));
3204 }
3205 const auto validColumnNames = GetValidatedColumnNames(1, columns);
3206 if (std::is_same<V, RDFDetail::RInferredType>::value) {
3207 return Fill(TStatistic(), validColumnNames);
3208 } else {
3209 return Fill<V>(TStatistic(), validColumnNames);
3210 }
3211 }
3212
3213 ////////////////////////////////////////////////////////////////////////////
3214 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
3215 ///
3216 /// \tparam V The type of the value column
3217 /// \tparam W The type of the weight column
3218 /// \param[in] value The name of the column with the values to fill the statistics with.
3219 /// \param[in] weight The name of the column with the weights to fill the statistics with.
3220 /// \return the filled TStatistic object wrapped in a RResultPtr.
3221 ///
3222 /// ### Example usage:
3223 /// ~~~{.cpp}
3224 /// // Deduce column types (this invocation needs jitting internally)
3225 /// auto stats0 = myDf.Stats("values", "weights");
3226 /// // Explicit column types
3227 /// auto stats1 = myDf.Stats<int, float>("values", "weights");
3228 /// ~~~
3229 ///
3230 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
3231 RResultPtr<TStatistic> Stats(std::string_view value, std::string_view weight)
3232 {
3233 ColumnNames_t columns{std::string(value), std::string(weight)};
3234 constexpr auto vIsInferred = std::is_same<V, RDFDetail::RInferredType>::value;
3235 constexpr auto wIsInferred = std::is_same<W, RDFDetail::RInferredType>::value;
3236 const auto validColumnNames = GetValidatedColumnNames(2, columns);
3237 // We have 3 cases:
3238 // 1. Both types are inferred: we use Fill and let the jit kick in.
3239 // 2. One of the two types is explicit and the other one is inferred: the case is not supported.
3240 // 3. Both types are explicit: we invoke the fully compiled Fill method.
3241 if (vIsInferred && wIsInferred) {
3242 return Fill(TStatistic(), validColumnNames);
3243 } else if (vIsInferred != wIsInferred) {
3244 std::string error("The ");
3245 error += vIsInferred ? "value " : "weight ";
3246 error += "column type is explicit, while the ";
3247 error += vIsInferred ? "weight " : "value ";
3248 error += " is specified to be inferred. This case is not supported: please specify both types or none.";
3249 throw std::runtime_error(error);
3250 } else {
3251 return Fill<V, W>(TStatistic(), validColumnNames);
3252 }
3253 }
3254
3255 ////////////////////////////////////////////////////////////////////////////
3256 /// \brief Return the minimum of processed column values (*lazy action*).
3257 /// \tparam T The type of the branch/column.
3258 /// \param[in] columnName The name of the branch/column to be treated.
3259 /// \return the minimum value of the selected column wrapped in a RResultPtr.
3260 ///
3261 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3262 /// template specialization of this method.
3263 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3264 ///
3265 /// This action is *lazy*: upon invocation of this method the calculation is
3266 /// booked but not executed. Also see RResultPtr.
3267 ///
3268 /// ### Example usage:
3269 /// ~~~{.cpp}
3270 /// // Deduce column type (this invocation needs jitting internally)
3271 /// auto minVal0 = myDf.Min("values");
3272 /// // Explicit column type
3273 /// auto minVal1 = myDf.Min<double>("values");
3274 /// ~~~
3275 ///
3276 template <typename T = RDFDetail::RInferredType>
3277 RResultPtr<RDFDetail::MinReturnType_t<T>> Min(std::string_view columnName = "")
3278 {
3279 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3280 using RetType_t = RDFDetail::MinReturnType_t<T>;
3281 auto minV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::max());
3282 return CreateAction<RDFInternal::ActionTags::Min, T>(userColumns, minV, minV, fProxiedPtr);
3283 }
3284
3285 ////////////////////////////////////////////////////////////////////////////
3286 /// \brief Return the maximum of processed column values (*lazy action*).
3287 /// \tparam T The type of the branch/column.
3288 /// \param[in] columnName The name of the branch/column to be treated.
3289 /// \return the maximum value of the selected column wrapped in a RResultPtr.
3290 ///
3291 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3292 /// template specialization of this method.
3293 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3294 ///
3295 /// This action is *lazy*: upon invocation of this method the calculation is
3296 /// booked but not executed. Also see RResultPtr.
3297 ///
3298 /// ### Example usage:
3299 /// ~~~{.cpp}
3300 /// // Deduce column type (this invocation needs jitting internally)
3301 /// auto maxVal0 = myDf.Max("values");
3302 /// // Explicit column type
3303 /// auto maxVal1 = myDf.Max<double>("values");
3304 /// ~~~
3305 ///
3306 template <typename T = RDFDetail::RInferredType>
3307 RResultPtr<RDFDetail::MaxReturnType_t<T>> Max(std::string_view columnName = "")
3308 {
3309 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3310 using RetType_t = RDFDetail::MaxReturnType_t<T>;
3311 auto maxV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::lowest());
3312 return CreateAction<RDFInternal::ActionTags::Max, T>(userColumns, maxV, maxV, fProxiedPtr);
3313 }
3314
3315 ////////////////////////////////////////////////////////////////////////////
3316 /// \brief Return the mean of processed column values (*lazy action*).
3317 /// \tparam T The type of the branch/column.
3318 /// \param[in] columnName The name of the branch/column to be treated.
3319 /// \return the mean value of the selected column wrapped in a RResultPtr.
3320 ///
3321 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3322 /// template specialization of this method.
3323 /// Note that internally, the summations are executed with Kahan sums in double precision, irrespective
3324 /// of the type of column that is read.
3325 ///
3326 /// This action is *lazy*: upon invocation of this method the calculation is
3327 /// booked but not executed. Also see RResultPtr.
3328 ///
3329 /// ### Example usage:
3330 /// ~~~{.cpp}
3331 /// // Deduce column type (this invocation needs jitting internally)
3332 /// auto meanVal0 = myDf.Mean("values");
3333 /// // Explicit column type
3334 /// auto meanVal1 = myDf.Mean<double>("values");
3335 /// ~~~
3336 ///
3337 template <typename T = RDFDetail::RInferredType>
3338 RResultPtr<double> Mean(std::string_view columnName = "")
3339 {
3340 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3341 auto meanV = std::make_shared<double>(0);
3342 return CreateAction<RDFInternal::ActionTags::Mean, T>(userColumns, meanV, meanV, fProxiedPtr);
3343 }
3344
3345 ////////////////////////////////////////////////////////////////////////////
3346 /// \brief Return the unbiased standard deviation of processed column values (*lazy action*).
3347 /// \tparam T The type of the branch/column.
3348 /// \param[in] columnName The name of the branch/column to be treated.
3349 /// \return the standard deviation value of the selected column wrapped in a RResultPtr.
3350 ///
3351 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3352 /// template specialization of this method.
3353 ///
3354 /// This action is *lazy*: upon invocation of this method the calculation is
3355 /// booked but not executed. Also see RResultPtr.
3356 ///
3357 /// ### Example usage:
3358 /// ~~~{.cpp}
3359 /// // Deduce column type (this invocation needs jitting internally)
3360 /// auto stdDev0 = myDf.StdDev("values");
3361 /// // Explicit column type
3362 /// auto stdDev1 = myDf.StdDev<double>("values");
3363 /// ~~~
3364 ///
3365 template <typename T = RDFDetail::RInferredType>
3366 RResultPtr<double> StdDev(std::string_view columnName = "")
3367 {
3368 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3369 auto stdDeviationV = std::make_shared<double>(0);
3370 return CreateAction<RDFInternal::ActionTags::StdDev, T>(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr);
3371 }
3372
3373 // clang-format off
3374 ////////////////////////////////////////////////////////////////////////////
3375 /// \brief Return the sum of processed column values (*lazy action*).
3376 /// \tparam T The type of the branch/column.
3377 /// \param[in] columnName The name of the branch/column.
3378 /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible.
3379 /// \return the sum of the selected column wrapped in a RResultPtr.
3380 ///
3381 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
3382 /// template specialization of this method.
3383 /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise.
3384 ///
3385 /// This action is *lazy*: upon invocation of this method the calculation is
3386 /// booked but not executed. Also see RResultPtr.
3387 ///
3388 /// ### Example usage:
3389 /// ~~~{.cpp}
3390 /// // Deduce column type (this invocation needs jitting internally)
3391 /// auto sum0 = myDf.Sum("values");
3392 /// // Explicit column type
3393 /// auto sum1 = myDf.Sum<double>("values");
3394 /// ~~~
3395 ///
3396 template <typename T = RDFDetail::RInferredType>
3398 Sum(std::string_view columnName = "",
3399 const RDFDetail::SumReturnType_t<T> &initValue = RDFDetail::SumReturnType_t<T>{})
3400 {
3401 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3402 auto sumV = std::make_shared<RDFDetail::SumReturnType_t<T>>(initValue);
3403 return CreateAction<RDFInternal::ActionTags::Sum, T>(userColumns, sumV, sumV, fProxiedPtr);
3404 }
3405 // clang-format on
3406
3407 ////////////////////////////////////////////////////////////////////////////
3408 /// \brief Gather filtering statistics.
3409 /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr.
3410 ///
3411 /// Calling `Report` on the main `RDataFrame` object gathers stats for
3412 /// all named filters in the call graph. Calling this method on a
3413 /// stored chain state (i.e. a graph node different from the first) gathers
3414 /// the stats for all named filters in the chain section between the original
3415 /// `RDataFrame` and that node (included). Stats are gathered in the same
3416 /// order as the named filters have been added to the graph.
3417 /// A RResultPtr<RCutFlowReport> is returned to allow inspection of the
3418 /// effects cuts had.
3419 ///
3420 /// This action is *lazy*: upon invocation of
3421 /// this method the calculation is booked but not executed. See RResultPtr
3422 /// documentation.
3423 ///
3424 /// ### Example usage:
3425 /// ~~~{.cpp}
3426 /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2");
3427 /// auto cutReport = filtered.Report();
3428 /// cutReport->Print();
3429 /// ~~~
3430 ///
3432 {
3433 bool returnEmptyReport = false;
3434 // if this is a RInterface<RLoopManager> on which `Define` has been called, users
3435 // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which
3436 // certainly does not contain named filters.
3437 // The number 4 takes into account the implicit columns for entry and slot number
3438 // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_)
3439 if (std::is_same<Proxied, RLoopManager>::value && fColRegister.GenerateColumnNames().size() > 4)
3440 returnEmptyReport = true;
3441
3442 auto rep = std::make_shared<RCutFlowReport>();
3443 using Helper_t = RDFInternal::ReportHelper<Proxied>;
3445
3446 auto action = std::make_unique<Action_t>(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}),
3448
3449 return MakeResultPtr(rep, *fLoopManager, std::move(action));
3450 }
3451
3452 /// \brief Returns the names of the filters created.
3453 /// \return the container of filters names.
3454 ///
3455 /// If called on a root node, all the filters in the computation graph will
3456 /// be printed. For any other node, only the filters upstream of that node.
3457 /// Filters without a name are printed as "Unnamed Filter"
3458 /// This is not an action nor a transformation, just a query to the RDataFrame object.
3459 ///
3460 /// ### Example usage:
3461 /// ~~~{.cpp}
3462 /// auto filtNames = d.GetFilterNames();
3463 /// for (auto &&filtName : filtNames) std::cout << filtName << std::endl;
3464 /// ~~~
3465 ///
3466 std::vector<std::string> GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); }
3467
3468 // clang-format off
3469 ////////////////////////////////////////////////////////////////////////////
3470 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3471 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3472 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3473 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3474 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3475 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3476 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3477 /// \param[in] aggIdentity The aggregator variable of each thread is initialized to this value (or is default-constructed if the parameter is omitted)
3478 /// \return the result of the aggregation wrapped in a RResultPtr.
3479 ///
3480 /// An aggregator callable takes two values, an aggregator variable and a column value. The aggregator variable is
3481 /// initialized to aggIdentity or default-constructed if aggIdentity is omitted.
3482 /// This action calls the aggregator callable for each processed entry, passing in the aggregator variable and
3483 /// the value of the column columnName.
3484 /// If the signature is `U(U,T)` the aggregator variable is then copy-assigned the result of the execution of the callable.
3485 /// Otherwise the signature of aggregator must be `void(U&,T)`.
3486 ///
3487 /// The merger callable is used to merge the partial accumulation results of each processing thread. It is only called in multi-thread executions.
3488 /// If its signature is `U(U,U)` the aggregator variables of each thread are merged two by two.
3489 /// If its signature is `void(std::vector<U>& a)` it is assumed that it merges all aggregators in a[0].
3490 ///
3491 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3492 ///
3493 /// Example usage:
3494 /// ~~~{.cpp}
3495 /// auto aggregator = [](double acc, double x) { return acc * x; };
3496 /// ROOT::EnableImplicitMT();
3497 /// // If multithread is enabled, the aggregator function will be called by more threads
3498 /// // and will produce a vector of partial accumulators.
3499 /// // The merger function performs the final aggregation of these partial results.
3500 /// auto merger = [](std::vector<double> &accumulators) {
3501 /// for (auto i : ROOT::TSeqU(1u, accumulators.size())) {
3502 /// accumulators[0] *= accumulators[i];
3503 /// }
3504 /// };
3505 ///
3506 /// // The accumulator is initialized at this value by every thread.
3507 /// double initValue = 1.;
3508 ///
3509 /// // Multiplies all elements of the column "x"
3510 /// auto result = d.Aggregate(aggregator, merger, "x", initValue);
3511 /// ~~~
3512 // clang-format on
3513 template <typename AccFun, typename MergeFun, typename R = typename TTraits::CallableTraits<AccFun>::ret_type,
3514 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3515 typename ArgTypesNoDecay = typename TTraits::CallableTraits<AccFun>::arg_types_nodecay,
3516 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3517 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3518 RResultPtr<U> Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
3519 {
3520 RDFInternal::CheckAggregate<R, MergeFun>(ArgTypesNoDecay());
3521 const auto columns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3522
3523 const auto validColumnNames = GetValidatedColumnNames(1, columns);
3524 CheckAndFillDSColumns(validColumnNames, TTraits::TypeList<T>());
3525
3526 auto accObjPtr = std::make_shared<U>(aggIdentity);
3527 using Helper_t = RDFInternal::AggregateHelper<AccFun, MergeFun, R, T, U>;
3529 auto action = std::make_unique<Action_t>(
3530 Helper_t(std::move(aggregator), std::move(merger), accObjPtr, fLoopManager->GetNSlots()), validColumnNames,
3532 return MakeResultPtr(accObjPtr, *fLoopManager, std::move(action));
3533 }
3534
3535 // clang-format off
3536 ////////////////////////////////////////////////////////////////////////////
3537 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3538 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3539 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3540 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3541 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3542 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3543 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3544 /// \return the result of the aggregation wrapped in a RResultPtr.
3545 ///
3546 /// See previous Aggregate overload for more information.
3547 // clang-format on
3548 template <typename AccFun, typename MergeFun, typename R = typename TTraits::CallableTraits<AccFun>::ret_type,
3549 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3550 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3551 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3552 RResultPtr<U> Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName = "")
3553 {
3554 static_assert(
3555 std::is_default_constructible<U>::value,
3556 "aggregated object cannot be default-constructed. Please provide an initialisation value (aggIdentity)");
3557 return Aggregate(std::move(aggregator), std::move(merger), columnName, U());
3558 }
3559
3560 // clang-format off
3561 ////////////////////////////////////////////////////////////////////////////
3562 /// \brief Book execution of a custom action using a user-defined helper object.
3563 /// \tparam FirstColumn The type of the first column used by this action. Inferred together with OtherColumns if not present.
3564 /// \tparam OtherColumns A list of the types of the other columns used by this action
3565 /// \tparam Helper The type of the user-defined helper. See below for the required interface it should expose.
3566 /// \param[in] helper The Action Helper to be scheduled.
3567 /// \param[in] columns The names of the columns on which the helper acts.
3568 /// \return the result of the helper wrapped in a RResultPtr.
3569 ///
3570 /// This method books a custom action for execution. The behavior of the action is completely dependent on the
3571 /// Helper object provided by the caller. The required interface for the helper is described below (more
3572 /// methods than the ones required can be present, e.g. a constructor that takes the number of worker threads is usually useful):
3573 ///
3574 /// ### Mandatory interface
3575 ///
3576 /// * `Helper` must publicly inherit from `ROOT::Detail::RDF::RActionImpl<Helper>`
3577 /// * `Helper::Result_t`: public alias for the type of the result of this action helper. `Result_t` must be default-constructible.
3578 /// * `Helper(Helper &&)`: a move-constructor is required. Copy-constructors are discouraged.
3579 /// * `std::shared_ptr<Result_t> GetResultPtr() const`: return a shared_ptr to the result of this action (of type
3580 /// Result_t). The RResultPtr returned by Book will point to this object. Note that this method can be called
3581 /// _before_ Initialize(), because the RResultPtr is constructed before the event loop is started.
3582 /// * `void Initialize()`: this method is called once before starting the event-loop. Useful for setup operations.
3583 /// It must reset the state of the helper to the expected state at the beginning of the event loop: the same helper,
3584 /// or copies of it, might be used for multiple event loops (e.g. in the presence of systematic variations).
3585 /// * `void InitTask(TTreeReader *, unsigned int slot)`: each working thread shall call this method during the event
3586 /// loop, before processing a batch of entries. The pointer passed as argument, if not null, will point to the TTreeReader
3587 /// that RDataFrame has set up to read the task's batch of entries. It is passed to the helper to allow certain advanced optimizations;
3588 /// it should not usually serve any purpose for the Helper. This method is often no-op for simple helpers.
3589 /// * `void Exec(unsigned int slot, ColumnTypes...columnValues)`: each working thread shall call this method
3590 /// during the event-loop, possibly concurrently. No two threads will ever call Exec with the same 'slot' value:
3591 /// this parameter is there to facilitate writing thread-safe helpers. The other arguments will be the values of
3592 /// the requested columns for the particular entry being processed.
3593 /// * `void Finalize()`: this method is called at the end of the event loop. Commonly used to finalize the contents of the result.
3594 /// * `std::string GetActionName()`: it returns a string identifier for this type of action that RDataFrame will use in
3595 /// diagnostics, SaveGraph(), etc.
3596 ///
3597 /// ### Optional methods
3598 ///
3599 /// If these methods are implemented they enable extra functionality as per the description below.
3600 ///
3601 /// * `Result_t &PartialUpdate(unsigned int slot)`: if present, it must return the value of the partial result of this action for the given 'slot'.
3602 /// Different threads might call this method concurrently, but will do so with different 'slot' numbers.
3603 /// RDataFrame leverages this method to implement RResultPtr::OnPartialResult().
3604 /// * `ROOT::RDF::SampleCallback_t GetSampleCallback()`: if present, it must return a callable with the
3605 /// appropriate signature (see ROOT::RDF::SampleCallback_t) that will be invoked at the beginning of the processing
3606 /// of every sample, as in DefinePerSample().
3607 /// * `Helper MakeNew(void *newResult, std::string_view variation = "nominal")`: if implemented, it enables varying
3608 /// the action's result with VariationsFor(). It takes a type-erased new result that can be safely cast to a
3609 /// `std::shared_ptr<Result_t> *` (a pointer to shared pointer) and should be used as the action's output result.
3610 /// The function optionally takes the name of the current variation which could be useful in customizing its behaviour.
3611 ///
3612 /// In case Book is called without specifying column types as template arguments, corresponding typed code will be just-in-time compiled
3613 /// by RDataFrame. In that case the Helper class needs to be known to the ROOT interpreter.
3614 ///
3615 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3616 ///
3617 /// ### Examples
3618 /// See [this tutorial](https://root.cern/doc/master/df018__customActions_8C.html) for an example implementation of an action helper.
3619 ///
3620 /// It is also possible to inspect the code used by built-in RDataFrame actions at ActionHelpers.hxx.
3621 ///
3622 // clang-format on
3623 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename Helper>
3625 {
3626 using HelperT = std::decay_t<Helper>;
3627 // TODO add more static sanity checks on Helper
3628 using AH = RDFDetail::RActionImpl<HelperT>;
3629 static_assert(std::is_base_of<AH, HelperT>::value && std::is_convertible<HelperT *, AH *>::value,
3630 "Action helper of type T must publicly inherit from ROOT::Detail::RDF::RActionImpl<T>");
3631
3632 auto hPtr = std::make_shared<HelperT>(std::forward<Helper>(helper));
3633 auto resPtr = hPtr->GetResultPtr();
3634
3635 if (std::is_same<FirstColumn, RDFDetail::RInferredType>::value && columns.empty()) {
3636 return CallCreateActionWithoutColsIfPossible<HelperT>(resPtr, hPtr, TTraits::TypeList<FirstColumn>{});
3637 } else {
3638 return CreateAction<RDFInternal::ActionTags::Book, FirstColumn, OtherColumns...>(columns, resPtr, hPtr,
3639 fProxiedPtr, columns.size());
3640 }
3641 }
3642
3643 ////////////////////////////////////////////////////////////////////////////
3644 /// \brief Provides a representation of the columns in the dataset.
3645 /// \tparam ColumnTypes variadic list of branch/column types.
3646 /// \param[in] columnList Names of the columns to be displayed.
3647 /// \param[in] nRows Number of events for each column to be displayed.
3648 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3649 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3650 ///
3651 /// This function returns a `RResultPtr<RDisplay>` containing all the entries to be displayed, organized in a tabular
3652 /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will
3653 /// return a complete version through `RDisplay::AsString()`.
3654 ///
3655 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see
3656 /// RResultPtr.
3657 ///
3658 /// Example usage:
3659 /// ~~~{.cpp}
3660 /// // Preparing the RResultPtr<RDisplay> object with all columns and default number of entries
3661 /// auto d1 = rdf.Display("");
3662 /// // Preparing the RResultPtr<RDisplay> object with two columns and 128 entries
3663 /// auto d2 = rdf.Display({"x", "y"}, 128);
3664 /// // Printing the short representations, the event loop will run
3665 /// d1->Print();
3666 /// d2->Print();
3667 /// ~~~
3668 template <typename... ColumnTypes>
3669 RResultPtr<RDisplay> Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3670 {
3671 CheckIMTDisabled("Display");
3672 auto newCols = columnList;
3673 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3674 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3675 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3676 // Need to add ULong64_t type corresponding to the first column rdfentry_
3677 return CreateAction<RDFInternal::ActionTags::Display, ULong64_t, ColumnTypes...>(
3678 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr);
3679 }
3680
3681 ////////////////////////////////////////////////////////////////////////////
3682 /// \brief Provides a representation of the columns in the dataset.
3683 /// \param[in] columnList Names of the columns to be displayed.
3684 /// \param[in] nRows Number of events for each column to be displayed.
3685 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3686 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3687 ///
3688 /// This overload automatically infers the column types.
3689 /// See the previous overloads for further details.
3690 ///
3691 /// Invoked when no types are specified to Display
3692 RResultPtr<RDisplay> Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3693 {
3694 CheckIMTDisabled("Display");
3695 auto newCols = columnList;
3696 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
3697 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
3698 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
3700 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr,
3701 columnList.size() + 1);
3702 }
3703
3704 ////////////////////////////////////////////////////////////////////////////
3705 /// \brief Provides a representation of the columns in the dataset.
3706 /// \param[in] columnNameRegexp A regular expression to select the columns.
3707 /// \param[in] nRows Number of events for each column to be displayed.
3708 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3709 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3710 ///
3711 /// The existing columns are matched against the regular expression. If the string provided
3712 /// is empty, all columns are selected.
3713 /// See the previous overloads for further details.
3715 Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10)
3716 {
3717 const auto columnNames = GetColumnNames();
3718 const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display");
3719 return Display(selectedColumns, nRows, nMaxCollectionElements);
3720 }
3721
3722 ////////////////////////////////////////////////////////////////////////////
3723 /// \brief Provides a representation of the columns in the dataset.
3724 /// \param[in] columnList Names of the columns to be displayed.
3725 /// \param[in] nRows Number of events for each column to be displayed.
3726 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3727 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3728 ///
3729 /// See the previous overloads for further details.
3731 Display(std::initializer_list<std::string> columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3732 {
3733 ColumnNames_t selectedColumns(columnList);
3734 return Display(selectedColumns, nRows, nMaxCollectionElements);
3735 }
3736
3737 /// \}
3738
3739private:
/// \brief Implementation shared by the Define/Redefine methods for callables whose return type is
/// default-constructible (this is what the enable_if on the return type requires).
/// \param[in] name Name of the column.
/// \param[in] expression Callable that computes the column value; it is forwarded into the RDefine node.
/// \param[in] columns Names of the input columns, passed through GetValidatedColumnNames together with the
///            number of column arguments of the callable.
/// \param[in] where Name of the calling method; a value starting with "Redefine" selects the else branch below.
/// \return a new RInterface built from the same proxied node and loop manager and from `newCols`.
3740 template <typename F, typename DefineType, typename RetType = typename TTraits::CallableTraits<F>::ret_type>
3741 std::enable_if_t<std::is_default_constructible<RetType>::value, RInterface<Proxied>>
3742 DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
3743 {
3744 if (where.compare(0, 8, "Redefine") != 0) { // not a Redefine
3748 } else {
3752 }
3753
// Drop the first callable parameter for the Slot flavour, or the first two for the SlotAndEntry flavour,
// so that ColTypes_t only lists the remaining (column) argument types.
3754 using ArgTypes_t = typename TTraits::CallableTraits<F>::arg_types;
3755 using ColTypesTmp_t = typename RDFInternal::RemoveFirstParameterIf<
3756 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::Slot>::value, ArgTypes_t>::type;
3757 using ColTypes_t = typename RDFInternal::RemoveFirstTwoParametersIf<
3758 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::SlotAndEntry>::value, ColTypesTmp_t>::type;
3759
3760 constexpr auto nColumns = ColTypes_t::list_size;
3761
3762 const auto validColumnNames = GetValidatedColumnNames(nColumns, columns);
3763 CheckAndFillDSColumns(validColumnNames, ColTypes_t());
3764
3765 // Declare return type to the interpreter, for future use by jitted actions
3766 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType));
3767 if (retTypeName.empty()) {
3768 // The type is not known to the interpreter, but we don't want to error out here:
3769 // rather, if/when this column is used in jitted code, the broken but telling type name injected below is used.
3770 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType));
3771 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3772 }
3773
3774 using NewCol_t = RDFDetail::RDefine<F, DefineType>;
3775 auto newColumn = std::make_shared<NewCol_t>(name, retTypeName, std::forward<F>(expression), validColumnNames,
3777
// Register the new column and hand the updated register to the returned interface.
3779 newCols.AddDefine(std::move(newColumn));
3780
3781 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
3782
3783 return newInterface;
3784 }
3785
3786 // This overload is chosen when the return type of the callable passed to Define or DefineSlot is not
// default-constructible (for example void) and the callable is not convertible to std::string.
3787 // It simply fires a compile-time error. This is preferable to a static_assert in the main `Define` overload because
3788 // this way compilation of `Define` has no way to continue after throwing the error.
3789 template <typename F, typename DefineType, typename RetType = typename TTraits::CallableTraits<F>::ret_type,
3790 bool IsFStringConv = std::is_convertible<F, std::string>::value,
3791 bool IsRetTypeDefConstr = std::is_default_constructible<RetType>::value>
3792 std::enable_if_t<!IsFStringConv && !IsRetTypeDefConstr, RInterface<Proxied>>
3793 DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
3794 {
// The condition is false whenever this overload is selected with the default template arguments,
// so instantiating it produces the diagnostic below.
3795 static_assert(std::is_default_constructible<typename TTraits::CallableTraits<F>::ret_type>::value,
3796 "Error in `Define`: type returned by expression is not default-constructible");
3797 return *this; // never reached
3798 }
3799
3800 ////////////////////////////////////////////////////////////////////////////
3801 /// \brief Implementation of cache.
3802 template <typename... ColTypes, std::size_t... S>
3803 RInterface<RLoopManager> CacheImpl(const ColumnNames_t &columnList, std::index_sequence<S...>)
3804 {
3805 const auto columnListWithoutSizeColumns = RDFInternal::FilterArraySizeColNames(columnList, "Snapshot");
3806
3807 // Check at compile time that the columns types are copy constructible
3808 constexpr bool areCopyConstructible =
3809 RDFInternal::TEvalAnd<std::is_copy_constructible<ColTypes>::value...>::value;
3810 static_assert(areCopyConstructible, "Columns of a type which is not copy constructible cannot be cached yet.");
3811
3812 RDFInternal::CheckTypesAndPars(sizeof...(ColTypes), columnListWithoutSizeColumns.size());
3813
3814 auto colHolders = std::make_tuple(Take<ColTypes>(columnListWithoutSizeColumns[S])...);
3815 auto ds = std::make_unique<RLazyDS<ColTypes...>>(
3816 std::make_pair(columnListWithoutSizeColumns[S], std::get<S>(colHolders))...);
3817
3818 RInterface<RLoopManager> cachedRDF(std::make_shared<RLoopManager>(std::move(ds), columnListWithoutSizeColumns));
3819
3820 return cachedRDF;
3821 }
3822
3823 template <bool IsSingleColumn, typename F>
3825 VaryImpl(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
3826 const std::vector<std::string> &variationTags, std::string_view variationName)
3827 {
3828 using F_t = std::decay_t<F>;
3829 using ColTypes_t = typename TTraits::CallableTraits<F_t>::arg_types;
3830 using RetType = typename TTraits::CallableTraits<F_t>::ret_type;
3831 constexpr auto nColumns = ColTypes_t::list_size;
3832
3833 SanityChecksForVary<RetType>(colNames, variationTags, variationName);
3834
3835 const auto validColumnNames = GetValidatedColumnNames(nColumns, inputColumns);
3836 CheckAndFillDSColumns(validColumnNames, ColTypes_t{});
3837
3838 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType));
3839 if (retTypeName.empty()) {
3840 // The type is not known to the interpreter, but we don't want to error out
3841 // here, rather if/when this column is used in jitted code, so we inject a broken but telling type name.
3842 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType));
3843 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3844 }
3845
3846 auto variation = std::make_shared<RDFInternal::RVariation<F_t, IsSingleColumn>>(
3847 colNames, variationName, std::forward<F>(expression), variationTags, retTypeName, fColRegister, *fLoopManager,
3848 validColumnNames);
3849
3851 newCols.AddVariation(std::move(variation));
3852
3853 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newCols));
3854
3855 return newInterface;
3856 }
3857
/// \brief Implementation shared by the Vary overloads that take the expression as a string.
/// \param[in] colNames Names of the columns being varied; must not be empty and must not contain duplicates.
/// \param[in] expression The variation expression, passed on to RDFInternal::BookVariationJit.
/// \param[in] variationTags Tags of the individual variations; must not be empty.
/// \param[in] variationName Name of the variation; must not be empty.
/// \param[in] isSingleColumn Passed on to RDFInternal::BookVariationJit.
/// \return a new RInterface built with `newColRegister`, to which the jitted variation was added.
/// \throws std::logic_error if the same column name appears more than once in `colNames`.
3858 RInterface<Proxied> JittedVaryImpl(const std::vector<std::string> &colNames, std::string_view expression,
3859 const std::vector<std::string> &variationTags, std::string_view variationName,
3860 bool isSingleColumn)
3861 {
3862 R__ASSERT(!variationTags.empty() && "Must have at least one variation.");
3863 R__ASSERT(!colNames.empty() && "Must have at least one varied column.");
3864 R__ASSERT(!variationName.empty() && "Must provide a variation name.");
3865
// Both the varied column names and the variation name are checked with CheckValidCppVarName.
3866 for (auto &colName : colNames) {
3867 RDFInternal::CheckValidCppVarName(colName, "Vary");
3870 }
3871 RDFInternal::CheckValidCppVarName(variationName, "Vary");
3872
3873 // when varying multiple columns, they must be different columns
3874 if (colNames.size() > 1) {
3875 std::set<std::string> uniqueCols(colNames.begin(), colNames.end());
3876 if (uniqueCols.size() != colNames.size())
3877 throw std::logic_error("A column name was passed to the same Vary invocation multiple times.");
3878 }
3879
3880 // Cannot vary different input column types, assume the first
3881 auto varyColType = GetColumnType(colNames[0]);
3882 auto jittedVariation =
3883 RDFInternal::BookVariationJit(colNames, variationName, variationTags, expression, *fLoopManager,
3884 GetDataSource(), fColRegister, isSingleColumn, varyColType);
3885
3887 newColRegister.AddVariation(std::move(jittedVariation));
3888
3889 RInterface<Proxied> newInterface(fProxiedPtr, *fLoopManager, std::move(newColRegister));
3890
3891 return newInterface;
3892 }
3893
3894 template <typename Helper, typename ActionResultType>
3895 auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &resPtr,
3896 const std::shared_ptr<Helper> &hPtr,
3897 TTraits::TypeList<RDFDetail::RInferredType>)
3898 -> decltype(hPtr->Exec(0u), RResultPtr<ActionResultType>{})
3899 {
3900 return CreateAction<RDFInternal::ActionTags::Book>(/*columns=*/{}, resPtr, hPtr, fProxiedPtr, 0u);
3901 }
3902
3903 template <typename Helper, typename ActionResultType, typename... Others>
3904 RResultPtr<ActionResultType>
3905 CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &,
3906 const std::shared_ptr<Helper>& /*hPtr*/,
3907 Others...)
3908 {
3909 throw std::logic_error(std::string("An action was booked with no input columns, but the action requires "
3910 "columns! The action helper type was ") +
3911 typeid(Helper).name());
3912 return {};
3913 }
3914
3915protected:
/// \brief Build a RInterface from a proxied node, a loop manager and a column register.
/// \param[in] proxied Node stored in fProxiedPtr.
/// \param[in] lm Loop manager, passed on to the RInterfaceBase constructor.
/// \param[in] colRegister Column register, passed on to the RInterfaceBase constructor.
3916 RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
3917 const RDFInternal::RColumnRegister &colRegister)
3918 : RInterfaceBase(lm, colRegister), fProxiedPtr(proxied)
3919 {
3920 }
3921
/// \brief Return a const reference to the shared pointer to the proxied node.
3922 const std::shared_ptr<Proxied> &GetProxiedPtr() const { return fProxiedPtr; }
3923};
3924
3925} // namespace RDF
3926
3927} // namespace ROOT
3928
3929#endif // ROOT_RDF_TINTERFACE
#define f(i)
Definition RSha256.hxx:104
#define h(i)
Definition RSha256.hxx:106
#define e(i)
Definition RSha256.hxx:103
start
Definition Rotated.cxx:223
Basic types used by ROOT and required by TInterpreter.
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int).
Definition RtypesCore.h:60
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
#define X(type, name)
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
constexpr Int_t kError
Definition TError.h:47
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
char name[80]
Definition TGX11.cxx:148
Double_t err
A histogram data structure to bin data along multiple dimensions.
static constexpr bool SupportsWeightedFilling
Whether this histogram engine type supports weighted filling.
A histogram for aggregation of data along multiple dimensions.
Definition RHist.hxx:65
A RDataFrame node that produces a result.
Definition RAction.hxx:53
A binder for user-defined columns, variations and aliases.
void AddVariation(std::shared_ptr< RVariationBase > variation)
Register a new systematic variation.
void AddDefine(std::shared_ptr< RDFDetail::RDefineBase > column)
Add a new defined column.
The dataset specification for RDataFrame.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg, const std::shared_ptr< RDFNode > &proxiedPtr, const int=-1)
Create RAction object, return RResultPtr for the action Overload for the case in which all column typ...
void SanityChecksForVary(const std::vector< std::string > &colNames, const std::vector< std::string > &variationTags, std::string_view variationName)
RDataSource * GetDataSource() const
void CheckAndFillDSColumns(ColumnNames_t validCols, TTraits::TypeList< ColumnTypes... > typeList)
void CheckIMTDisabled(std::string_view callerName)
RInterfaceBase(std::shared_ptr< RDFDetail::RLoopManager > lm)
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
RDFDetail::RLoopManager * GetLoopManager() const
The public interface to the RDataFrame federation of classes.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a one-dimensional profile (lazy action).
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr<::TGraph > Graph(std::string_view x="", std::string_view y="")
Fill and return a TGraph object (lazy action).
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::uint64_t nNormalBins, std::pair< double, double > interval, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional RHist with weights (lazy action).
RInterface(const RInterface &)=default
Copy-ctor for RInterface.
RResultPtr< RDFDetail::MaxReturnType_t< T > > Max(std::string_view columnName="")
Return the maximum of processed column values (lazy action).
auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &resPtr, const std::shared_ptr< Helper > &hPtr, TTraits::TypeList< RDFDetail::RInferredType >) -> decltype(hPtr->Exec(0u), RResultPtr< ActionResultType >{})
RInterface(RInterface &&)=default
Move-ctor for RInterface.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const std::initializer_list< std::string > &columns)
Append a filter to the call graph.
RInterface< RLoopManager > Cache(std::initializer_list< std::string > columnList)
Save selected columns in memory.
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a two-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp="", const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr< RDisplay > Display(std::initializer_list< std::string > columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface(const std::shared_ptr< RLoopManager > &proxied)
Build a RInterface from a RLoopManager.
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Redefine(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
std::shared_ptr< ::ROOT::Detail::RDF::RNodeBase > fProxiedPtr
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RResultPtr<::TH1D > Histo1D(std::string_view vName)
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int begin, unsigned int end, unsigned int stride=1)
Creates a node that filters entries based on range: [begin, end).
RResultPtr< typename std::decay_t< Helper >::Result_t > Book(Helper &&helper, const ColumnNames_t &columns={})
Book execution of a custom action using a user-defined helper object.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::vector< ROOT::Experimental::RAxisVariant > axes, const ColumnNames_t &columnList)
Fill and return an RHist (lazy action).
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a one-dimensional profile (lazy action).
const std::shared_ptr< Proxied > & GetProxiedPtr() const
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.})
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< T > Reduce(F f, std::string_view columnName="")
Execute a user-defined reduce operation on the values of a column.
RResultPtr< T > Reduce(F f, std::string_view columnName, const T &redIdentity)
Execute a user-defined reduce operation on the values of a column.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< RDisplay > Display(std::string_view columnNameRegexp="", size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface & operator=(const RInterface &)=default
Copy-assignment operator for RInterface.
RInterface< Proxied > VaryImpl(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Define(std::string_view name, std::string_view expression)
Define a new column.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterAvailable(std::string_view column)
Discard entries with missing values.
std::enable_if_t<!IsFStringConv &&!IsRetTypeDefConstr, RInterface< Proxied > > DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
RInterface< Proxied > Redefine(std::string_view name, std::string_view expression)
Overwrite the value and/or type of an existing column.
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RInterface< RLoopManager > Cache(std::string_view columnNameRegexp="")
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.}, std::string_view vName="")
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a three-dimensional histogram (lazy action).
RResultPtr< ROOT::Experimental::RHistEngine< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHistEngine< BinContentType > > h, const ColumnNames_t &columnList)
Fill the provided RHistEngine (lazy action).
RInterface< RLoopManager > CacheImpl(const ColumnNames_t &columnList, std::index_sequence< S... >)
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a two-dimensional profile (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, std::string_view name)
Append a filter to the call graph.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName="")
Execute a user-defined accumulation operation on the processed column values in each processing slot.
std::enable_if_t< std::is_default_constructible< RetType >::value, RInterface< Proxied > > DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHist< BinContentType > > h, const ColumnNames_t &columnList)
Fill the provided RHist (lazy action).
RInterface(const std::shared_ptr< Proxied > &proxied, RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister)
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
RInterface< Proxied > Alias(std::string_view alias, std::string_view columnName)
Allow to refer to a column with a different name.
RResultPtr< RDFDetail::MinReturnType_t< T > > Min(std::string_view columnName="")
Return the minimum of processed column values (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< ROOT::Experimental::RHistEngine< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHistEngine< BinContentType > > h, const ColumnNames_t &columnList, std::string_view wName)
Fill the provided RHistEngine with weights (lazy action).
RResultPtr< RCutFlowReport > Report()
Gather filtering statistics.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model)
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a three-dimensional histogram (lazy action).
RResultPtr<::TH1D > Histo1D(std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RInterface< Proxied > DefinePerSample(std::string_view name, std::string_view expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot and the current entry.
RResultPtr< std::decay_t< T > > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action).
RInterface< Proxied > DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterMissing(std::string_view column)
Keep only the entries that have missing values.
RResultPtr< TStatistic > Stats(std::string_view value="")
Return a TStatistic object, filled once per event (lazy action).
RInterface< Proxied > JittedVaryImpl(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName, bool isSingleColumn)
friend class RInterface
RInterface< Proxied > DefaultValueFor(std::string_view column, const T &defaultValue)
In case the value in the given column is missing, provide a default value.
RResultPtr< TStatistic > Stats(std::string_view value, std::string_view weight)
Return a TStatistic object, filled once per event (lazy action).
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model)
Fill and return a two-dimensional profile (lazy action).
RInterface< Proxied > RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a two-dimensional histogram (lazy action).
RResultPtr< ActionResultType > CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &, const std::shared_ptr< Helper > &, Others...)
RInterface< Proxied > Define(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column.
void ForeachSlot(F f, const ColumnNames_t &columns={})
Execute a user-defined function requiring a processing slot index on each entry (instant action).
RResultPtr<::TGraphAsymmErrors > GraphAsymmErrors(std::string_view x="", std::string_view y="", std::string_view exl="", std::string_view exh="", std::string_view eyl="", std::string_view eyh="")
Fill and return a TGraphAsymmErrors object (lazy action).
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHist< BinContentType > > h, const ColumnNames_t &columnList, std::string_view wName)
Fill the provided RHist with weights (lazy action).
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model)
Fill and return a one-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::initializer_list< std::string > columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RInterface & operator=(RInterface &&)=default
Move-assignment operator for RInterface.
RResultPtr<::TH2D > Histo2D(const TH2DModel &model)
RResultPtr< double > Mean(std::string_view columnName="")
Return the mean of processed column values (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const ColumnNames_t &columns={}, std::string_view name="")
Append a filter to the call graph.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RInterface< Proxied > DefinePerSample(std::string_view name, F expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int end)
Creates a node that filters entries based on range.
RInterface< Proxied > RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RInterface< RDFDetail::RJittedFilter > Filter(std::string_view expression, std::string_view name="")
Append a filter to the call graph.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::uint64_t nNormalBins, std::pair< double, double > interval, std::string_view vName)
Fill and return a one-dimensional RHist (lazy action).
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::vector< ROOT::Experimental::RAxisVariant > axes, const ColumnNames_t &columnList, std::string_view wName)
Fill and return an RHist with weights (lazy action).
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a weighted two-dimensional histogram (lazy action).
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated tags.
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr< double > StdDev(std::string_view columnName="")
Return the unbiased standard deviation of processed column values (lazy action).
RResultPtr< RDFDetail::SumReturnType_t< T > > Sum(std::string_view columnName="", const RDFDetail::SumReturnType_t< T > &initValue=RDFDetail::SumReturnType_t< T >{})
Return the sum of processed column values (lazy action).
An RDataSource implementation which is built on top of result proxies.
Smart pointer for the return type of actions.
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree,...
typename RemoveFirstParameter< T >::type RemoveFirstParameter_t
TDirectory::TContext keeps track of and restores the current directory.
Definition TDirectory.h:89
@ kAllAxes
Definition TH1.h:126
Statistical variable, defined by its mean and variance (RMS).
Definition TStatistic.h:33
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
#define F(x, y, z)
void CheckForNoVariations(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister)
Throw if the column has systematic variations attached.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
const std::type_info & TypeName2TypeID(const std::string &name)
Return the type_info associated to a name.
Definition RDFUtils.cxx:86
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
std::shared_ptr< RJittedDefine > BookDefinePerSampleJit(std::string_view name, std::string_view expression, RLoopManager &lm, const RColumnRegister &colRegister)
Book the jitting of a DefinePerSample call.
void CheckValidCppVarName(std::string_view var, const std::string &where)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:650
void RemoveDuplicates(ColumnNames_t &columnNames)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:191
void CheckSnapshotOptionsFormatCompatibility(const ROOT::RDF::RSnapshotOptions &opts)
void CheckForDefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is not already there.
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
std::string PrettyPrintAddr(const void *const addr)
std::shared_ptr< RDFDetail::RJittedFilter > BookFilterJit(std::shared_ptr< RDFDetail::RNodeBase > prevNode, std::string_view name, std::string_view expression, const RColumnRegister &colRegister, TTree *tree, RDataSource *ds)
Book the jitting of a Filter call.
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::pair< std::vector< std::string >, std::vector< std::string > > AddSizeBranches(ROOT::RDF::RDataSource *ds, std::vector< std::string > &&colsWithoutAliases, std::vector< std::string > &&colsWithAliases)
Return copies of colsWithoutAliases and colsWithAliases with size branches for variable-sized array b...
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2RVec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:330
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
void RemoveRNTupleSubfields(ColumnNames_t &columnNames)
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
Take a list of column names, return that list with entries starting with '#' filtered out.
void WarnHist()
Warn once about experimental filling of RHist.
Definition RDFUtils.cxx:55
void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
void CheckForRedefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is already there.
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister)
Book the jitting of a Define call.
std::shared_ptr< RJittedVariation > BookVariationJit(const std::vector< std::string > &colNames, std::string_view variationName, const std::vector< std::string > &variationTags, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister, bool isSingleColumn, const std::string &varyColType)
Book the jitting of a Vary call.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
std::vector< std::string > ColumnNames_t
ROOT type_traits extensions.
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:613
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:669
void DisableImplicitMT()
Disables the implicit multi-threading in ROOT (see EnableImplicitMT).
Definition TROOT.cxx:655
A special bin content type to compute the bin error in weighted filling.
type is TypeList if MustRemove is false, otherwise it is a TypeList with the first type removed
Definition Utils.hxx:156
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:347
A collection of options to steer the creation of the dataset on disk through Snapshot().
A struct which stores some basic parameters of a TH1D.
A struct which stores some basic parameters of a TH2D.
A struct which stores some basic parameters of a TH3D.
A struct which stores some basic parameters of a THnD.
A struct which stores some basic parameters of a THnSparseD.
A struct which stores some basic parameters of a TProfile.
A struct which stores some basic parameters of a TProfile2D.