Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RInterface.hxx
Go to the documentation of this file.
1// Author: Enrico Guiraud, Danilo Piparo CERN 03/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RDF_TINTERFACE
12#define ROOT_RDF_TINTERFACE
13
14#include "ROOT/RDataSource.hxx"
20#include "ROOT/RDF/RDefine.hxx"
22#include "ROOT/RDF/RFilter.hxx"
27#include "ROOT/RDF/RRange.hxx"
29#include "ROOT/RDF/Utils.hxx"
32#include "ROOT/RResultPtr.hxx"
34#include <string_view>
35#include "ROOT/RVec.hxx"
36#include "ROOT/TypeTraits.hxx"
37#include "RtypesCore.h" // for ULong64_t
38#include "TDirectory.h"
39#include "TH1.h" // For Histo actions
40#include "TH2.h" // For Histo actions
41#include "TH3.h" // For Histo actions
42#include "THn.h"
43#include "THnSparse.h"
44#include "TProfile.h"
45#include "TProfile2D.h"
46#include "TStatistic.h"
47
48#include "RConfigure.h" // for R__HAS_ROOT7
49#ifdef R__HAS_ROOT7
51#include <ROOT/RHist.hxx>
52#include <ROOT/RHistEngine.hxx>
53#endif
54
55#include <algorithm>
56#include <cstddef>
57#include <initializer_list>
58#include <iterator> // std::back_inserter
59#include <limits>
60#include <memory>
61#include <set>
62#include <sstream>
63#include <stdexcept>
64#include <string>
65#include <type_traits> // is_same, enable_if
66#include <typeinfo>
67#include <unordered_set>
68#include <utility> // std::index_sequence
69#include <vector>
70#include <any>
71
72class TGraph;
73
74// Windows requires a forward decl of printValue to accept it as a valid friend function in RInterface
75namespace ROOT {
79class RDataFrame;
80} // namespace ROOT
81namespace cling {
82std::string printValue(ROOT::RDataFrame *tdf);
83}
84
85namespace ROOT {
86namespace RDF {
89namespace TTraits = ROOT::TypeTraits;
90
91template <typename Proxied>
92class RInterface;
93
95} // namespace RDF
96
97namespace Internal {
98namespace RDF {
100void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
101void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end);
103std::vector<std::pair<std::uint64_t, std::uint64_t>> GetDatasetGlobalClusterBoundaries(const RNode &node);
105std::string GetDataSourceLabel(const ROOT::RDF::RNode &node);
106void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline);
107} // namespace RDF
108} // namespace Internal
109
110namespace RDF {
111
112// clang-format off
113/**
114 * \class ROOT::RDF::RInterface
115 * \ingroup dataframe
116 * \brief The public interface to the RDataFrame federation of classes.
117 * \tparam Proxied One of the "node" base types (e.g. RLoopManager, RFilterBase). The user never specifies this type manually.
118 *
119 * The documentation of each method features a one liner illustrating how to use the method, for example showing how
120 * the majority of the template parameters are automatically deduced requiring no or very little effort by the user.
121 */
122// clang-format on
123template <typename Proxied>
128 friend std::string cling::printValue(::ROOT::RDataFrame *tdf); // For a nice printing at the prompt
130
131 template <typename T>
132 friend class RInterface;
133
135 friend void RDFInternal::ChangeEmptyEntryRange(const RNode &node, std::pair<ULong64_t, ULong64_t> &&newRange);
136 friend void RDFInternal::ChangeBeginAndEndEntries(const RNode &node, Long64_t start, Long64_t end);
138 friend std::vector<std::pair<std::uint64_t, std::uint64_t>>
140 friend std::string ROOT::Internal::RDF::GetDataSourceLabel(const RNode &node);
142 std::shared_ptr<Proxied> fProxiedPtr; ///< Smart pointer to the graph node encapsulated by this RInterface.
143
144public:
145 ////////////////////////////////////////////////////////////////////////////
146 /// \brief Copy-assignment operator for RInterface.
147 RInterface &operator=(const RInterface &) = default;
148
149 ////////////////////////////////////////////////////////////////////////////
150 /// \brief Copy-ctor for RInterface.
151 RInterface(const RInterface &) = default;
152
153 ////////////////////////////////////////////////////////////////////////////
154 /// \brief Move-ctor for RInterface.
155 RInterface(RInterface &&) = default;
156
157 ////////////////////////////////////////////////////////////////////////////
158 /// \brief Move-assignment operator for RInterface.
160
161 ////////////////////////////////////////////////////////////////////////////
162 /// \brief Build a RInterface from a RLoopManager.
163 /// This constructor is only available for RInterface<RLoopManager>.
165 RInterface(const std::shared_ptr<RLoopManager> &proxied) : RInterfaceBase(proxied), fProxiedPtr(proxied)
166 {
167 }
168
169 ////////////////////////////////////////////////////////////////////////////
170 /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode.
171 /// Different RDataFrame methods return different C++ types. All nodes, however,
172 /// can be cast to this common type at the cost of a small performance penalty.
173 /// This allows, for example, storing RDataFrame nodes in a vector, or passing them
174 /// around via (non-template, C++11) helper functions.
175 /// Example usage:
176 /// ~~~{.cpp}
177 /// // a function that conditionally adds a Range to a RDataFrame node.
178 /// RNode MaybeAddRange(RNode df, bool mustAddRange)
179 /// {
180 /// return mustAddRange ? df.Range(1) : df;
181 /// }
182 /// // use as :
183 /// ROOT::RDataFrame df(10);
184 /// auto maybeRanged = MaybeAddRange(df, true);
185 /// ~~~
186 /// Note that it is not a problem to pass RNode's by value.
187 operator RNode() const
188 {
189 return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fColRegister);
190 }
191
192 /// \name Transformations
193 /// These functions transform the columns of the dataframe, such as filtering events or defining columns.
194 /// Transformations can be chained, for example
195 /// ~~~{.cpp}
196 /// auto filtered = rdf.Filter(...).Define(...).Define(...);
197 /// ~~~
198 /// \{
199
200 ////////////////////////////////////////////////////////////////////////////
201 /// \brief Append a filter to the call graph.
202 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
203 /// signalling whether the event has passed the selection (true) or not (false).
204 /// \param[in] columns Names of the columns/branches in input to the filter function.
205 /// \param[in] name Optional name of this filter. See `Report`.
206 /// \return the filter node of the computation graph.
207 ///
208 /// Append a filter node at the point of the call graph corresponding to the
209 /// object this method is called on.
210 /// The callable `f` should not have side-effects (e.g. modification of an
211 /// external or static variable) to ensure correct results when implicit
212 /// multi-threading is active.
213 ///
214 /// RDataFrame only evaluates filters when necessary: if multiple filters
215 /// are chained one after another, they are executed in order and the first
216 /// one returning false causes the event to be discarded.
217 /// Even if multiple actions or transformations depend on the same filter,
218 /// it is executed once per entry. If its result is requested more than
219 /// once, the cached result is served.
220 ///
221 /// ### Example usage:
222 /// ~~~{.cpp}
223 /// // C++ callable (function, functor class, lambda...) that takes two parameters of the types of "x" and "y"
224 /// auto filtered = df.Filter(myCut, {"x", "y"});
225 ///
226 /// // String: it must contain valid C++ except that column names can be used instead of variable names
227 /// auto filtered = df.Filter("x*y > 0");
228 /// ~~~
229 ///
230 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
231 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
232 /// ~~~{.cpp}
233 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
234 /// ~~~
235 /// but instead this will:
236 /// ~~~{.cpp}
237 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
238 /// ~~~
241 {
242 RDFInternal::CheckFilter(f);
243 using ColTypes_t = typename TTraits::CallableTraits<F>::arg_types;
244 constexpr auto nColumns = ColTypes_t::list_size;
247
249
250 auto filterPtr = std::make_shared<F_t>(std::move(f), validColumnNames, fProxiedPtr, fColRegister, name);
252 }
253
254 ////////////////////////////////////////////////////////////////////////////
255 /// \brief Append a filter to the call graph.
256 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
257 /// signalling whether the event has passed the selection (true) or not (false).
258 /// \param[in] name Optional name of this filter. See `Report`.
259 /// \return the filter node of the computation graph.
260 ///
261 /// Refer to the first overload of this method for the full documentation.
264 {
265 // The sfinae is there in order to pick up the overloaded method which accepts two strings
266 // rather than this template method.
267 return Filter(f, {}, name);
268 }
269
270 ////////////////////////////////////////////////////////////////////////////
271 /// \brief Append a filter to the call graph.
272 /// \param[in] f Function, lambda expression, functor class or any other callable object. It must return a `bool`
273 /// signalling whether the event has passed the selection (true) or not (false).
274 /// \param[in] columns Names of the columns/branches in input to the filter function.
275 /// \return the filter node of the computation graph.
276 ///
277 /// Refer to the first overload of this method for the full documentation.
278 template <typename F>
279 RInterface<RDFDetail::RFilter<F, Proxied>> Filter(F f, const std::initializer_list<std::string> &columns)
280 {
281 return Filter(f, ColumnNames_t{columns});
282 }
283
284 ////////////////////////////////////////////////////////////////////////////
285 /// \brief Append a filter to the call graph.
286 /// \param[in] expression The filter expression in C++
287 /// \param[in] name Optional name of this filter. See `Report`.
288 /// \return the filter node of the computation graph.
289 ///
290 /// The expression is just-in-time compiled and used to filter entries. It must
291 /// be valid C++ syntax in which variable names are substituted with the names
292 /// of branches/columns.
293 ///
294 /// ### Example usage:
295 /// ~~~{.cpp}
296 /// auto filtered_df = df.Filter("myCollection.size() > 3");
297 /// auto filtered_name_df = df.Filter("myCollection.size() > 3", "Minimum collection size");
298 /// ~~~
299 ///
300 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
301 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
302 /// ~~~{.cpp}
303 /// df.Filter("Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
304 /// ~~~
305 /// but instead this will:
306 /// ~~~{.cpp}
307 /// df.Filter("return Sum(Map(vec, [](float e) { return e*e > 0.5; }))")
308 /// ~~~
309 RInterface<RDFDetail::RJittedFilter> Filter(std::string_view expression, std::string_view name = "")
310 {
312 fColRegister, nullptr, GetDataSource());
313
315 }
316
317 ////////////////////////////////////////////////////////////////////////////
318 /// \brief Discard entries with missing values
319 /// \param[in] column Column name whose entries with missing values should be discarded
320 /// \return The filter node of the computation graph
321 ///
322 /// This operation is useful in case an entry of the dataset is incomplete,
323 /// i.e. if one or more of the columns do not have valid values. If the value
324 /// of the input column is missing for an entry, the entire entry will be
325 /// discarded from the rest of this branch of the computation graph.
326 ///
327 /// Use cases include:
328 /// * When processing multiple files, one or more of them is missing a column
329 /// * In horizontal joining with entry matching, a certain dataset has no
330 /// match for the current entry.
331 ///
332 /// ### Example usage:
333 ///
334 /// \code{.py}
335 /// # Assume a dataset with columns [idx, x] matching another dataset with
336 /// # columns [idx, y]. For idx == 42, the right-hand dataset has no match
337 /// df = ROOT.RDataFrame(dataset)
338 /// df_nomissing = df.FilterAvailable("idx").Define("z", "x + y")
339 /// colz = df_nomissing.Take[int]("z")
340 /// \endcode
341 ///
342 /// \code{.cpp}
343 /// // Assume a dataset with columns [idx, x] matching another dataset with
344 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
345 /// ROOT::RDataFrame df{dataset};
346 /// auto df_nomissing = df.FilterAvailable("idx")
347 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
348 /// auto colz = df_nomissing.Take<int>("z");
349 /// \endcode
350 ///
351 /// \note See FilterMissing() if you want to keep only the entries with
352 /// missing values instead.
354 {
355 const auto columns = ColumnNames_t{column.data()};
356 // For now disable this functionality in case of an empty data source and
357 // the column name was not defined previously.
358 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
359 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
361 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ true, fProxiedPtr, fColRegister, columns);
364 }
365
366 ////////////////////////////////////////////////////////////////////////////
367 /// \brief Keep only the entries that have missing values.
368 /// \param[in] column Column name whose entries with missing values should be kept
369 /// \return The filter node of the computation graph
370 ///
371 /// This operation is useful in case an entry of the dataset is incomplete,
372 /// i.e. if one or more of the columns do not have valid values. It only
373 /// keeps the entries for which the value of the input column is missing.
374 ///
375 /// Use cases include:
376 /// * When processing multiple files, one or more of them is missing a column
377 /// * In horizontal joining with entry matching, a certain dataset has no
378 /// match for the current entry.
379 ///
380 /// ### Example usage:
381 ///
382 /// \code{.py}
383 /// # Assume a dataset made of two files vertically chained together, one has
384 /// # column "x" and the other has column "y"
385 /// df = ROOT.RDataFrame(dataset)
386 /// df_valid_col_x = df.FilterMissing("y")
387 /// df_valid_col_y = df.FilterMissing("x")
388 /// display_x = df_valid_col_x.Display(("x",))
389 /// display_y = df_valid_col_y.Display(("y",))
390 /// \endcode
391 ///
392 /// \code{.cpp}
393 /// // Assume a dataset made of two files vertically chained together, one has
394 /// // column "x" and the other has column "y"
395 /// ROOT::RDataFrame df{dataset};
396 /// auto df_valid_col_x = df.FilterMissing("y");
397 /// auto df_valid_col_y = df.FilterMissing("x");
398 /// auto display_x = df_valid_col_x.Display<int>({"x"});
399 /// auto display_y = df_valid_col_y.Display<int>({"y"});
400 /// \endcode
401 ///
402 /// \note See FilterAvailable() if you want to discard the entries in case
403 /// there is a missing value instead.
405 {
406 const auto columns = ColumnNames_t{column.data()};
407 // For now disable this functionality in case of an empty data source and
408 // the column name was not defined previously.
409 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
410 throw std::runtime_error("Unknown column: \"" + std::string(column) + "\"");
412 auto filterPtr = std::make_shared<F_t>(/*discardEntry*/ false, fProxiedPtr, fColRegister, columns);
415 }
416
417 // clang-format off
418 ////////////////////////////////////////////////////////////////////////////
419 /// \brief Define a new column.
420 /// \param[in] name The name of the defined column.
421 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column. This callable must be thread safe when used with multiple threads.
422 /// \param[in] columns Names of the columns/branches in input to the producer function.
423 /// \return the first node of the computation graph for which the new quantity is defined.
424 ///
425 /// Define a column that will be visible from all subsequent nodes
426 /// of the functional chain. The `expression` is only evaluated for entries that pass
427 /// all the preceding filters.
428 /// A new variable is created called `name`, accessible as if it was contained
429 /// in the dataset from subsequent transformations/actions.
430 ///
431 /// Use cases include:
432 /// * caching the results of complex calculations for easy and efficient multiple access
433 /// * extraction of quantities of interest from complex objects
434 ///
435 /// An exception is thrown if the name of the new column is already in use in this branch of the computation graph.
436 /// Note that the callable must be thread safe when called from multiple threads. Use DefineSlot() if needed.
437 ///
438 /// ### Example usage:
439 /// ~~~{.cpp}
440 /// // assuming a function with signature:
441 /// double myComplexCalculation(const RVec<float> &muon_pts);
442 /// // we can pass it directly to Define
443 /// auto df_with_define = df.Define("newColumn", myComplexCalculation, {"muon_pts"});
444 /// // alternatively, we can pass the body of the function as a string, as in Filter:
445 /// auto df_with_define = df.Define("newColumn", "x*x + y*y");
446 /// ~~~
447 ///
448 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
449 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
450 /// ~~~{.cpp}
451 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
452 /// ~~~
453 /// but instead this will:
454 /// ~~~{.cpp}
455 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
456 /// ~~~
458 RInterface<Proxied> Define(std::string_view name, F expression, const ColumnNames_t &columns = {})
459 {
460 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Define");
461 }
462 // clang-format on
463
464 // clang-format off
465 ////////////////////////////////////////////////////////////////////////////
466 /// \brief Define a new column with a value dependent on the processing slot.
467 /// \param[in] name The name of the defined column.
468 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
469 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding the slot number).
470 /// \return the first node of the computation graph for which the new quantity is defined.
471 ///
472 /// This alternative implementation of `Define` is meant as a helper to evaluate new column values in a thread-safe manner.
473 /// The expression must be a callable of signature R(unsigned int, T1, T2, ...) where `T1, T2...` are the types
474 /// of the columns that the expression takes as input. The first parameter is reserved for an unsigned integer
475 /// representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
476 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
477 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
478 ///
479 /// The following two calls are equivalent, although `DefineSlot` is slightly more performant:
480 /// ~~~{.cpp}
481 /// int function(unsigned int, double, double);
482 /// df.Define("x", function, {"rdfslot_", "column1", "column2"})
483 /// df.DefineSlot("x", function, {"column1", "column2"})
484 /// ~~~
485 ///
486 /// See Define() for more information.
487 template <typename F>
488 RInterface<Proxied> DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
489 {
490 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "DefineSlot");
491 }
492 // clang-format on
493
494 // clang-format off
495 ////////////////////////////////////////////////////////////////////////////
496 /// \brief Define a new column with a value dependent on the processing slot and the current entry.
497 /// \param[in] name The name of the defined column.
498 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
499 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
500 /// \return the first node of the computation graph for which the new quantity is defined.
501 ///
502 /// This alternative implementation of `Define` is meant as a helper in writing entry-specific, thread-safe custom
503 /// columns. The expression must be a callable of signature R(unsigned int, ULong64_t, T1, T2, ...) where `T1, T2...`
504 /// are the types of the columns that the expression takes as input. The first parameter is reserved for an unsigned
505 /// integer representing a "slot number". RDataFrame guarantees that different threads will invoke the expression with
506 /// different slot numbers - slot numbers will range from zero to ROOT::GetThreadPoolSize()-1.
507 /// Note that there is no guarantee as to how often each slot will be reached during the event loop.
508 /// The second parameter is reserved for a `ULong64_t` representing the current entry being processed by the current thread.
509 ///
510 /// The following two `Define`s are equivalent, although `DefineSlotEntry` is slightly more performant:
511 /// ~~~{.cpp}
512 /// int function(unsigned int, ULong64_t, double, double);
513 /// Define("x", function, {"rdfslot_", "rdfentry_", "column1", "column2"})
514 /// DefineSlotEntry("x", function, {"column1", "column2"})
515 /// ~~~
516 ///
517 /// See Define() for more information.
518 template <typename F>
519 RInterface<Proxied> DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
520 {
522 "DefineSlotEntry");
523 }
524 // clang-format on
525
526 ////////////////////////////////////////////////////////////////////////////
527 /// \brief Define a new column.
528 /// \param[in] name The name of the defined column.
529 /// \param[in] expression An expression in C++ which represents the defined value
530 /// \return the first node of the computation graph for which the new quantity is defined.
531 ///
532 /// The expression is just-in-time compiled and used to produce the column entries.
533 /// It must be valid C++ syntax in which variable names are substituted with the names
534 /// of branches/columns.
535 ///
536 /// \note If the body of the string expression contains an explicit `return` statement (even if it is in a nested
537 /// scope), RDataFrame _will not_ add another one in front of the expression. So this will not work:
538 /// ~~~{.cpp}
539 /// df.Define("x2", "Map(v, [](float e) { return e*e; })")
540 /// ~~~
541 /// but instead this will:
542 /// ~~~{.cpp}
543 /// df.Define("x2", "return Map(v, [](float e) { return e*e; })")
544 /// ~~~
545 ///
546 /// Refer to the first overload of this method for the full documentation.
547 RInterface<Proxied> Define(std::string_view name, std::string_view expression)
548 {
549 constexpr auto where = "Define";
551 // these checks must be done before jitting lest we throw exceptions in jitted code
554
556
558 newCols.AddDefine(std::move(jittedDefine));
559
561
562 return newInterface;
563 }
564
565 ////////////////////////////////////////////////////////////////////////////
566 /// \brief Overwrite the value and/or type of an existing column.
567 /// \param[in] name The name of the column to redefine.
568 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
569 /// \param[in] columns Names of the columns/branches in input to the expression.
570 /// \return the first node of the computation graph for which the quantity is redefined.
571 ///
572 /// The old value of the column can be used as an input for the expression.
573 ///
574 /// An exception is thrown in case the column to redefine does not already exist.
575 /// See Define() for more information.
577 RInterface<Proxied> Redefine(std::string_view name, F expression, const ColumnNames_t &columns = {})
578 {
579 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::None>(name, std::move(expression), columns, "Redefine");
580 }
581
582 // clang-format off
583 ////////////////////////////////////////////////////////////////////////////
584 /// \brief Overwrite the value and/or type of an existing column.
585 /// \param[in] name The name of the column to redefine.
586 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
587 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot).
588 /// \return the first node of the computation graph for which the new quantity is defined.
589 ///
590 /// The old value of the column can be used as an input for the expression.
591 /// An exception is thrown in case the column to redefine does not already exist.
592 ///
593 /// See DefineSlot() for more information.
594 // clang-format on
595 template <typename F>
596 RInterface<Proxied> RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns = {})
597 {
598 return DefineImpl<F, RDFDetail::ExtraArgsForDefine::Slot>(name, std::move(expression), columns, "RedefineSlot");
599 }
600
601 // clang-format off
602 ////////////////////////////////////////////////////////////////////////////
603 /// \brief Overwrite the value and/or type of an existing column.
604 /// \param[in] name The name of the column to redefine.
605 /// \param[in] expression Function, lambda expression, functor class or any other callable object producing the defined value. Returns the value that will be assigned to the defined column.
606 /// \param[in] columns Names of the columns/branches in input to the producer function (excluding slot and entry).
607 /// \return the first node of the computation graph for which the new quantity is defined.
608 ///
609 /// The old value of the column can be used as an input for the expression.
610 /// An exception is thrown in case the column to re-define does not already exist.
611 ///
612 /// See DefineSlotEntry() for more information.
613 // clang-format on
614 template <typename F>
615 RInterface<Proxied> RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns = {})
616 {
618 "RedefineSlotEntry");
619 }
620
621 ////////////////////////////////////////////////////////////////////////////
622 /// \brief Overwrite the value and/or type of an existing column.
623 /// \param[in] name The name of the column to redefine.
624 /// \param[in] expression An expression in C++ which represents the defined value
625 /// \return the first node of the computation graph for which the new quantity is defined.
626 ///
627 /// The expression is just-in-time compiled and used to produce the column entries.
628 /// It must be valid C++ syntax in which variable names are substituted with the names
629 /// of branches/columns.
630 ///
631 /// The old value of the column can be used as an input for the expression.
632 /// An exception is thrown in case the column to re-define does not already exist.
633 ///
634 /// Aliases cannot be overridden. See the corresponding Define() overload for more information.
652
653 ////////////////////////////////////////////////////////////////////////////
654 /// \brief In case the value in the given column is missing, provide a default value
655 /// \tparam T The type of the column
656 /// \param[in] column Column name where missing values should be replaced by the given default value
657 /// \param[in] defaultValue Value to provide instead of a missing value
658 /// \return The node of the graph that will provide a default value
659 ///
660 /// This operation is useful in case an entry of the dataset is incomplete,
661 /// i.e. if one or more of the columns do not have valid values. It does not
662 /// modify the values of the column, but in case any entry is missing, it
663 /// will provide the default value to downstream nodes instead.
664 ///
665 /// Use cases include:
666 /// * When processing multiple files, one or more of them is missing a column
667 /// * In horizontal joining with entry matching, a certain dataset has no
668 /// match for the current entry.
669 ///
670 /// ### Example usage:
671 ///
672 /// \code{.cpp}
673 /// // Assume a dataset with columns [idx, x] matching another dataset with
674 /// // columns [idx, y]. For idx == 42, the right-hand dataset has no match
675 /// ROOT::RDataFrame df{dataset};
676 /// auto df_default = df.DefaultValueFor("y", 33)
677 /// .Define("z", [](int x, int y) { return x + y; }, {"x", "y"});
678 /// auto colz = df_default.Take<int>("z");
679 /// \endcode
680 ///
681 /// \code{.py}
682 /// df = ROOT.RDataFrame(dataset)
683 /// df_default = df.DefaultValueFor("y", 33).Define("z", "x + y")
684 /// colz = df_default.Take[int]("z")
685 /// \endcode
686 template <typename T>
687 RInterface<Proxied> DefaultValueFor(std::string_view column, const T &defaultValue)
688 {
689 constexpr auto where{"DefaultValueFor"};
691 // For now disable this functionality in case of an empty data source and
692 // the column name was not defined previously.
693 if (ROOT::Internal::RDF::GetDataSourceLabel(*this) == "EmptyDS")
696
697 // Declare return type to the interpreter, for future use by jitted actions
699 if (retTypeName.empty()) {
700 // The type is not known to the interpreter.
701 // We must not error out here: this only matters if/when this column is used in jitted code,
701 // so a placeholder type name is recorded instead.
702 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(T));
703 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
704 }
705
706 const auto validColumnNames = ColumnNames_t{column.data()};
707 auto newColumn = std::make_shared<ROOT::Internal::RDF::RDefaultValueFor<T>>(
708 column, retTypeName, defaultValue, validColumnNames, fColRegister, *fLoopManager);
710
712 newCols.AddDefine(std::move(newColumn));
713
715
716 return newInterface;
717 }
718
719 // clang-format off
720 ////////////////////////////////////////////////////////////////////////////
721 /// \brief Define a new column that is updated when the input sample changes.
722 /// \param[in] name The name of the defined column.
723 /// \param[in] expression A C++ callable that computes the new value of the defined column.
724 /// \return the first node of the computation graph for which the new quantity is defined.
725 ///
726 /// The signature of the callable passed as second argument should be `T(unsigned int slot, const ROOT::RDF::RSampleInfo &id)`
727 /// where:
728 /// - `T` is the type of the defined column
729 /// - `slot` is a number in the range [0, nThreads) that is different for each processing thread. This can simplify
730 /// the definition of thread-safe callables if you are interested in using parallel capabilities of RDataFrame.
731 /// - `id` is an instance of a ROOT::RDF::RSampleInfo object which contains information about the sample which is
732 /// being processed (see the class docs for more information).
733 ///
734 /// DefinePerSample() is useful to e.g. define a quantity that depends on which TTree in which TFile is being
735 /// processed or to inject a callback into the event loop that is only called when the processing of a new sample
736 /// starts rather than at every entry.
737 ///
738 /// The callable will be invoked once per input TTree or once per multi-thread task, whichever is more often.
739 ///
740 /// ### Example usage:
741 /// ~~~{.cpp}
742 /// ROOT::RDataFrame df{"mytree", {"sample1.root","sample2.root"}};
743 /// df.DefinePerSample("weightbysample",
744 /// [](unsigned int slot, const ROOT::RDF::RSampleInfo &id)
745 /// { return id.Contains("sample1") ? 1.0f : 2.0f; });
746 /// ~~~
747 // clang-format on
748 // TODO we could SFINAE on F's signature to provide friendlier compilation errors in case of signature mismatch
// NOTE(review): the template header introducing F and RetType_t (embedded line 749) is missing from this listing.
750 RInterface<Proxied> DefinePerSample(std::string_view name, F expression)
751 {
// Validate the new column name first; the second argument is the name of this method.
752 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
// NOTE(review): embedded lines 753-754 are missing from this listing.
755
// Look up the interpreter-visible name of the callable's return type.
756 auto retTypeName = RDFInternal::TypeID2TypeName(typeid(RetType_t));
757 if (retTypeName.empty()) {
758 // The type is not known to the interpreter.
759 // We must not error out here, but if/when this column is used in jitted code
// a placeholder name is stored instead: "CLING_UNKNOWN_TYPE_" followed by the demangled name of RetType_t.
760 const auto demangledType = RDFInternal::DemangleTypeIdName(typeid(RetType_t));
761 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
762 }
763
// The callable is moved into the new per-sample column node.
764 auto newColumn =
765 std::make_shared<RDFDetail::RDefinePerSample<F>>(name, retTypeName, std::move(expression), *fLoopManager);
766
// NOTE(review): newCols (embedded line 767) and newInterface (embedded line 769) are declared on lines missing
// from this listing.
768 newCols.AddDefine(std::move(newColumn));
770 return newInterface;
771 }
772
773 // clang-format off
774 ////////////////////////////////////////////////////////////////////////////
775 /// \brief Define a new column that is updated when the input sample changes.
776 /// \param[in] name The name of the defined column.
777 /// \param[in] expression A valid C++ expression as a string, which will be used to compute the defined value.
778 /// \return the first node of the computation graph for which the new quantity is defined.
779 ///
780 /// The expression is just-in-time compiled and used to produce the column entries.
781 /// It must be valid C++ syntax and the usage of the special variable names `rdfslot_` and `rdfsampleinfo_` is
782 /// permitted, where these variables will take the same values as the `slot` and `id` parameters described at the
783 /// DefinePerSample(std::string_view name, F expression) overload. See the documentation of that overload for more information.
784 ///
785 /// ### Example usage:
786 /// ~~~{.py}
787 /// df = ROOT.RDataFrame('mytree', ['sample1.root','sample2.root'])
788 /// df.DefinePerSample('weightbysample', 'rdfsampleinfo_.Contains("sample1") ? 1.0f : 2.0f')
789 /// ~~~
790 ///
791 /// \note
792 /// If you have declared some C++ function to the interpreter, the correct syntax to call that function with this
793 /// overload of DefinePerSample is by calling it explicitly with the special names `rdfslot_` and `rdfsampleinfo_` as
794 /// input parameters. This is for example the correct way to call this overload when working in PyROOT:
795 /// ~~~{.py}
796 /// ROOT.gInterpreter.Declare(
797 /// """
798 /// float weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
799 /// return id.Contains("sample1") ? 1.0f : 2.0f;
800 /// }
801 /// """)
802 /// df = ROOT.RDataFrame("mytree", ["sample1.root","sample2.root"])
803 /// df.DefinePerSample("weightsbysample", "weights(rdfslot_, rdfsampleinfo_)")
804 /// ~~~
805 ///
806 /// \note
807 /// Differently from what happens in Define(), the string expression passed to DefinePerSample cannot contain
808 /// column names other than those mentioned above: the expression is evaluated once before the processing of the
809 /// sample even starts, so column values are not accessible.
810 // clang-format on
811 RInterface<Proxied> DefinePerSample(std::string_view name, std::string_view expression)
812 {
// Validate the new column name first; the second argument is the name of this method.
813 RDFInternal::CheckValidCppVarName(name, "DefinePerSample");
814 // these checks must be done before jitting lest we throw exceptions in jitted code
// NOTE(review): the checks announced above and the creation of jittedDefine, newCols and newInterface are on
// lines missing from this listing (embedded lines 815-816, 818, 820 and 823).
817
819
821 newCols.AddDefine(std::move(jittedDefine));
822
824
825 return newInterface;
826 }
827
828 /// \brief Register systematic variations for a single existing column using custom variation tags.
829 /// \param[in] colName name of the column for which varied values are provided.
830 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
831 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
832 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
833 /// \param[in] inputColumns the names of the columns to be passed to the callable.
834 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
835 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
836 ///
837 /// Vary provides a natural and flexible syntax to define systematic variations that automatically propagate to
838 /// Filters, Defines and results. RDataFrame usage of columns with attached variations does not change, but for
839 /// results that depend on any varied quantity, a map/dictionary of varied results can be produced with
840 /// ROOT::RDF::Experimental::VariationsFor (see the example below).
841 ///
842 /// The dictionary will contain a "nominal" value (accessed with the "nominal" key) for the unchanged result, and
843 /// values for each of the systematic variations that affected the result (via upstream Filters or via direct or
844 /// indirect dependencies of the column values on some registered variations). The keys will be a composition of
845 /// variation names and tags, e.g. "pt:up" and "pt:down" for the example below.
846 ///
847 /// In the following example we add up/down variations of pt and fill a histogram with a quantity that depends on pt.
848 /// We automatically obtain three histograms in output ("nominal", "pt:up" and "pt:down"):
849 /// ~~~{.cpp}
850 /// auto nominal_hx =
851 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, {"down", "up"})
852 /// .Filter("pt > k")
853 /// .Define("x", someFunc, {"pt"})
854 /// .Histo1D("x");
855 ///
856 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
857 /// hx["nominal"].Draw();
858 /// hx["pt:down"].Draw("SAME");
859 /// hx["pt:up"].Draw("SAME");
860 /// ~~~
861 /// RDataFrame computes all variations as part of a single loop over the data.
862 /// In particular, this means that I/O and computation of values shared
863 /// among variations only happen once for all variations. Thus, the event loop
864 /// run-time typically scales much better than linearly with the number of
865 /// variations.
866 ///
867 /// RDataFrame lazily computes the varied values required to produce the
868 /// outputs of \ref ROOT::RDF::Experimental::VariationsFor "VariationsFor()". If \ref
869 /// ROOT::RDF::Experimental::VariationsFor "VariationsFor()" was not called for a result, the computations are only
870 /// run for the nominal case.
871 ///
872 /// See other overloads for examples when variations are added for multiple existing columns,
873 /// or when the tags are auto-generated instead of being directly defined.
874 template <typename F>
875 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
876 const std::vector<std::string> &variationTags, std::string_view variationName = "")
877 {
878 std::vector<std::string> colNames{{std::string(colName)}};
879 const std::string theVariationName{variationName.empty() ? colName : variationName};
880
881 return VaryImpl<true>(std::move(colNames), std::forward<F>(expression), inputColumns, variationTags,
883 }
884
885 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
886 /// \param[in] colName name of the column for which varied values are provided.
887 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
888 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
889 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
890 /// \param[in] inputColumns the names of the columns to be passed to the callable.
891 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
892 /// `"1"`, etc.
893 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
894 /// colName is used if none is provided.
895 ///
896 /// This overload of Vary takes an nVariations parameter instead of a list of tag names.
897 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
898 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
899 ///
900 /// Example usage:
901 /// ~~~{.cpp}
902 /// auto nominal_hx =
903 /// df.Vary("pt", [] (double pt) { return RVecD{pt*0.9, pt*1.1}; }, {"pt"}, 2)
904 /// .Histo1D("pt");
905 ///
906 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
907 /// hx["nominal"].Draw();
908 /// hx["pt:0"].Draw("SAME");
909 /// hx["pt:1"].Draw("SAME");
910 /// ~~~
911 ///
912 /// \note See also This Vary() overload for more information.
913 template <typename F>
914 RInterface<Proxied> Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns,
915 std::size_t nVariations, std::string_view variationName = "")
916 {
917 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
918
919 std::vector<std::string> variationTags;
920 variationTags.reserve(nVariations);
921 for (std::size_t i = 0u; i < nVariations; ++i)
922 variationTags.emplace_back(std::to_string(i));
923
924 const std::string theVariationName{variationName.empty() ? colName : variationName};
925
926 return Vary(colName, std::forward<F>(expression), inputColumns, std::move(variationTags), theVariationName);
927 }
928
929 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
930 /// \param[in] colNames set of names of the columns for which varied values are provided.
931 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
932 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
933 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
934 /// \param[in] inputColumns the names of the columns to be passed to the callable.
935 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
936 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`
937 ///
938 /// This overload of Vary takes a list of column names as first argument and
939 /// requires that the expression returns an RVec of RVecs of values: one inner RVec for the variations of each
940 /// affected column. The `variationTags` are defined as `{"down", "up"}`.
941 ///
942 /// Example usage:
943 /// ~~~{.cpp}
944 /// // produce variations "ptAndEta:down" and "ptAndEta:up"
945 /// auto nominal_hx =
946 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
947 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
948 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
949 /// {"down", "up"}, // variation tags
950 /// "ptAndEta") // variation name
951 /// .Histo1D("pt", "eta");
952 ///
953 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
954 /// hx["nominal"].Draw();
955 /// hx["ptAndEta:down"].Draw("SAME");
956 /// hx["ptAndEta:up"].Draw("SAME");
957 /// ~~~
958 ///
959 /// \note See also This Vary() overload for more information.
960
961 template <typename F>
962 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
963 const std::vector<std::string> &variationTags, std::string_view variationName)
964 {
965 return VaryImpl<false>(colNames, std::forward<F>(expression), inputColumns, variationTags, variationName);
966 }
967
968 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
969 /// \param[in] colNames set of names of the columns for which varied values are provided.
970 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
971 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
972 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
973 /// \param[in] inputColumns the names of the columns to be passed to the callable.
974 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
975 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
976 /// colName is used if none is provided.
977 ///
978 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
979 /// is avoided.
980 ///
981 /// \note See also This Vary() overload for more information.
982 template <typename F>
984 Vary(std::initializer_list<std::string> colNames, F &&expression, const ColumnNames_t &inputColumns,
985 const std::vector<std::string> &variationTags, std::string_view variationName)
986 {
987 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, variationTags, variationName);
988 }
989
990 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
991 /// \param[in] colNames set of names of the columns for which varied values are provided.
992 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
993 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
994 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
995 /// \param[in] inputColumns the names of the columns to be passed to the callable.
996 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
997 /// `"1"`, etc.
998 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
999 /// colName is used if none is provided.
1000 ///
1001 /// This overload of Vary takes a list of column names as first argument.
1002 /// It takes an `nVariations` parameter instead of a list of tag names (`variationTags`). Tag names
1003 /// will be auto-generated as the sequence 0...``nVariations-1``.
1004 ///
1005 /// Example usage:
1006 /// ~~~{.cpp}
1007 /// auto nominal_hx =
1008 /// df.Vary({"pt", "eta"}, // the columns that will vary simultaneously
1009 /// [](double pt, double eta) { return RVec<RVecF>{{pt*0.9, pt*1.1}, {eta*0.9, eta*1.1}}; },
1010 /// {"pt", "eta"}, // inputs to the Vary expression, independent of what columns are varied
1011 /// 2, // auto-generated variation tags
1012 /// "ptAndEta") // variation name
1013 /// .Histo1D("pt", "eta");
1014 ///
1015 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1016 /// hx["nominal"].Draw();
1017 /// hx["ptAndEta:0"].Draw("SAME");
1018 /// hx["ptAndEta:1"].Draw("SAME");
1019 /// ~~~
1020 ///
1021 /// \note See also This Vary() overload for more information.
1022 template <typename F>
1023 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
1024 std::size_t nVariations, std::string_view variationName)
1025 {
1026 R__ASSERT(nVariations > 0 && "Must have at least one variation.");
1027
1028 std::vector<std::string> variationTags;
1029 variationTags.reserve(nVariations);
1030 for (std::size_t i = 0u; i < nVariations; ++i)
1031 variationTags.emplace_back(std::to_string(i));
1032
1033 return Vary(colNames, std::forward<F>(expression), inputColumns, std::move(variationTags), variationName);
1034 }
1035
1036 /// \brief Register systematic variations for multiple existing columns using auto-generated tags.
1037 /// \param[in] colNames set of names of the columns for which varied values are provided.
1038 /// \param[in] expression a callable that evaluates the varied values for the specified columns. The callable can
1039 /// take any column values as input, similarly to what happens during Filter and Define calls. It must
1040 /// return an RVec of varied values, one for each variation tag, in the same order as the tags.
1041 /// \param[in] inputColumns the names of the columns to be passed to the callable.
1043 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1044 /// `"1"`, etc.
1045 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1046 /// colName is used if none is provided.
1047 ///
1048 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1049 /// is avoided.
1050 ///
1051 /// \note See also This Vary() overload for more information.
1052 template <typename F>
1053 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, F &&expression,
1054 const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
1055 {
1056 return Vary(std::vector<std::string>(colNames), std::forward<F>(expression), inputColumns, nVariations, variationName);
1057 }
1058
1059 /// \brief Register systematic variations for a single existing column using custom variation tags.
1060 /// \param[in] colName name of the column for which varied values are provided.
1061 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1062 /// values for the specified column.
1063 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1064 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1065 /// colName is used if none is provided.
1066 ///
1067 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1068 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1069 /// defined as `{"down", "up"}`.
1070 /// ~~~{.cpp}
1071 /// auto nominal_hx =
1072 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", {"down", "up"})
1073 /// .Filter("pt > k")
1074 /// .Define("x", someFunc, {"pt"})
1075 /// .Histo1D("x");
1076 ///
1077 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1078 /// hx["nominal"].Draw();
1079 /// hx["pt:down"].Draw("SAME");
1080 /// hx["pt:up"].Draw("SAME");
1081 /// ~~~
1082 ///
1083 /// ## Short-hand expression syntax
1084 ///
1085 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1086 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1087 /// the following is equivalent to the example above:
1088 ///
1089 /// ~~~{.cpp}
1090 /// auto nominal_hx =
1091 /// df.Vary("pt", "{pt*0.9, pt*1.1}", {"down", "up"})
1092 /// // Same as above
1093 /// ~~~
1094 ///
1095 /// \note See also This Vary() overload for more information.
1096 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression,
1097 const std::vector<std::string> &variationTags, std::string_view variationName = "")
1098 {
1099 std::vector<std::string> colNames{{std::string(colName)}};
1100 const std::string theVariationName{variationName.empty() ? colName : variationName};
1101
1102 return JittedVaryImpl(colNames, expression, variationTags, theVariationName, /*isSingleColumn=*/true);
1103 }
1104
1105 /// \brief Register systematic variations for a single existing column using auto-generated variation tags.
1106 /// \param[in] colName name of the column for which varied values are provided.
1107 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1108 /// values for the specified column.
1109 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1110 /// `"1"`, etc.
1111 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1112 /// colName is used if none is provided.
1113 ///
1114 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1115 /// compiled. The example below shows how Vary() is used while dealing with a single column. The variation tags are
1116 /// auto-generated.
1117 /// ~~~{.cpp}
1118 /// auto nominal_hx =
1119 /// df.Vary("pt", "ROOT::RVecD{pt*0.9, pt*1.1}", 2)
1120 /// .Histo1D("pt");
1121 ///
1122 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1123 /// hx["nominal"].Draw();
1124 /// hx["pt:0"].Draw("SAME");
1125 /// hx["pt:1"].Draw("SAME");
1126 /// ~~~
1127 ///
1128 /// ## Short-hand expression syntax
1129 ///
1130 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1131 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1132 /// the following is equivalent to the example above:
1133 ///
1134 /// ~~~{.cpp}
1135 /// auto nominal_hx =
1136 /// df.Vary("pt", "{pt*0.9, pt*1.1}", 2)
1137 /// // Same as above
1138 /// ~~~
1139 ///
1140 /// \note See also This Vary() overload for more information.
1141 RInterface<Proxied> Vary(std::string_view colName, std::string_view expression, std::size_t nVariations,
1142 std::string_view variationName = "")
1143 {
1144 std::vector<std::string> variationTags;
1145 variationTags.reserve(nVariations);
1146 for (std::size_t i = 0u; i < nVariations; ++i)
1147 variationTags.emplace_back(std::to_string(i));
1148
1149 return Vary(colName, expression, std::move(variationTags), variationName);
1150 }
1151
1152 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1153 /// \param[in] colNames set of names of the columns for which varied values are provided.
1154 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1155 /// values for the specified columns.
1156 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1157 /// `"1"`, etc.
1158 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1159 ///
1160 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1161 /// compiled. It takes an nVariations parameter instead of a list of tag names.
1162 /// The varied results will be accessible via the keys of the dictionary with the form `variationName:N` where `N`
1163 /// is the corresponding sequential tag starting at 0 and going up to `nVariations - 1`.
1164 /// The example below shows how Vary() is used while dealing with multiple columns.
1165 ///
1166 /// ~~~{.cpp}
1167 /// auto nominal_hx =
1168 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1169 /// .Histo1D("x", "y");
1170 ///
1171 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1172 /// hx["nominal"].Draw();
1173 /// hx["xy:0"].Draw("SAME");
1174 /// hx["xy:1"].Draw("SAME");
1175 /// ~~~
1176 ///
1177 /// ## Short-hand expression syntax
1178 ///
1179 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1180 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1181 /// the following is equivalent to the example above:
1182 ///
1183 /// ~~~{.cpp}
1184 /// auto nominal_hx =
1185 /// df.Vary({"x", "y"}, "{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", 2, "xy")
1186 /// // Same as above
1187 /// ~~~
1188 ///
1189 /// or also:
1190 ///
1191 /// ~~~{.cpp}
1192 /// auto nominal_hx =
1193 /// df.Vary({"x", "y"}, R"(
1194 /// {
1195 /// {x*0.9, x*1.1}, // x variations
1196 /// {y*0.9, y*1.1} // y variations
1197 /// }
1198 /// )", 2, "xy")
1199 /// // Same as above
1200 /// ~~~
1201 ///
1202 /// \note See also This Vary() overload for more information.
1203 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1204 std::size_t nVariations, std::string_view variationName)
1205 {
1206 std::vector<std::string> variationTags;
1207 variationTags.reserve(nVariations);
1208 for (std::size_t i = 0u; i < nVariations; ++i)
1209 variationTags.emplace_back(std::to_string(i));
1210
1211 return Vary(colNames, expression, std::move(variationTags), variationName);
1212 }
1213
1214 /// \brief Register systematic variations for multiple existing columns using auto-generated variation tags.
1215 /// \param[in] colNames set of names of the columns for which varied values are provided.
1216 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied
1217 /// values for the specified column.
1218 /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`,
1219 /// `"1"`, etc.
1220 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1221 /// colName is used if none is provided.
1222 ///
1223 /// \note This overload ensures that the ambiguity between C++20 string, vector<string> construction from init list
1224 /// is avoided.
1225 ///
1226 /// \note See also This Vary() overload for more information.
1227 RInterface<Proxied> Vary(std::initializer_list<std::string> colNames, std::string_view expression,
1228 std::size_t nVariations, std::string_view variationName)
1229 {
1230 return Vary(std::vector<std::string>(colNames), expression, nVariations, variationName);
1231 }
1232
1233 /// \brief Register systematic variations for multiple existing columns using custom variation tags.
1234 /// \param[in] colNames set of names of the columns for which varied values are provided.
1235 /// \param[in] expression a string containing valid C++ code that evaluates to an RVec of RVecs containing the varied
1236 /// values for the specified columns.
1237 /// \param[in] variationTags names for each of the varied values, e.g. `"up"` and `"down"`.
1238 /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`.
1239 ///
1240 /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time
1241 /// compiled. The example below shows how Vary() is used while dealing with multiple columns. The tags are defined as
1242 /// `{"down", "up"}`.
1243 /// ~~~{.cpp}
1244 /// auto nominal_hx =
1245 /// df.Vary({"x", "y"}, "ROOT::RVec<ROOT::RVecD>{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1246 /// .Histo1D("x", "y");
1247 ///
1248 /// auto hx = ROOT::RDF::Experimental::VariationsFor(nominal_hx);
1249 /// hx["nominal"].Draw();
1250 /// hx["xy:down"].Draw("SAME");
1251 /// hx["xy:up"].Draw("SAME");
1252 /// ~~~
1253 ///
1254 /// ## Short-hand expression syntax
1255 ///
1256 /// For convenience, when a C++ expression is passed to Vary, the return type can be omitted if the string begins
1257 /// with '{' and ends with '}' (whitespace, tab and newline characters are excluded from the search). This means that
1258 /// the following is equivalent to the example above:
1259 ///
1260 /// ~~~{.cpp}
1261 /// auto nominal_hx =
1262 /// df.Vary({"x", "y"}, "{{x*0.9, x*1.1}, {y*0.9, y*1.1}}", {"down", "up"}, "xy")
1263 /// // Same as above
1264 /// ~~~
1265 ///
1266 /// or also:
1267 ///
1268 /// ~~~{.cpp}
1269 /// auto nominal_hx =
1270 /// df.Vary({"x", "y"}, R"(
1271 /// {
1272 /// {x*0.9, x*1.1}, // x variations
1273 /// {y*0.9, y*1.1} // y variations
1274 /// }
1275 /// )", {"down", "up"}, "xy")
1276 /// // Same as above
1277 /// ~~~
1278 ///
1279 /// \note See also This Vary() overload for more information.
1280 RInterface<Proxied> Vary(const std::vector<std::string> &colNames, std::string_view expression,
1281 const std::vector<std::string> &variationTags, std::string_view variationName)
1282 {
1283 return JittedVaryImpl(colNames, expression, variationTags, variationName, /*isSingleColumn=*/false);
1284 }
1285
1286 ////////////////////////////////////////////////////////////////////////////
1287 /// \brief Allow to refer to a column with a different name.
1288 /// \param[in] alias name of the column alias
1289 /// \param[in] columnName of the column to be aliased
1290 /// \return the first node of the computation graph for which the alias is available.
1291 ///
1292 /// Aliasing an alias is supported.
1293 ///
1294 /// ### Example usage:
1295 /// ~~~{.cpp}
1296 /// auto df_with_alias = df.Alias("simple_name", "very_long&complex_name!!!");
1297 /// ~~~
1298 RInterface<Proxied> Alias(std::string_view alias, std::string_view columnName)
1299 {
1300 // The symmetry with Define is clear. We want to:
1301 // - Create globally the alias and return this very node, unchanged
1302 // - Make aliases accessible based on chains and not globally
1303
1304 // Helper to find out if a name is a column
// NOTE(review): the helper announced above (embedded line 1305) is missing from this listing.
1306
// Name of this method as a string literal. NOTE(review): its uses are not in the visible lines.
1307 constexpr auto where = "Alias";
1309 // If the alias name is a column name, there is a problem
// NOTE(review): the check announced above (embedded line 1310) is missing from this listing.
1311
// Validate the target column name; a single name goes in, so the first element of the result is taken.
1312 const auto validColumnName = GetValidatedColumnNames(1, {std::string(columnName)})[0];
1313
// NOTE(review): newCols (embedded line 1314) and newInterface (embedded line 1317) are declared on lines
// missing from this listing.
1315 newCols.AddAlias(alias, validColumnName);
1316
1318
1319 return newInterface;
1320 }
1321
1322 // clang-format off
1323 ////////////////////////////////////////////////////////////////////////////
1324 /// \brief Creates a node that filters entries based on range: [begin, end).
1325 /// \param[in] begin Initial entry number considered for this range.
1326 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1327 /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0.
1328 /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries.
1329 ///
1330 /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset.
1331 /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported.
1332 ///
1333 /// ### Example usage:
1334 /// ~~~{.cpp}
1335 /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries
1336 /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards
1337 /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3
1338 /// ~~~
1339 // clang-format on
1340 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int begin, unsigned int end, unsigned int stride = 1)
1341 {
1342 // check invariants
// A zero stride is rejected, and so is a non-zero end that lies before begin. end == 0 is exempt from the
// ordering check, and end == begin passes it.
1343 if (stride == 0 || (end != 0 && end < begin))
1344 throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin.");
// Called with the name of this method; presumably rejects the call when implicit multi-threading is enabled
// (the method documentation states that ranges are unavailable in that case) -- confirm against the helper.
1345 CheckIMTDisabled("Range");
1346
1347 using Range_t = RDFDetail::RRange<Proxied>;
// The new range node is chained to the node proxied by this interface.
1348 auto rangePtr = std::make_shared<Range_t>(begin, end, stride, fProxiedPtr);
// NOTE(review): the statement that builds newInterface (embedded line 1349) is missing from this listing.
1350 return newInterface;
1351 }
1352
1353 // clang-format off
1354 ////////////////////////////////////////////////////////////////////////////
1355 /// \brief Creates a node that filters entries based on range.
1356 /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset.
1357 /// \return a node of the computation graph for which the range is defined.
1358 ///
1359 /// See the other Range overload for a detailed description.
1360 // clang-format on
// Convenience overload: forwards to Range(begin, end, stride) with begin = 0 and stride = 1.
1361 RInterface<RDFDetail::RRange<Proxied>> Range(unsigned int end) { return Range(0, end, 1); }
1362
1363 /// \}
1364 // ---------------------------------------------------------------------------------
1365 // End of the doxygen group for Transformations
1366
1367 /// \name Actions
1368 /// Actions declare a type of result to be produced, for example histograms or summary statistics.
1369 /// Actions are lazy, i.e. they are only executed once a result is requested.
1370 /// \{
1371
1372 ////////////////////////////////////////////////////////////////////////////
1373 /// \brief Return the number of entries processed (*lazy action*).
1374 /// \return the number of entries wrapped in a RResultPtr.
1375 ///
1376 /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`).
1377 /// This action is *lazy*: upon invocation of this method the calculation is
1378 /// booked but not executed. Also see RResultPtr.
1379 ///
1380 /// ### Example usage:
1381 /// ~~~{.cpp}
1382 /// auto nEntriesAfterCuts = myFilteredDf.Count();
1383 /// ~~~
1384 ///
// NOTE(review): the signature line of this method (documented just above as the entry-counting action)
// is not visible in this listing — confirm against the original header.
1386 {
1387 const auto nSlots = fLoopManager->GetNSlots();
// Shared counter, initialised to 0; the same pointer is given to the helper and to the returned result.
1388 auto cSPtr = std::make_shared<ULong64_t>(0);
1389 using Helper_t = RDFInternal::CountHelper;
// NOTE(review): the `Action_t` alias and the remaining arguments of this make_unique call are not
// visible in this listing. The action is built with an empty column list.
1391 auto action = std::make_unique<Action_t>(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr,
// Ownership of the action is moved into MakeResultPtr together with the shared counter.
1393 return MakeResultPtr(cSPtr, *fLoopManager, std::move(action));
1394 }
1395
1396 ////////////////////////////////////////////////////////////////////////////
1397 /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default).
1398 /// \tparam T The type of the column.
1399 /// \tparam COLL The type of collection used to store the values.
1400 /// \param[in] column The name of the column to collect the values of.
1401 /// \return the content of the selected column wrapped in a RResultPtr.
1402 ///
1403 /// The collection type to be specified for C-style array columns is `RVec<T>`:
1404 /// in this case the returned collection is a `std::vector<RVec<T>>`.
1405 /// ### Example usage:
1406 /// ~~~{.cpp}
1407 /// // In this case intCol is a std::vector<int>
1408 /// auto intCol = rdf.Take<int>("integerColumn");
1409 /// // Same content as above but in this case taken as a RVec<int>
1410 /// auto intColAsRVec = rdf.Take<int, RVec<int>>("integerColumn");
1411 /// // In this case intCol is a std::vector<RVec<int>>, a collection of collections
1412 /// auto cArrayIntCol = rdf.Take<RVec<int>>("cArrayInt");
1413 /// ~~~
1414 /// This action is *lazy*: upon invocation of this method the calculation is
1415 /// booked but not executed. Also see RResultPtr.
1416 template <typename T, typename COLL = std::vector<T>>
1417 RResultPtr<COLL> Take(std::string_view column = "")
1418 {
// An empty `column` yields an empty column list; otherwise the list holds exactly that one name.
1419 const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)});
1420
// NOTE(review): the statements that declare `validColumnNames` (used below) are not visible in this
// listing — confirm against the original header.
1423
1424 using Helper_t = RDFInternal::TakeHelper<T, T, COLL>;
// NOTE(review): the `Action_t` alias used below is declared on a line not visible here.
// Output collection, default-constructed; shared between the helper and the returned result.
1426 auto valuesPtr = std::make_shared<COLL>();
1427 const auto nSlots = fLoopManager->GetNSlots();
1428
1429 auto action =
1430 std::make_unique<Action_t>(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister);
// Ownership of the action is moved into MakeResultPtr together with the shared collection.
1431 return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action));
1432 }
1433
1434 ////////////////////////////////////////////////////////////////////////////
1435 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1436 /// \tparam V The type of the column used to fill the histogram.
1437 /// \param[in] model The returned histogram will be constructed using this as a model.
1438 /// \param[in] vName The name of the column that will fill the histogram.
1439 /// \return the monodimensional histogram wrapped in a RResultPtr.
1440 ///
1441 /// Columns can be of a container type (e.g. `std::vector<double>`), in which case the histogram
1442 /// is filled with each one of the elements of the container. In case multiple columns of container type
1443 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1444 /// possibly different lengths between events).
1445 /// This action is *lazy*: upon invocation of this method the calculation is
1446 /// booked but not executed. Also see RResultPtr.
1447 ///
1448 /// ### Example usage:
1449 /// ~~~{.cpp}
1450 /// // Deduce column type (this invocation needs jitting internally)
1451 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1452 /// // Explicit column type
1453 /// auto myHist2 = myDf.Histo1D<float>({"histName", "histTitle", 64u, 0., 128.}, "myColumn");
1454 /// ~~~
1455 ///
1456 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1457 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1458 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1459 template <typename V = RDFDetail::RInferredType>
1460 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "")
1461 {
// An empty `vName` yields an empty column list; otherwise the list holds exactly that one name.
1462 const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)});
1463
// NOTE(review): a statement between here and the histogram creation is not visible in this listing.
1465
1466 std::shared_ptr<::TH1D> h(nullptr);
1467 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1468 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1469 h = model.GetHistogram();
1470 }
1471
// When the x axis has identical lower and upper limits (as in the default model, 0. and 0.),
// SetCanExtend(::TH1::kAllAxes) is called on the histogram.
1472 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1473 h->SetCanExtend(::TH1::kAllAxes);
// NOTE(review): the return statement of this method is not visible in this listing.
1475 }
1476
1477 ////////////////////////////////////////////////////////////////////////////
1478 /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*).
1479 /// \tparam V The type of the column used to fill the histogram.
1480 /// \param[in] vName The name of the column that will fill the histogram.
1481 /// \return the monodimensional histogram wrapped in a RResultPtr.
1482 ///
1483 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1484 /// The "name" and "title" strings are built starting from the input column name.
1485 /// See the description of the first Histo1D() overload for more details.
1486 ///
1487 /// ### Example usage:
1488 /// ~~~{.cpp}
1489 /// // Deduce column type (this invocation needs jitting internally)
1490 /// auto myHist1 = myDf.Histo1D("myColumn");
1491 /// // Explicit column type
1492 /// auto myHist2 = myDf.Histo1D<float>("myColumn");
1493 /// ~~~
1494 template <typename V = RDFDetail::RInferredType>
// NOTE(review): the signature line of this overload is not visible in this listing; the body reads a
// parameter named `vName`.
1496 {
// The histogram name is the column name itself.
1497 const auto h_name = std::string(vName);
// Title string "<column>;<column>;count": three ';'-separated fields built from the column name.
1498 const auto h_title = h_name + ";" + h_name + ";count";
// Delegate to the model-based overload with a 128-bin model whose axis limits are both 0.
1499 return Histo1D<V>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName);
1500 }
1501
1502 ////////////////////////////////////////////////////////////////////////////
1503 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1504 /// \tparam V The type of the column used to fill the histogram.
1505 /// \tparam W The type of the column used as weights.
1506 /// \param[in] model The returned histogram will be constructed using this as a model.
1507 /// \param[in] vName The name of the column that will fill the histogram.
1508 /// \param[in] wName The name of the column that will provide the weights.
1509 /// \return the monodimensional histogram wrapped in a RResultPtr.
1510 ///
1511 /// See the description of the first Histo1D() overload for more details.
1512 ///
1513 /// ### Example usage:
1514 /// ~~~{.cpp}
1515 /// // Deduce column type (this invocation needs jitting internally)
1516 /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1517 /// // Explicit column type
1518 /// auto myHist2 = myDf.Histo1D<float, int>({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight");
1519 /// ~~~
1520 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1521 RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
1522 {
// NOTE(review): the expression that turns `columnViews` into the column list is only partially visible
// in this listing (its first and last lines are missing) — confirm against the original header.
1523 const std::vector<std::string_view> columnViews = {vName, wName};
1525 ? ColumnNames_t()
1527 std::shared_ptr<::TH1D> h(nullptr);
1528 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1529 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1530 h = model.GetHistogram();
1531 }
1532
// When the x axis has identical lower and upper limits, SetCanExtend(::TH1::kAllAxes) is called.
1533 if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin())
1534 h->SetCanExtend(::TH1::kAllAxes);
// NOTE(review): the return statement of this method is not visible in this listing.
1536 }
1537
1538 ////////////////////////////////////////////////////////////////////////////
1539 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1540 /// \tparam V The type of the column used to fill the histogram.
1541 /// \tparam W The type of the column used as weights.
1542 /// \param[in] vName The name of the column that will fill the histogram.
1543 /// \param[in] wName The name of the column that will provide the weights.
1544 /// \return the monodimensional histogram wrapped in a RResultPtr.
1545 ///
1546 /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.).
1547 /// The "name" and "title" strings are built starting from the input column names.
1548 /// See the description of the first Histo1D() overload for more details.
1549 ///
1550 /// ### Example usage:
1551 /// ~~~{.cpp}
1552 /// // Deduce column types (this invocation needs jitting internally)
1553 /// auto myHist1 = myDf.Histo1D("myValue", "myweight");
1554 /// // Explicit column types
1555 /// auto myHist2 = myDf.Histo1D<float, int>("myValue", "myweight");
1556 /// ~~~
1557 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1558 RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName)
1559 {
1560 // We build name and title based on the value and weight column names
// Owning copies of the two views, needed for the string concatenations below.
1561 std::string str_vName{vName};
1562 std::string str_wName{wName};
// Name: "<value>_weighted_<weight>".
1563 const auto h_name = str_vName + "_weighted_" + str_wName;
// Title string with three ';'-separated fields:
// "<value>, weights: <weight>", "<value>" and "count * <weight>".
1564 const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName;
// Delegate to the model-based weighted overload with a 128-bin model whose axis limits are both 0.
1565 return Histo1D<V, W>({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName);
1566 }
1567
1568 ////////////////////////////////////////////////////////////////////////////
1569 /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*).
1570 /// \tparam V The type of the column used to fill the histogram.
1571 /// \tparam W The type of the column used as weights.
1572 /// \param[in] model The returned histogram will be constructed using this as a model.
1573 /// \return the monodimensional histogram wrapped in a RResultPtr.
1574 ///
1575 /// This overload will use the first two default columns as column names.
1576 /// See the description of the first Histo1D() overload for more details.
1577 template <typename V, typename W>
1578 RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.})
1579 {
// Forward to the (model, vName, wName) overload with both column names empty.
1580 return Histo1D<V, W>(model, "", "");
1581 }
1582
1583 ////////////////////////////////////////////////////////////////////////////
1584 /// \brief Fill and return a two-dimensional histogram (*lazy action*).
1585 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
1586 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
1587 /// \param[in] model The returned histogram will be constructed using this as a model.
1588 /// \param[in] v1Name The name of the column that will fill the x axis.
1589 /// \param[in] v2Name The name of the column that will fill the y axis.
1590 /// \return the bidimensional histogram wrapped in a RResultPtr.
1591 ///
1592 /// Columns can be of a container type (e.g. std::vector<double>), in which case the histogram
1593 /// is filled with each one of the elements of the container. In case multiple columns of container type
1594 /// are provided (e.g. values and weights) they must have the same length for each one of the events (but
1595 /// possibly different lengths between events).
1596 /// This action is *lazy*: upon invocation of this method the calculation is
1597 /// booked but not executed. Also see RResultPtr.
1598 ///
1599 /// ### Example usage:
1600 /// ~~~{.cpp}
1601 /// // Deduce column types (this invocation needs jitting internally)
1602 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1603 /// // Explicit column types
1604 /// auto myHist2 = myDf.Histo2D<float, float>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY");
1605 /// ~~~
1606 ///
1607 ///
1608 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1609 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1610 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1611 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
1612 RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
1613 {
1614 std::shared_ptr<::TH2D> h(nullptr);
1615 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1616 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1617 h = model.GetHistogram();
1618 }
// Unlike the 1D case above, a model for which HasAxisLimits reports false is rejected outright.
1619 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
1620 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
1621 }
// NOTE(review): the expression that turns `columnViews` into the column list, and the return statement,
// are only partially visible in this listing — confirm against the original header.
1622 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
1624 ? ColumnNames_t()
1627 }
1628
1629 ////////////////////////////////////////////////////////////////////////////
1630 /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*).
1631 /// \tparam V1 The type of the column used to fill the x axis of the histogram.
1632 /// \tparam V2 The type of the column used to fill the y axis of the histogram.
1633 /// \tparam W The type of the column used for the weights of the histogram.
1634 /// \param[in] model The returned histogram will be constructed using this as a model.
1635 /// \param[in] v1Name The name of the column that will fill the x axis.
1636 /// \param[in] v2Name The name of the column that will fill the y axis.
1637 /// \param[in] wName The name of the column that will provide the weights.
1638 /// \return the bidimensional histogram wrapped in a RResultPtr.
1639 ///
1640 /// This action is *lazy*: upon invocation of this method the calculation is
1641 /// booked but not executed. Also see RResultPtr.
1642 ///
1643 /// ### Example usage:
1644 /// ~~~{.cpp}
1645 /// // Deduce column types (this invocation needs jitting internally)
1646 /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
1647 /// // Explicit column types
1648 /// auto myHist2 = myDf.Histo2D<float, float, double>({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight");
1649 /// ~~~
1650 ///
1651 /// See the documentation of the first Histo2D() overload for more details.
1652 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1653 typename W = RDFDetail::RInferredType>
// NOTE(review): the return-type line of this declaration is not visible in this listing.
1655 Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
1656 {
1657 std::shared_ptr<::TH2D> h(nullptr);
1658 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1659 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1660 h = model.GetHistogram();
1661 }
// A model for which HasAxisLimits reports false is rejected.
1662 if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) {
1663 throw std::runtime_error("2D histograms with no axes limits are not supported yet.");
1664 }
// The weight column is listed after the two value columns.
// NOTE(review): the expression that turns `columnViews` into the column list, and the return statement,
// are only partially visible in this listing — confirm against the original header.
1665 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
1667 ? ColumnNames_t()
1670 }
1671
// Overload with explicit column types and no column names.
1672 template <typename V1, typename V2, typename W>
// NOTE(review): the signature line of this overload is not visible in this listing; the body reads a
// parameter named `model`.
1674 {
// Forward to the (model, v1Name, v2Name, wName) overload with all three column names empty.
1675 return Histo2D<V1, V2, W>(model, "", "", "");
1676 }
1677
1678 ////////////////////////////////////////////////////////////////////////////
1679 /// \brief Fill and return a three-dimensional histogram (*lazy action*).
1680 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1681 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1682 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1683 /// \param[in] model The returned histogram will be constructed using this as a model.
1684 /// \param[in] v1Name The name of the column that will fill the x axis.
1685 /// \param[in] v2Name The name of the column that will fill the y axis.
1686 /// \param[in] v3Name The name of the column that will fill the z axis.
1687 /// \return the tridimensional histogram wrapped in a RResultPtr.
1688 ///
1689 /// This action is *lazy*: upon invocation of this method the calculation is
1690 /// booked but not executed. Also see RResultPtr.
1691 ///
1692 /// ### Example usage:
1693 /// ~~~{.cpp}
1694 /// // Deduce column types (this invocation needs jitting internally)
1695 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1696 /// "myValueX", "myValueY", "myValueZ");
1697 /// // Explicit column types
1698 /// auto myHist2 = myDf.Histo3D<double, double, float>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1699 /// "myValueX", "myValueY", "myValueZ");
1700 /// ~~~
1701 /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D
1702 /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in
1703 /// the RDataFrame description.
1704 /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory
1705 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
1706 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
1707 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1708 typename V3 = RDFDetail::RInferredType>
1709 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "",
1710 std::string_view v3Name = "")
1711 {
1712 std::shared_ptr<::TH3D> h(nullptr);
1713 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1714 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1715 h = model.GetHistogram();
1716 }
// A model for which HasAxisLimits reports false is rejected.
1717 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
1718 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
1719 }
// NOTE(review): the expression that turns `columnViews` into the column list, and the return statement,
// are only partially visible in this listing — confirm against the original header.
1720 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
1722 ? ColumnNames_t()
1725 }
1726
1727 ////////////////////////////////////////////////////////////////////////////
1728 /// \brief Fill and return a three-dimensional histogram (*lazy action*).
1729 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
1730 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
1731 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
1732 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
1733 /// \param[in] model The returned histogram will be constructed using this as a model.
1734 /// \param[in] v1Name The name of the column that will fill the x axis.
1735 /// \param[in] v2Name The name of the column that will fill the y axis.
1736 /// \param[in] v3Name The name of the column that will fill the z axis.
1737 /// \param[in] wName The name of the column that will provide the weights.
1738 /// \return the tridimensional histogram wrapped in a RResultPtr.
1739 ///
1740 /// This action is *lazy*: upon invocation of this method the calculation is
1741 /// booked but not executed. Also see RResultPtr.
1742 ///
1743 /// ### Example usage:
1744 /// ~~~{.cpp}
1745 /// // Deduce column types (this invocation needs jitting internally)
1746 /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1747 /// "myValueX", "myValueY", "myValueZ", "myWeight");
1748 /// // Explicit column types
1749 /// using d_t = double;
1750 /// auto myHist2 = myDf.Histo3D<d_t, d_t, float, d_t>({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.},
1751 /// "myValueX", "myValueY", "myValueZ", "myWeight");
1752 /// ~~~
1753 ///
1754 ///
1755 /// See the documentation of the first Histo3D() overload for more details.
1756 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
1757 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
1758 RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name,
1759 std::string_view v3Name, std::string_view wName)
1760 {
1761 std::shared_ptr<::TH3D> h(nullptr);
1762 {
// `iel` is confined to this scope, so it is destroyed right after the histogram has been obtained.
// Presumably it silences messages below kError while the model builds the histogram — helper not shown.
1763 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1764 h = model.GetHistogram();
1765 }
// A model for which HasAxisLimits reports false is rejected.
1766 if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) {
1767 throw std::runtime_error("3D histograms with no axes limits are not supported yet.");
1768 }
// The weight column is listed after the three value columns.
// NOTE(review): the expression that turns `columnViews` into the column list, and the return statement,
// are only partially visible in this listing — confirm against the original header.
1769 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
1771 ? ColumnNames_t()
1774 }
1775
// Overload with explicit column types and no column names.
1776 template <typename V1, typename V2, typename V3, typename W>
// NOTE(review): the signature line of this overload is not visible in this listing; the body reads a
// parameter named `model`.
1778 {
// Forward to the (model, v1Name, v2Name, v3Name, wName) overload with all four column names empty.
1779 return Histo3D<V1, V2, V3, W>(model, "", "", "", "");
1780 }
1781
1782 ////////////////////////////////////////////////////////////////////////////
1783 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
1784 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
1785 /// present.
1786 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
1787 /// object.
1788 /// \param[in] model The returned histogram will be constructed using this as a model.
1789 /// \param[in] columnList
1790 /// A list containing the names of the columns that will be passed when calling `Fill`.
1791 /// \param[in] wName The name of the column that will provide the weights.
1792 /// \return the N-dimensional histogram wrapped in a RResultPtr.
1793 ///
1794 /// This action is *lazy*: upon invocation of this method the calculation is
1795 /// booked but not executed. See RResultPtr documentation.
1796 ///
1797 /// ### Example usage:
1798 /// ~~~{.cpp}
1799 /// auto myFilledObj = myDf.HistoND<float, float, float, float>({"name","title", 4,
1800 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
1801 /// {"col0", "col1", "col2", "col3"});
1802 /// ~~~
1803 ///
1804 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
1805 /// argument `wName`: `HistoND(model, cols, weightCol)`.
1806 ///
1807 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
1808 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
1809 {
1810 std::shared_ptr<::THnD> h(nullptr);
1811 {
// `iel` stays alive for this whole scope, i.e. for the histogram creation and the validation below.
1812 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1813 h = model.GetHistogram();
1814 const auto hDims = h->GetNdimensions();
// `nCols` is given the same type as `hDims`, so the comparisons below are between like types.
1815 decltype(hDims) nCols = columnList.size();
1816
// The weight was supplied twice: as `wName` and as an extra trailing entry of `columnList`.
1817 if (!wName.empty() && nCols == hDims + 1)
1818 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
1819 "input columns contains one column more than the number of dimensions of the "
1820 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
1821
// Legacy calling convention (weight as last entry of `columnList`): still accepted, with a warning.
1822 if (nCols == hDims + 1)
1823 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
1824 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
1825
// Sumw2() is called whenever a weight is supplied, by either convention.
1826 if (!wName.empty() || nCols == hDims + 1)
1827 h->Sumw2();
1828
// Only `hDims` or `hDims + 1` columns are accepted.
1829 if (nCols != hDims + 1 && nCols != hDims)
1830 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
1831 }
1832
1833 if (!wName.empty()) {
1834 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
1835 // passed arguments is one more than the number of dimensions of the histogram.
// NOTE(review): the declaration of `userColumns` is not visible in this listing; presumably it starts
// as a copy of `columnList` — confirm against the original header.
1837 userColumns.push_back(std::string{wName});
1838 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(userColumns, h, h,
1839 fProxiedPtr);
1840 }
// No separate weight column: the action is created from `columnList` as given.
1841 return CreateAction<RDFInternal::ActionTags::HistoND, FirstColumn, OtherColumns...>(columnList, h, h,
1842 fProxiedPtr);
1843 }
1844
1845 ////////////////////////////////////////////////////////////////////////////
1846 /// \brief Fill and return an N-dimensional histogram (*lazy action*).
1847 /// \param[in] model The returned histogram will be constructed using this as a model.
1848 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
1849 /// \param[in] wName The name of the column that will provide the weights.
1850 /// \return the N-dimensional histogram wrapped in a RResultPtr.
1851 ///
1852 /// This action is *lazy*: upon invocation of this method the calculation is
1853 /// booked but not executed. Also see RResultPtr.
1854 ///
1855 /// ### Example usage:
1856 /// ~~~{.cpp}
1857 /// auto myFilledObj = myDf.HistoND({"name","title", 4,
1858 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
1859 /// {"col0", "col1", "col2", "col3"});
1860 /// ~~~
1861 ///
1862 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
1863 /// argument `wName`: `HistoND(model, cols, weightCol)`.
1864 ///
1865 RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
1866 {
1867 std::shared_ptr<::THnD> h(nullptr);
1868 {
// `iel` stays alive for this whole scope, i.e. for the histogram creation and the validation below.
1869 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1870 h = model.GetHistogram();
1871 const auto hDims = h->GetNdimensions();
// `nCols` is given the same type as `hDims`, so the comparisons below are between like types.
1872 decltype(hDims) nCols = columnList.size();
1873
// The weight was supplied twice: as `wName` and as an extra trailing entry of `columnList`.
1874 if (!wName.empty() && nCols == hDims + 1)
1875 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
1876 "input columns contains one column more than the number of dimensions of the "
1877 "histogram. Call as 'HistoND(model, cols, weightCol)'.");
1878
// Legacy calling convention (weight as last entry of `columnList`): still accepted, with a warning.
1879 if (nCols == hDims + 1)
1880 Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. "
1881 "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'.");
1882
// Sumw2() is called whenever a weight is supplied, by either convention.
1883 if (!wName.empty() || nCols == hDims + 1)
1884 h->Sumw2();
1885
// Only `hDims` or `hDims + 1` columns are accepted.
1886 if (nCols != hDims + 1 && nCols != hDims)
1887 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
1888 }
1889
1890 if (!wName.empty()) {
1891 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
1892 // passed arguments is one more than the number of dimensions of the histogram.
// NOTE(review): the declaration of `userColumns` and the beginning of the return statement that ends
// with `userColumns.size());` are not visible in this listing — confirm against the original header.
1894 userColumns.push_back(std::string{wName});
1896 userColumns.size());
1897 }
// NOTE(review): the beginning of the final return statement is not visible either; its last argument is
// the number of entries in `columnList`.
1899 columnList.size());
1900 }
1901
1902 ////////////////////////////////////////////////////////////////////////////
1903 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
1904 /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not
1905 /// present.
1906 /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the
1907 /// object.
1908 /// \param[in] model The returned histogram will be constructed using this as a model.
1909 /// \param[in] columnList
1910 /// A list containing the names of the columns that will be passed when calling `Fill`.
1911 /// \param[in] wName The name of the column that will provide the weights.
1912 /// \return the N-dimensional histogram wrapped in a RResultPtr.
1913 ///
1914 /// This action is *lazy*: upon invocation of this method the calculation is
1915 /// booked but not executed. See RResultPtr documentation.
1916 ///
1917 /// ### Example usage:
1918 /// ~~~{.cpp}
1919 /// auto myFilledObj = myDf.HistoNSparseD<float, float, float, float>({"name","title", 4,
1920 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
1921 /// {"col0", "col1", "col2", "col3"});
1922 /// ~~~
1923 ///
1924 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
1925 /// argument `wName`: `HistoNSparseD(model, cols, weightCol)`.
1926 ///
1927 template <typename FirstColumn, typename... OtherColumns> // need FirstColumn to disambiguate overloads
// NOTE(review): the return-type line of this declaration is not visible in this listing.
1929 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
1930 {
1931 std::shared_ptr<::THnSparseD> h(nullptr);
1932 {
// `iel` stays alive for this whole scope, i.e. for the histogram creation and the validation below.
1933 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1934 h = model.GetHistogram();
1935 const auto hDims = h->GetNdimensions();
// `nCols` is given the same type as `hDims`, so the comparisons below are between like types.
1936 decltype(hDims) nCols = columnList.size();
1937
// The weight was supplied twice: as `wName` and as an extra trailing entry of `columnList`.
1938 if (!wName.empty() && nCols == hDims + 1)
1939 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
1940 "input columns contains one column more than the number of dimensions of the "
1941 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
1942
// Legacy calling convention (weight as last entry of `columnList`): still accepted, with a warning.
1943 if (nCols == hDims + 1)
1944 Warning("HistoNSparseD",
1945 "Passing the column with the weights as the last column in the list is deprecated. "
1946 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
1947
// Sumw2() is called whenever a weight is supplied, by either convention.
1948 if (!wName.empty() || nCols == hDims + 1)
1949 h->Sumw2();
1950
// Only `hDims` or `hDims + 1` columns are accepted.
1951 if (nCols != hDims + 1 && nCols != hDims)
1952 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
1953 }
1954
1955 if (!wName.empty()) {
1956 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
1957 // passed arguments is one more than the number of dimensions of the histogram.
// NOTE(review): the declaration of `userColumns` is not visible in this listing; presumably it starts
// as a copy of `columnList` — confirm against the original header.
1959 userColumns.push_back(std::string{wName});
1960 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(userColumns, h, h,
1961 fProxiedPtr);
1962 }
// No separate weight column: the action is created from `columnList` as given.
1963 return CreateAction<RDFInternal::ActionTags::HistoNSparseD, FirstColumn, OtherColumns...>(columnList, h, h,
1964 fProxiedPtr);
1965 }
1966
1967 ////////////////////////////////////////////////////////////////////////////
1968 /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*).
1969 /// \param[in] model The returned histogram will be constructed using this as a model.
1970 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
1971 /// \param[in] wName The name of the column that will provide the weights.
1972 /// \return the N-dimensional histogram wrapped in a RResultPtr.
1973 ///
1974 /// This action is *lazy*: upon invocation of this method the calculation is
1975 /// booked but not executed. Also see RResultPtr.
1976 ///
1977 /// ### Example usage:
1978 /// ~~~{.cpp}
1979 /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4,
1980 /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}},
1981 /// {"col0", "col1", "col2", "col3"});
1982 /// ~~~
1983 ///
1984 /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new
1985 /// argument `wName`: `HistoND(model, cols, weightCol)`.
1986 ///
1988 HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "")
1989 {
1990 std::shared_ptr<::THnSparseD> h(nullptr);
1991 {
1992 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
1993 h = model.GetHistogram();
1994 const auto hDims = h->GetNdimensions();
1995 decltype(hDims) nCols = columnList.size();
1996
1997 if (!wName.empty() && nCols == hDims + 1)
1998 throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of "
1999 "input columns contains one column more than the number of dimensions of the "
2000 "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'.");
2001
2002 if (nCols == hDims + 1)
2003 Warning("HistoNSparseD",
2004 "Passing the column with the weights as the last column in the list is deprecated. "
2005 "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'.");
2006
2007 if (!wName.empty() || nCols == hDims + 1)
2008 h->Sumw2();
2009
2010 if (nCols != hDims + 1 && nCols != hDims)
2011 throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes.");
2012 }
2013
2014 if (!wName.empty()) {
2015 // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of
2016 // passed arguments is one more the number of dimensions of the histogram.
2018 userColumns.push_back(std::string{wName});
2021 }
2023 columnList, h, h, fProxiedPtr, columnList.size());
2024 }
2025
2026#ifdef R__HAS_ROOT7
2027 ////////////////////////////////////////////////////////////////////////////
2028 /// \brief Fill and return a one-dimensional RHist (*lazy action*).
2029 /// \tparam BinContentType The bin content type of the returned RHist.
2030 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2031 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2032 /// \param[in] vName The name of the column that will fill the histogram.
2033 /// \return the histogram wrapped in a RResultPtr.
2034 ///
2035 /// This action is *lazy*: upon invocation of this method the calculation is
2036 /// booked but not executed. Also see RResultPtr.
2037 ///
2038 /// ### Example usage:
2039 /// ~~~{.cpp}
2040 /// auto myHist = myDf.Hist(10, {5, 15}, "col0");
2041 /// ~~~
2042 template <typename BinContentType = double, typename V = RDFDetail::RInferredType>
2044 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName)
2045 {
2046 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2047
2048 const ColumnNames_t columnList = {std::string(vName)};
2049
2050 return Hist<V>(h, columnList);
2051 }
2052
2053 ////////////////////////////////////////////////////////////////////////////
2054 /// \brief Fill and return an RHist (*lazy action*).
2055 /// \tparam BinContentType The bin content type of the returned RHist.
2056 /// \param[in] axes The returned histogram will be constructed using these axes.
2057 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2058 /// \return the histogram wrapped in a RResultPtr.
2059 ///
2060 /// This action is *lazy*: upon invocation of this method the calculation is
2061 /// booked but not executed. Also see RResultPtr.
2062 ///
2063 /// ### Example usage:
2064 /// ~~~{.cpp}
2065 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2066 /// auto myHist = myDf.Hist({axis}, {"col0"});
2067 /// ~~~
2068 template <typename BinContentType = double, typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes>
2070 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList)
2071 {
2072 if (axes.size() != columnList.size()) {
2073 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2074 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2075 throw std::invalid_argument(msg);
2076 }
2077
2078 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2079
2080 return Hist<ColumnType, ColumnTypes...>(h, columnList);
2081 }
2082
2083 ////////////////////////////////////////////////////////////////////////////
2084 /// \brief Fill the provided RHist (*lazy action*).
2085 /// \param[in] h The histogram that should be filled.
2086 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2087 /// \return the histogram wrapped in a RResultPtr.
2088 ///
2089 /// This action is *lazy*: upon invocation of this method the calculation is
2090 /// booked but not executed. Also see RResultPtr.
2091 ///
2092 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2093 /// allowed during concurrent filling.
2094 ///
2095 /// ### Example usage:
2096 /// ~~~{.cpp}
2097 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2098 /// auto myHist = myDf.Hist(h, {"col0"});
2099 /// ~~~
2100 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
2103 {
2105
2106 if (h->GetNDimensions() != columnList.size()) {
2107 std::string msg = "Wrong number of columns for the passed histogram: ";
2108 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2109 throw std::invalid_argument(msg);
2110 }
2111
2112 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2113 columnList.size());
2114 }
2115
2116 ////////////////////////////////////////////////////////////////////////////
2117 /// \brief Fill and return a one-dimensional RHist with weights (*lazy action*).
2118 /// \tparam BinContentType The bin content type of the returned RHist.
2119 /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins.
2120 /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive).
2121 /// \param[in] vName The name of the column that will fill the histogram.
2122 /// \param[in] wName The name of the column that will provide the weights.
2123 /// \return the histogram wrapped in a RResultPtr.
2124 ///
2125 /// This action is *lazy*: upon invocation of this method the calculation is
2126 /// booked but not executed. Also see RResultPtr.
2127 ///
2128 /// ### Example usage:
2129 /// ~~~{.cpp}
2130 /// auto myHist = myDf.Hist(10, {5, 15}, "col0", "colW");
2131 /// ~~~
2133 typename W = RDFDetail::RInferredType>
2135 Hist(std::uint64_t nNormalBins, std::pair<double, double> interval, std::string_view vName, std::string_view wName)
2136 {
2137 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(nNormalBins, interval);
2138
2139 const ColumnNames_t columnList = {std::string(vName)};
2140
2141 return Hist<V, W>(h, columnList, wName);
2142 }
2143
2144 ////////////////////////////////////////////////////////////////////////////
2145 /// \brief Fill and return an RHist with weights (*lazy action*).
2146 /// \tparam BinContentType The bin content type of the returned RHist.
2147 /// \param[in] axes The returned histogram will be constructed using these axes.
2148 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2149 /// \param[in] wName The name of the column that will provide the weights.
2150 /// \return the histogram wrapped in a RResultPtr.
2151 ///
2152 /// This action is *lazy*: upon invocation of this method the calculation is
2153 /// booked but not executed. Also see RResultPtr.
2154 ///
2155 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2156 ///
2157 /// ### Example usage:
2158 /// ~~~{.cpp}
2159 /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0});
2160 /// auto myHist = myDf.Hist({axis}, {"col0"}, "colW");
2161 /// ~~~
2163 typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes>
2165 Hist(std::vector<ROOT::Experimental::RAxisVariant> axes, const ColumnNames_t &columnList, std::string_view wName)
2166 {
2168 "weighted filling is not supported for integral bin content types");
2169
2170 if (axes.size() != columnList.size()) {
2171 std::string msg = "Wrong number of columns for the specified number of histogram axes: ";
2172 msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size());
2173 throw std::invalid_argument(msg);
2174 }
2175
2176 std::shared_ptr h = std::make_shared<ROOT::Experimental::RHist<BinContentType>>(std::move(axes));
2177
2178 return Hist<ColumnType, ColumnTypes...>(h, columnList, wName);
2179 }
2180
2181 ////////////////////////////////////////////////////////////////////////////
2182 /// \brief Fill the provided RHist with weights (*lazy action*).
2183 /// \param[in] h The histogram that should be filled.
2184 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2185 /// \param[in] wName The name of the column that will provide the weights.
2186 /// \return the histogram wrapped in a RResultPtr.
2187 ///
2188 /// This action is *lazy*: upon invocation of this method the calculation is
2189 /// booked but not executed. Also see RResultPtr.
2190 ///
2191 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2192 ///
2193 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2194 /// allowed during concurrent filling.
2195 ///
2196 /// ### Example usage:
2197 /// ~~~{.cpp}
2198 /// auto h = std::make_shared<ROOT::Experimental::RHist<double>>(10, {5.0, 15.0});
2199 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2200 /// ~~~
2201 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
2204 std::string_view wName)
2205 {
2207 "weighted filling is not supported for integral bin content types");
2208
2210
2211 if (h->GetNDimensions() != columnList.size()) {
2212 std::string msg = "Wrong number of columns for the passed histogram: ";
2213 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2214 throw std::invalid_argument(msg);
2215 }
2216
2217 // Add the weight column to the list of argument columns to pass it through the infrastructure.
2219 columnListWithWeights.push_back(std::string(wName));
2220
2221 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
2223 }
2224
2225 ////////////////////////////////////////////////////////////////////////////
2226 /// \brief Fill the provided RHistEngine (*lazy action*).
2227 /// \param[in] h The histogram that should be filled.
2228 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2229 /// \return the histogram wrapped in a RResultPtr.
2230 ///
2231 /// This action is *lazy*: upon invocation of this method the calculation is
2232 /// booked but not executed. Also see RResultPtr.
2233 ///
2234 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2235 /// allowed during concurrent filling.
2236 ///
2237 /// ### Example usage:
2238 /// ~~~{.cpp}
2239 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2240 /// auto myHist = myDf.Hist(h, {"col0"});
2241 /// ~~~
2242 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
2245 {
2247
2248 if (h->GetNDimensions() != columnList.size()) {
2249 std::string msg = "Wrong number of columns for the passed histogram: ";
2250 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2251 throw std::invalid_argument(msg);
2252 }
2253
2254 return CreateAction<RDFInternal::ActionTags::Hist, ColumnType, ColumnTypes...>(columnList, h, h, fProxiedPtr,
2255 columnList.size());
2256 }
2257
2258 ////////////////////////////////////////////////////////////////////////////
2259 /// \brief Fill the provided RHistEngine with weights (*lazy action*).
2260 /// \param[in] h The histogram that should be filled.
2261 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2262 /// \param[in] wName The name of the column that will provide the weights.
2263 /// \return the histogram wrapped in a RResultPtr.
2264 ///
2265 /// This action is *lazy*: upon invocation of this method the calculation is
2266 /// booked but not executed. Also see RResultPtr.
2267 ///
2268 /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling).
2269 ///
2270 /// During execution of the computation graph, the passed histogram must only be accessed with methods that are
2271 /// allowed during concurrent filling.
2272 ///
2273 /// ### Example usage:
2274 /// ~~~{.cpp}
2275 /// auto h = std::make_shared<ROOT::Experimental::RHistEngine<double>>(10, {5.0, 15.0});
2276 /// auto myHist = myDf.Hist(h, {"col0"}, "colW");
2277 /// ~~~
2278 template <typename ColumnType = RDFDetail::RInferredType, typename... ColumnTypes, typename BinContentType>
2281 std::string_view wName)
2282 {
2284 "weighted filling is not supported for integral bin content types");
2285
2287
2288 if (h->GetNDimensions() != columnList.size()) {
2289 std::string msg = "Wrong number of columns for the passed histogram: ";
2290 msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size());
2291 throw std::invalid_argument(msg);
2292 }
2293
2294 // Add the weight column to the list of argument columns to pass it through the infrastructure.
2296 columnListWithWeights.push_back(std::string(wName));
2297
2298 return CreateAction<RDFInternal::ActionTags::HistWithWeight, ColumnType, ColumnTypes...>(
2300 }
2301#endif
2302
2303 ////////////////////////////////////////////////////////////////////////////
2304 /// \brief Fill and return a TGraph object (*lazy action*).
2305 /// \tparam X The type of the column used to fill the x axis.
2306 /// \tparam Y The type of the column used to fill the y axis.
2307 /// \param[in] x The name of the column that will fill the x axis.
2308 /// \param[in] y The name of the column that will fill the y axis.
2309 /// \return the TGraph wrapped in a RResultPtr.
2310 ///
2311 /// Columns can be of a container type (e.g. std::vector<double>), in which case the TGraph
2312 /// is filled with each one of the elements of the container.
2313 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2314 /// If the Graph has to be drawn, it is suggested to the user to sort it on the x before printing.
2315 /// A name and a title to the TGraph is given based on the input column names.
2316 ///
2317 /// This action is *lazy*: upon invocation of this method the calculation is
2318 /// booked but not executed. Also see RResultPtr.
2319 ///
2320 /// ### Example usage:
2321 /// ~~~{.cpp}
2322 /// // Deduce column types (this invocation needs jitting internally)
2323 /// auto myGraph1 = myDf.Graph("xValues", "yValues");
2324 /// // Explicit column types
2325 /// auto myGraph2 = myDf.Graph<int, float>("xValues", "yValues");
2326 /// ~~~
2327 ///
2328 /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory
2329 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2330 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2331 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType>
2332 RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "")
2333 {
2334 auto graph = std::make_shared<::TGraph>();
2335 const std::vector<std::string_view> columnViews = {x, y};
2337 ? ColumnNames_t()
2339
2341
2342 // We build a default name and title based on the input columns
2343 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2344 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2345 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2346 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2347 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2348
2350 }
2351
2352 ////////////////////////////////////////////////////////////////////////////
2353 /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*).
2354 /// \param[in] x The name of the column that will fill the x axis.
2355 /// \param[in] y The name of the column that will fill the y axis.
2356 /// \param[in] exl The name of the column of X low errors
2357 /// \param[in] exh The name of the column of X high errors
2358 /// \param[in] eyl The name of the column of Y low errors
2359 /// \param[in] eyh The name of the column of Y high errors
2360 /// \return the TGraphAsymmErrors wrapped in a RResultPtr.
2361 ///
2362 /// Columns can be of a container type (e.g. std::vector<double>), in which case the graph
2363 /// is filled with each one of the elements of the container.
2364 /// If Multithreading is enabled, the order in which points are inserted is undefined.
2365 ///
2366 /// This action is *lazy*: upon invocation of this method the calculation is
2367 /// booked but not executed. Also see RResultPtr.
2368 ///
2369 /// ### Example usage:
2370 /// ~~~{.cpp}
2371 /// // Deduce column types (this invocation needs jitting internally)
2372 /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2373 /// // Explicit column types
2374 /// using f = float
2375 /// auto myGAE2 = myDf.GraphAsymmErrors<f, f, f, f, f, f>("xValues", "yValues", "exl", "exh", "eyl", "eyh");
2376 /// ~~~
2377 ///
2378 /// `GraphAsymmErrors` should also be used for the cases in which values associated only with
2379 /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero.
2380 /// In such cases, user should do the following:
2381 /// ~~~{.cpp}
2382 /// // Create a column of zeros in RDataFrame
2383 /// auto rdf_withzeros = rdf.Define("zero", "0");
2384 /// // or alternatively:
2385 /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;});
2386 /// // Create the graph with y errors only
2387 /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh");
2388 /// ~~~
2389 ///
2390 /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory
2391 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2392 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2393 template <typename X = RDFDetail::RInferredType, typename Y = RDFDetail::RInferredType,
2397 GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "",
2398 std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "")
2399 {
2400 auto graph = std::make_shared<::TGraphAsymmErrors>();
2401 const std::vector<std::string_view> columnViews = {x, y, exl, exh, eyl, eyh};
2403 ? ColumnNames_t()
2405
2407
2408 // We build a default name and title based on the input columns
2409 const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0];
2410 const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0];
2411 graph->SetNameTitle(g_name.c_str(), g_title.c_str());
2412 graph->GetXaxis()->SetTitle(validatedColumns[0].c_str());
2413 graph->GetYaxis()->SetTitle(validatedColumns[1].c_str());
2414
2416 graph, fProxiedPtr);
2417 }
2418
2419 ////////////////////////////////////////////////////////////////////////////
2420 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2421 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2422 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2423 /// \param[in] model The model to be considered to build the new return value.
2424 /// \param[in] v1Name The name of the column that will fill the x axis.
2425 /// \param[in] v2Name The name of the column that will fill the y axis.
2426 /// \return the monodimensional profile wrapped in a RResultPtr.
2427 ///
2428 /// This action is *lazy*: upon invocation of this method the calculation is
2429 /// booked but not executed. Also see RResultPtr.
2430 ///
2431 /// ### Example usage:
2432 /// ~~~{.cpp}
2433 /// // Deduce column types (this invocation needs jitting internally)
2434 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2435 /// // Explicit column types
2436 /// auto myProf2 = myDf.Graph<int, float>({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues");
2437 /// ~~~
2438 ///
2439 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2440 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2441 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2442 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType>
2444 Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "")
2445 {
2446 std::shared_ptr<::TProfile> h(nullptr);
2447 {
2448 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2449 h = model.GetProfile();
2450 }
2451
2452 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2453 throw std::runtime_error("Profiles with no axes limits are not supported yet.");
2454 }
2455 const std::vector<std::string_view> columnViews = {v1Name, v2Name};
2457 ? ColumnNames_t()
2460 }
2461
2462 ////////////////////////////////////////////////////////////////////////////
2463 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2464 /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present.
2465 /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present.
2466 /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present.
2467 /// \param[in] model The model to be considered to build the new return value.
2468 /// \param[in] v1Name The name of the column that will fill the x axis.
2469 /// \param[in] v2Name The name of the column that will fill the y axis.
2470 /// \param[in] wName The name of the column that will provide the weights.
2471 /// \return the monodimensional profile wrapped in a RResultPtr.
2472 ///
2473 /// This action is *lazy*: upon invocation of this method the calculation is
2474 /// booked but not executed. Also see RResultPtr.
2475 ///
2476 /// ### Example usage:
2477 /// ~~~{.cpp}
2478 /// // Deduce column types (this invocation needs jitting internally)
2479 /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight");
2480 /// // Explicit column types
2481 /// auto myProf2 = myDf.Profile1D<int, float, double>({"profName", "profTitle", 64u, -4., 4.},
2482 /// "xValues", "yValues", "weight");
2483 /// ~~~
2484 ///
2485 /// See the first Profile1D() overload for more details.
2486 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2487 typename W = RDFDetail::RInferredType>
2489 Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
2490 {
2491 std::shared_ptr<::TProfile> h(nullptr);
2492 {
2493 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2494 h = model.GetProfile();
2495 }
2496
2497 if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) {
2498 throw std::runtime_error("Profile histograms with no axes limits are not supported yet.");
2499 }
2500 const std::vector<std::string_view> columnViews = {v1Name, v2Name, wName};
2502 ? ColumnNames_t()
2505 }
2506
2507 ////////////////////////////////////////////////////////////////////////////
2508 /// \brief Fill and return a one-dimensional profile (*lazy action*).
2509 /// See the first Profile1D() overload for more details.
2510 template <typename V1, typename V2, typename W>
2512 {
2513 return Profile1D<V1, V2, W>(model, "", "", "");
2514 }
2515
2516 ////////////////////////////////////////////////////////////////////////////
2517 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2518 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2519 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2520 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2521 /// \param[in] model The returned profile will be constructed using this as a model.
2522 /// \param[in] v1Name The name of the column that will fill the x axis.
2523 /// \param[in] v2Name The name of the column that will fill the y axis.
2524 /// \param[in] v3Name The name of the column that will fill the z axis.
2525 /// \return the bidimensional profile wrapped in a RResultPtr.
2526 ///
2527 /// This action is *lazy*: upon invocation of this method the calculation is
2528 /// booked but not executed. Also see RResultPtr.
2529 ///
2530 /// ### Example usage:
2531 /// ~~~{.cpp}
2532 /// // Deduce column types (this invocation needs jitting internally)
2533 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2534 /// "xValues", "yValues", "zValues");
2535 /// // Explicit column types
2536 /// auto myProf2 = myDf.Profile2D<int, float, double>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2537 /// "xValues", "yValues", "zValues");
2538 /// ~~~
2539 ///
2540 /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory
2541 /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that
2542 /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
2543 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2544 typename V3 = RDFDetail::RInferredType>
2545 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "",
2546 std::string_view v2Name = "", std::string_view v3Name = "")
2547 {
2548 std::shared_ptr<::TProfile2D> h(nullptr);
2549 {
2550 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2551 h = model.GetProfile();
2552 }
2553
2554 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
2555 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
2556 }
2557 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name};
2559 ? ColumnNames_t()
2562 }
2563
2564 ////////////////////////////////////////////////////////////////////////////
2565 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2566 /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present.
2567 /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present.
2568 /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present.
2569 /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present.
2570 /// \param[in] model The returned histogram will be constructed using this as a model.
2571 /// \param[in] v1Name The name of the column that will fill the x axis.
2572 /// \param[in] v2Name The name of the column that will fill the y axis.
2573 /// \param[in] v3Name The name of the column that will fill the z axis.
2574 /// \param[in] wName The name of the column that will provide the weights.
2575 /// \return the bidimensional profile wrapped in a RResultPtr.
2576 ///
2577 /// This action is *lazy*: upon invocation of this method the calculation is
2578 /// booked but not executed. Also see RResultPtr.
2579 ///
2580 /// ### Example usage:
2581 /// ~~~{.cpp}
2582 /// // Deduce column types (this invocation needs jitting internally)
2583 /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2584 /// "xValues", "yValues", "zValues", "weight");
2585 /// // Explicit column types
2586 /// auto myProf2 = myDf.Profile2D<int, float, double, int>({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20},
2587 /// "xValues", "yValues", "zValues", "weight");
2588 /// ~~~
2589 ///
2590 /// See the first Profile2D() overload for more details.
2591 template <typename V1 = RDFDetail::RInferredType, typename V2 = RDFDetail::RInferredType,
2592 typename V3 = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2593 RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name,
2594 std::string_view v3Name, std::string_view wName)
2595 {
2596 std::shared_ptr<::TProfile2D> h(nullptr);
2597 {
2598 ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError);
2599 h = model.GetProfile();
2600 }
2601
2602 if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) {
2603 throw std::runtime_error("2D profiles with no axes limits are not supported yet.");
2604 }
2605 const std::vector<std::string_view> columnViews = {v1Name, v2Name, v3Name, wName};
2607 ? ColumnNames_t()
2610 }
2611
2612 /// \brief Fill and return a two-dimensional profile (*lazy action*).
2613 /// See the first Profile2D() overload for more details.
2614 template <typename V1, typename V2, typename V3, typename W>
2616 {
2617 return Profile2D<V1, V2, V3, W>(model, "", "", "", "");
2618 }
2619
2620 ////////////////////////////////////////////////////////////////////////////
2621 /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*).
2622 ///
2623 /// Type T must provide at least:
2624 /// - a copy-constructor
2625 /// - a `Fill` method that accepts as many arguments as there are column names in columnList, with matching types
2626 /// (these types can also be passed as template parameters to this method)
2627 /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector<T *>&)` that merges the
2628 /// objects passed as argument into the object on which `Merge` was called (analogous to TH1::Merge). Note that
2629 /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in
2630 /// the TCollection*).
2631 ///
2632 /// \tparam FirstColumn The type of the first column whose values are used to fill the object. Inferred together with OtherColumns if not present.
2633 /// \tparam OtherColumns The types of the other columns whose values are used to fill the object.
2634 /// \tparam T The type of the object to fill. Automatically deduced.
2635 /// \param[in] model The model to be considered to build the new return value.
2636 /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`
2637 /// \return the filled object wrapped in a RResultPtr.
2638 ///
2639 /// The user gives up ownership of the model object.
2640 /// The list of column names to be used for filling must always be specified.
2641 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed.
2642 /// Also see RResultPtr.
2643 ///
2644 /// ### Example usage:
2645 /// ~~~{.cpp}
2646 /// MyClass obj;
2647 /// // Deduce column types (this invocation needs jitting internally, and in this case
2648 /// // MyClass needs to be known to the interpreter)
2649 /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"});
2650 /// // explicit column types
2651 /// auto myFilledObj = myDf.Fill<float, float>(obj, {"col0", "col1"});
2652 /// ~~~
2653 /// \throws std::runtime_error if `HistoUtils<T>::HasAxisLimits` reports that the model has no axis limits.
2654 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename T>
2656 {
2657 auto h = std::make_shared<std::decay_t<T>>(std::forward<T>(model)); // copy or move the model into a shared object
2658 if (!RDFInternal::HistoUtils<T>::HasAxisLimits(*h)) {
2659 throw std::runtime_error("The absence of axes limits is not supported yet.");
2660 }
2661 return CreateAction<RDFInternal::ActionTags::Fill, FirstColumn, OtherColumns...>(columnList, h, h, fProxiedPtr,
2662 columnList.size());
2663 }
2664
2665 ////////////////////////////////////////////////////////////////////////////
2666 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
2667 ///
2668 /// \tparam V The type of the value column
2669 /// \param[in] value The name of the column with the values to fill the statistics with. May be empty (the default).
2670 /// \return the filled TStatistic object wrapped in a RResultPtr.
2671 ///
2672 /// ### Example usage:
2673 /// ~~~{.cpp}
2674 /// // Deduce column type (this invocation needs jitting internally)
2675 /// auto stats0 = myDf.Stats("values");
2676 /// // Explicit column type
2677 /// auto stats1 = myDf.Stats<float>("values");
2678 /// ~~~
2679 ///
2680 template <typename V = RDFDetail::RInferredType>
2681 RResultPtr<TStatistic> Stats(std::string_view value = "")
2682 {
2684 if (!value.empty()) { // an empty name adds nothing to the column list
2685 columns.emplace_back(std::string(value));
2686 }
2688 if (std::is_same<V, RDFDetail::RInferredType>::value) { // no explicit V: use the type-inferring Fill
2689 return Fill(TStatistic(), validColumnNames);
2690 } else {
2692 }
2693 }
2694
2695 ////////////////////////////////////////////////////////////////////////////
2696 /// \brief Return a TStatistic object, filled once per event (*lazy action*).
2697 ///
2698 /// \tparam V The type of the value column
2699 /// \tparam W The type of the weight column
2700 /// \param[in] value The name of the column with the values to fill the statistics with.
2701 /// \param[in] weight The name of the column with the weights to fill the statistics with.
2702 /// \return the filled TStatistic object wrapped in a RResultPtr.
2703 ///
2704 /// ### Example usage:
2705 /// ~~~{.cpp}
2706 /// // Deduce column types (this invocation needs jitting internally)
2707 /// auto stats0 = myDf.Stats("values", "weights");
2708 /// // Explicit column types
2709 /// auto stats1 = myDf.Stats<int, float>("values", "weights");
2710 /// ~~~
2711 /// \throws std::runtime_error if exactly one of V and W is specified explicitly.
2712 template <typename V = RDFDetail::RInferredType, typename W = RDFDetail::RInferredType>
2713 RResultPtr<TStatistic> Stats(std::string_view value, std::string_view weight)
2714 {
2715 ColumnNames_t columns{std::string(value), std::string(weight)};
2716 constexpr auto vIsInferred = std::is_same<V, RDFDetail::RInferredType>::value;
2717 constexpr auto wIsInferred = std::is_same<W, RDFDetail::RInferredType>::value;
2719 // We have 3 cases:
2720 // 1. Both types are inferred: we use Fill and let the jit kick in.
2721 // 2. One of the two types is explicit and the other one is inferred: the case is not supported.
2722 // 3. Both types are explicit: we invoke the fully compiled Fill method.
2723 if (vIsInferred && wIsInferred) {
2724 return Fill(TStatistic(), validColumnNames);
2725 } else if (vIsInferred != wIsInferred) {
2726 std::string error("The ");
2727 error += vIsInferred ? "weight " : "value "; // the explicit column is the one that is NOT inferred
2728 error += "column type is explicit, while the ";
2729 error += vIsInferred ? "value " : "weight "; // the inferred column
2730 error += "column type is specified to be inferred. This case is not supported: please specify both types or none.";
2731 throw std::runtime_error(error);
2732 } else {
2734 }
2735 }
2736
2737 ////////////////////////////////////////////////////////////////////////////
2738 /// \brief Return the minimum of processed column values (*lazy action*).
2739 /// \tparam T The type of the branch/column.
2740 /// \param[in] columnName The name of the branch/column to be treated. May be empty (the default).
2741 /// \return the minimum value of the selected column wrapped in a RResultPtr.
2742 ///
2743 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2744 /// template specialization of this method.
2745 /// If the type of the column is inferred, the return type is `double`; otherwise it is the type of the column.
2746 ///
2747 /// This action is *lazy*: upon invocation of this method the calculation is
2748 /// booked but not executed. Also see RResultPtr.
2749 ///
2750 /// ### Example usage:
2751 /// ~~~{.cpp}
2752 /// // Deduce column type (this invocation needs jitting internally)
2753 /// auto minVal0 = myDf.Min("values");
2754 /// // Explicit column type
2755 /// auto minVal1 = myDf.Min<double>("values");
2756 /// ~~~
2757 ///
2758 template <typename T = RDFDetail::RInferredType>
2760 {
2761 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); // empty name -> empty column list
2762 using RetType_t = RDFDetail::MinReturnType_t<T>;
2763 auto minV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::max()); // result seeded with the largest RetType_t value
2765 }
2766
2767 ////////////////////////////////////////////////////////////////////////////
2768 /// \brief Return the maximum of processed column values (*lazy action*).
2769 /// \tparam T The type of the branch/column.
2770 /// \param[in] columnName The name of the branch/column to be treated. May be empty (the default).
2771 /// \return the maximum value of the selected column wrapped in a RResultPtr.
2772 ///
2773 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2774 /// template specialization of this method.
2775 /// If the type of the column is inferred, the return type is `double`; otherwise it is the type of the column.
2776 ///
2777 /// This action is *lazy*: upon invocation of this method the calculation is
2778 /// booked but not executed. Also see RResultPtr.
2779 ///
2780 /// ### Example usage:
2781 /// ~~~{.cpp}
2782 /// // Deduce column type (this invocation needs jitting internally)
2783 /// auto maxVal0 = myDf.Max("values");
2784 /// // Explicit column type
2785 /// auto maxVal1 = myDf.Max<double>("values");
2786 /// ~~~
2787 ///
2788 template <typename T = RDFDetail::RInferredType>
2790 {
2791 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); // empty name -> empty column list
2792 using RetType_t = RDFDetail::MaxReturnType_t<T>;
2793 auto maxV = std::make_shared<RetType_t>(std::numeric_limits<RetType_t>::lowest()); // result seeded with the lowest RetType_t value
2795 }
2796
2797 ////////////////////////////////////////////////////////////////////////////
2798 /// \brief Return the mean of processed column values (*lazy action*).
2799 /// \tparam T The type of the branch/column.
2800 /// \param[in] columnName The name of the branch/column to be treated. May be empty (the default).
2801 /// \return the mean value of the selected column wrapped in a RResultPtr.
2802 ///
2803 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2804 /// template specialization of this method.
2805 /// Note that internally, the summations are executed with Kahan sums in double precision, irrespective
2806 /// of the type of column that is read.
2807 ///
2808 /// This action is *lazy*: upon invocation of this method the calculation is
2809 /// booked but not executed. Also see RResultPtr.
2810 ///
2811 /// ### Example usage:
2812 /// ~~~{.cpp}
2813 /// // Deduce column type (this invocation needs jitting internally)
2814 /// auto meanVal0 = myDf.Mean("values");
2815 /// // Explicit column type
2816 /// auto meanVal1 = myDf.Mean<double>("values");
2817 /// ~~~
2818 ///
2819 template <typename T = RDFDetail::RInferredType>
2820 RResultPtr<double> Mean(std::string_view columnName = "")
2821 {
2822 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); // empty name -> empty column list
2823 auto meanV = std::make_shared<double>(0); // shared result object, initialised to zero
2825 }
2826
2827 ////////////////////////////////////////////////////////////////////////////
2828 /// \brief Return the unbiased standard deviation of processed column values (*lazy action*).
2829 /// \tparam T The type of the branch/column.
2830 /// \param[in] columnName The name of the branch/column to be treated. May be empty (the default).
2831 /// \return the standard deviation value of the selected column wrapped in a RResultPtr.
2832 ///
2833 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2834 /// template specialization of this method.
2835 ///
2836 /// This action is *lazy*: upon invocation of this method the calculation is
2837 /// booked but not executed. Also see RResultPtr.
2838 ///
2839 /// ### Example usage:
2840 /// ~~~{.cpp}
2841 /// // Deduce column type (this invocation needs jitting internally)
2842 /// auto stdDev0 = myDf.StdDev("values");
2843 /// // Explicit column type
2844 /// auto stdDev1 = myDf.StdDev<double>("values");
2845 /// ~~~
2846 ///
2847 template <typename T = RDFDetail::RInferredType>
2848 RResultPtr<double> StdDev(std::string_view columnName = "")
2849 {
2850 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); // empty name -> empty column list
2851 auto stdDeviationV = std::make_shared<double>(0); // shared result object, initialised to zero
2853 }
2854
2855 // clang-format off
2856 ////////////////////////////////////////////////////////////////////////////
2857 /// \brief Return the sum of processed column values (*lazy action*).
2858 /// \tparam T The type of the branch/column.
2859 /// \param[in] columnName The name of the branch/column. May be empty (the default).
2860 /// \param[in] initValue Optional initial value for the sum. If not present, a value-initialized object of the return type is used, so that type must be default-constructible.
2861 /// \return the sum of the selected column wrapped in a RResultPtr.
2862 ///
2863 /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
2864 /// template specialization of this method.
2865 /// If the type of the column is inferred, the return type is `double`; otherwise it is the type of the column.
2866 ///
2867 /// This action is *lazy*: upon invocation of this method the calculation is
2868 /// booked but not executed. Also see RResultPtr.
2869 ///
2870 /// ### Example usage:
2871 /// ~~~{.cpp}
2872 /// // Deduce column type (this invocation needs jitting internally)
2873 /// auto sum0 = myDf.Sum("values");
2874 /// // Explicit column type
2875 /// auto sum1 = myDf.Sum<double>("values");
2876 /// ~~~
2877 ///
2878 template <typename T = RDFDetail::RInferredType>
2880 Sum(std::string_view columnName = "",
2881 const RDFDetail::SumReturnType_t<T> &initValue = RDFDetail::SumReturnType_t<T>{})
2882 {
2883 const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); // empty name -> empty column list
2884 auto sumV = std::make_shared<RDFDetail::SumReturnType_t<T>>(initValue); // shared result object starts as a copy of initValue
2886 }
2887 // clang-format on
2888
2889 ////////////////////////////////////////////////////////////////////////////
2890 /// \brief Gather filtering statistics.
2891 /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr.
2892 ///
2893 /// Calling `Report` on the main `RDataFrame` object gathers stats for
2894 /// all named filters in the call graph. Calling this method on a
2895 /// stored chain state (i.e. a graph node different from the first) gathers
2896 /// the stats for all named filters in the chain section between the original
2897 /// `RDataFrame` and that node (included). Stats are gathered in the same
2898 /// order as the named filters have been added to the graph.
2899 /// A RResultPtr<RCutFlowReport> is returned to allow inspection of the
2900 /// effects cuts had.
2901 ///
2902 /// This action is *lazy*: upon invocation of
2903 /// this method the calculation is booked but not executed. See RResultPtr
2904 /// documentation.
2905 ///
2906 /// ### Example usage:
2907 /// ~~~{.cpp}
2908 /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2");
2909 /// auto cutReport = filtered.Report();
2910 /// cutReport->Print();
2911 /// ~~~
2912 ///
2914 {
2915 bool returnEmptyReport = false;
2916 // if this is a RInterface<RLoopManager> on which `Define` has been called, users
2917 // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which
2918 // certainly does not contain named filters.
2919 // The number 4 takes into account the implicit columns for entry and slot number
2920 // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_)
2921 if (std::is_same<Proxied, RLoopManager>::value && fColRegister.GenerateColumnNames().size() > 4)
2922 returnEmptyReport = true;
2923
2924 auto rep = std::make_shared<RCutFlowReport>(); // shared report object, also handed to the helper below
2927
2928 auto action = std::make_unique<Action_t>(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}),
2930
2931 return MakeResultPtr(rep, *fLoopManager, std::move(action));
2932 }
2933
2934
2935 ////////////////////////////////////////////////////////////////////////////
2936 /// \brief Provides a representation of the columns in the dataset.
2937 /// \tparam ColumnTypes variadic list of branch/column types.
2938 /// \param[in] columnList Names of the columns to be displayed.
2939 /// \param[in] nRows Number of events for each column to be displayed.
2940 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
2941 /// \return the `RDisplay` instance wrapped in a RResultPtr.
2942 ///
2943 /// This function returns a `RResultPtr<RDisplay>` containing all the entries to be displayed, organized in a tabular
2944 /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will
2945 /// return a complete version through `RDisplay::AsString()`.
2946 ///
2947 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see
2948 /// RResultPtr.
2949 ///
2950 /// Example usage:
2951 /// ~~~{.cpp}
2952 /// // Preparing the RResultPtr<RDisplay> object with all columns and default number of entries
2953 /// auto d1 = rdf.Display("");
2954 /// // Preparing the RResultPtr<RDisplay> object with two columns and 128 entries
2955 /// auto d2 = rdf.Display({"x", "y"}, 128);
2956 /// // Printing the short representations, the event loop will run
2957 /// d1->Print();
2958 /// d2->Print();
2959 /// ~~~
2960 template <typename... ColumnTypes>
2962 {
2963 CheckIMTDisabled("Display"); // NOTE(review): presumably rejects the call when implicit multi-threading is enabled - confirm
2964 auto newCols = columnList; // work on a copy: the caller's list is left untouched
2965 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
2966 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
2967 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
2968 // Need to add ULong64_t type corresponding to the first column rdfentry_
2969 return CreateAction<RDFInternal::ActionTags::Display, ULong64_t, ColumnTypes...>(
2970 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr);
2971 }
2972
2973 ////////////////////////////////////////////////////////////////////////////
2974 /// \brief Provides a representation of the columns in the dataset.
2975 /// \param[in] columnList Names of the columns to be displayed.
2976 /// \param[in] nRows Number of events for each column to be displayed.
2977 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
2978 /// \return the `RDisplay` instance wrapped in a RResultPtr.
2979 ///
2980 /// This overload automatically infers the column types.
2981 /// See the previous overloads for further details.
2982 ///
2983 /// Invoked when no types are specified to Display.
2985 {
2986 CheckIMTDisabled("Display"); // NOTE(review): presumably rejects the call when implicit multi-threading is enabled - confirm
2987 auto newCols = columnList; // work on a copy: the caller's list is left untouched
2988 newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column
2989 auto displayer = std::make_shared<RDisplay>(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements);
2990 using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<RDisplay>>;
2992 std::move(newCols), displayer, std::make_shared<displayHelperArgs_t>(nRows, displayer), fProxiedPtr,
2993 columnList.size() + 1); // +1 for the rdfentry_ column prepended above
2994 }
2995
2996 ////////////////////////////////////////////////////////////////////////////
2997 /// \brief Provides a representation of the columns in the dataset.
2998 /// \param[in] columnNameRegexp A regular expression to select the columns.
2999 /// \param[in] nRows Number of events for each column to be displayed.
3000 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3001 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3002 ///
3003 /// The existing columns are matched against the regular expression. If the string provided
3004 /// is empty (the default), all columns are selected.
3005 /// See the previous overloads for further details.
3007 Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10)
3008 {
3009 const auto columnNames = GetColumnNames(); // candidate columns that the regular expression is matched against
3012 }
3013
3014 ////////////////////////////////////////////////////////////////////////////
3015 /// \brief Provides a representation of the columns in the dataset.
3016 /// \param[in] columnList Names of the columns to be displayed.
3017 /// \param[in] nRows Number of events for each column to be displayed.
3018 /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row.
3019 /// \return the `RDisplay` instance wrapped in a RResultPtr.
3020 ///
3021 /// See the previous overloads for further details.
3023 Display(std::initializer_list<std::string> columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10)
3024 {
3027 }
3028
3029 /// \}
3030 // End of the doxygen group for actions
3031 // ----------------------------------------------------------------------------------------
3032
3033 /// \name Immediate Actions
3034 /// Immediate Actions eagerly start the event loop and produce a result.
3035 /// \{
3036
3037 template <typename... ColumnTypes> // kept only so old call sites with explicit template arguments still compile
3038 [[deprecated("Snapshot is not any more a template. You can safely remove the template parameters.")]]
3040 Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList,
3041 const RSnapshotOptions &options = RSnapshotOptions())
3042 {
3043 return Snapshot(treename, filename, columnList, options); // ColumnTypes is not forwarded
3044 }
3045
3046 ////////////////////////////////////////////////////////////////////////////
3047 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
3048 /// \param[in] treename The name of the output TTree or RNTuple.
3049 /// \param[in] filename The name of the output TFile.
3050 /// \param[in] columnList The list of names of the columns/branches/fields to be written.
3051 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
3052 /// \return a `RDataFrame` that wraps the snapshotted dataset.
3053 ///
3054 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
3055 /// The types of the columns are automatically inferred and do not need to be specified.
3056 ///
3057 /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.')
3058 /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot.
3059 /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also
3060 /// written out and it appears before the array in the columnList.
3061 ///
3062 /// By default, in case of TTree, TChain or RNTuple inputs, Snapshot will try to write out all top-level branches.
3063 /// For other types of inputs, all columns returned by GetColumnNames() will be written out. Systematic variations of
3064 /// columns will be included if the corresponding flag is set in RSnapshotOptions. See \ref snapshot-with-variations
3065 /// "Snapshot with Variations" for more details. If friend trees or chains are present, by default all friend
3066 /// top-level branches that have names that do not collide with names of branches in the main TTree/TChain will be
3067 /// written out. Since v6.24, Snapshot will also write out friend branches with the same names as branches in the
3068 /// main TTree/TChain with names of the form
3069 /// `<friendname>_<branchname>` in order to differentiate them from the branches in the main tree/chain.
3070 ///
3071 /// ### Writing to a sub-directory
3072 ///
3073 /// Snapshot supports writing the TTree or RNTuple in a sub-directory inside the TFile. It is sufficient to specify
3074 /// the directory path as part of the TTree or RNTuple name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree
3075 /// `t` in the sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed).
3076 ///
3077 /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of
3078 /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled
3079 /// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in
3080 /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will
3081 /// error out if such a "shuffled" TTree is used in a friendship.
3082 ///
3083 /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the
3084 /// requested output TTree or RNTuple to the file, with all the branches requested to preserve the dataset schema.
3085 ///
3086 /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns
3087 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
3088 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
3089 /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`.
3090 ///
3091 /// ### Example invocations:
3092 ///
3093 /// ~~~{.cpp}
3094 /// // No need to specify column types, they are automatically deduced thanks
3095 /// // to information coming from the data source
3096 /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"});
3097 /// ~~~
3098 ///
3099 /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in
3100 /// `RSnapshotOptions`:
3101 /// ~~~{.cpp}
3102 /// RSnapshotOptions opts;
3103 /// opts.fLazy = true;
3104 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
3105 /// ~~~
3106 ///
3107 /// To snapshot to the RNTuple data format, the `fOutputFormat` option in `RSnapshotOptions` needs to be set
3108 /// accordingly:
3109 /// ~~~{.cpp}
3110 /// RSnapshotOptions opts;
3111 /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
3112 /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts);
3113 /// ~~~
3114 ///
3115 /// Snapshot systematic variations resulting from a Vary() call (see details \ref snapshot-with-variations "here"):
3116 /// ~~~{.cpp}
3117 /// RSnapshotOptions opts;
3118 /// opts.fIncludeVariations = true;
3119 /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts);
3120 /// ~~~
3123 const RSnapshotOptions &options = RSnapshotOptions())
3124 {
3125 // like columnList but with `#var` columns removed
3127 // like columnListWithoutSizeColumns but with aliases resolved
3130 // like validCols but with missing size branches required by array branches added in the right positions
3131 const auto pairOfColumnLists =
3135
3136 const auto fullTreeName = treename; // keep the full name: `treename` is overwritten below with the parsed tree name
3138 treename = parsedTreePath.fTreeName;
3139 const auto &dirname = parsedTreePath.fDirName;
3140
3142
3144
3145 auto retrieveTypeID = [](const std::string &colName, const std::string &colTypeName,
3146 bool isRNTuple = false) -> const std::type_info * {
3147 try {
3149 } catch (const std::runtime_error &err) {
3150 if (isRNTuple)
3152
3153 if (std::string(err.what()).find("Cannot extract type_info of type") != std::string::npos) { // recognise the failure by its message text
3154 // We could not find RTTI for this column, thus we cannot write it out at the moment.
3155 std::string trueTypeName{colTypeName};
3156 if (colTypeName.rfind("CLING_UNKNOWN_TYPE", 0) == 0) // i.e. colTypeName starts with "CLING_UNKNOWN_TYPE"
3157 trueTypeName = colTypeName.substr(19); // drop the 18-character prefix plus one more character (presumably a separator - confirm)
3158 std::string msg{"No runtime type information is available for column \"" + colName +
3159 "\" with type name \"" + trueTypeName +
3160 "\". Thus, it cannot be written to disk with Snapshot. Make sure to generate and load "
3161 "ROOT dictionaries for the type of this column."};
3162
3163 throw std::runtime_error(msg);
3164 } else {
3165 throw; // any other runtime_error is propagated unchanged
3166 }
3167 }
3168 };
3169
3171
3172 if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) {
3173 // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one
3174 // without a data source for now, and set it once the actual data source can be created (i.e., after
3175 // writing the RNTuple).
3176 auto newRDF = std::make_shared<RInterface<RLoopManager>>(std::make_shared<RLoopManager>(colListNoPoundSizes));
3177
3178 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
3179 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
3180 options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false});
3181
3184
3185 const auto nSlots = fLoopManager->GetNSlots();
3186 std::vector<const std::type_info *> colTypeIDs;
3187 colTypeIDs.reserve(nColumns);
3188 for (decltype(nColumns) i{}; i < nColumns; i++) {
3189 const auto &colName = validColumnNames[i];
3191 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
3192 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName, /*isRNTuple*/ true);
3193 colTypeIDs.push_back(colTypeID);
3194 }
3195 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
3196 // source
3198
3199 auto action =
3201 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
3202 } else {
3203 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" &&
3204 options.fOutputFormat == ESnapshotOutputFormat::kDefault) {
3205 Warning("Snapshot",
3206 "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you "
3207 "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in "
3208 "RSnapshotOptions. Note that this current default behaviour might change in the future.");
3209 }
3210
3211 // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset
3212 // has actually been created and written to TFile, i.e. at the end of the Snapshot execution.
3213 auto newRDF = std::make_shared<RInterface<RLoopManager>>(
3214 std::make_shared<RLoopManager>(colListNoAliasesWithSizeBranches));
3215
3216 auto snapHelperArgs = std::make_shared<RDFInternal::SnapshotHelperArgs>(RDFInternal::SnapshotHelperArgs{
3217 std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches,
3218 options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations});
3219
3222
3223 const auto nSlots = fLoopManager->GetNSlots();
3224 std::vector<const std::type_info *> colTypeIDs;
3225 colTypeIDs.reserve(nColumns);
3226 for (decltype(nColumns) i{}; i < nColumns; i++) {
3227 const auto &colName = validColumnNames[i];
3229 colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec);
3230 const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName);
3231 colTypeIDs.push_back(colTypeID);
3232 }
3233 // Crucial e.g. if the column names do not correspond to already-available column readers created by the data
3234 // source
3236
3237 auto action =
3239 resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action));
3240 }
3241
3242 if (!options.fLazy)
3243 *resPtr; // not lazy: dereference the result right away instead of leaving the Snapshot merely booked
3244 return resPtr;
3245 }
3246
3247 // clang-format off
3248 ////////////////////////////////////////////////////////////////////////////
3249 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
3250 /// \param[in] treename The name of the output TTree or RNTuple.
3251 /// \param[in] filename The name of the output TFile.
3252 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' at the beginning and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
3253 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
3254 /// \return a `RDataFrame` that wraps the snapshotted dataset.
3255 ///
3256 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
3257 /// The types of the columns are automatically inferred and do not need to be specified.
3258 ///
3259 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
3261 std::string_view columnNameRegexp = "",
3262 const RSnapshotOptions &options = RSnapshotOptions())
3263 {
3265
3267 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those
3269 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
3270 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
3275
3276 // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd.
3277 // RemoveDuplicates should preserve ordering of the columns: it might be meaningful.
3279
3280 std::vector<std::string> selectedColumns;
3281 try {
3283 }
3284 catch (const std::runtime_error &e){
3285 // No columns were found, try again but consider all input data source columns
3286 if (auto ds = GetDataSource())
3288 else
3289 throw; // rethrow the original exception object: `throw e;` would throw a sliced copy
3290 }
3291
3292 if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
3294 }
3295
3296 return Snapshot(treename, filename, selectedColumns, options);
3297 }
3298 // clang-format on
3299
3300 // clang-format off
3301 ////////////////////////////////////////////////////////////////////////////
3302 /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`.
3303 /// \param[in] treename The name of the output TTree or RNTuple.
3304 /// \param[in] filename The name of the output TFile.
3305 /// \param[in] columnList The list of names of the columns/branches/fields to be written.
3306 /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple.
3307 /// \return a `RDataFrame` that wraps the snapshotted dataset.
3308 ///
3309 /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source.
3310 /// The types of the columns are automatically inferred and do not need to be specified.
3311 ///
3312 /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages.
3314 std::initializer_list<std::string> columnList,
3315 const RSnapshotOptions &options = RSnapshotOptions())
3316 {
3318 return Snapshot(treename, filename, selectedColumns, options); // delegate to another Snapshot overload with the selected columns
3319 }
3320 // clang-format on
3321
3322 ////////////////////////////////////////////////////////////////////////////
3323 /// \brief Save selected columns in memory.
3324 /// \tparam ColumnTypes variadic list of branch/column types.
3325 /// \param[in] columnList columns to be cached in memory.
3326 /// \return a `RDataFrame` that wraps the cached dataset.
3327 ///
3328 /// This action returns a new `RDataFrame` object, completely detached from
3329 /// the originating `RDataFrame`. The new dataframe only contains the cached
3330 /// columns and stores their content in memory for fast, zero-copy subsequent access.
3331 ///
3332 /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that
3333 /// fits in memory and that will be accessed many times.
3334 ///
3335 /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns
3336 /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are
3337 /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an
3338 /// Alias(): `df.Alias("nbar", "#bar").Cache<std::size_t>({"nbar"})`.
3339 ///
3340 /// ### Example usage:
3341 ///
3342 /// **Types and columns specified:**
3343 /// ~~~{.cpp}
3344 /// auto cache_some_cols_df = df.Cache<double, MyClass, int>({"col0", "col1", "col2"});
3345 /// ~~~
3346 ///
3347 /// **Types inferred and columns specified (this invocation relies on jitting):**
3348 /// ~~~{.cpp}
3349 /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"});
3350 /// ~~~
3351 ///
3352 /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):**
3353 /// ~~~{.cpp}
3354 /// auto cache_all_cols_df = df.Cache(myRegexp);
3355 /// ~~~
3356 template <typename... ColumnTypes>
3358 {
3359 auto staticSeq = std::make_index_sequence<sizeof...(ColumnTypes)>();
3361 }
3362
3363 ////////////////////////////////////////////////////////////////////////////
3364 /// \brief Save selected columns in memory.
3365 /// \param[in] columnList columns to be cached in memory
3366 /// \return a `RDataFrame` that wraps the cached dataset.
3367 ///
3368 /// See the previous overloads for more information.
3370 {
3371 // Early return: if the list of columns is empty, just return an empty RDF
3372 // If we proceed, the jitted call will not compile!
3373 if (columnList.empty()) {
3374 auto nEntries = *this->Count();
3375 RInterface<RLoopManager> emptyRDF(std::make_shared<RLoopManager>(nEntries));
3376 return emptyRDF;
3377 }
3378
3379 std::stringstream cacheCall;
3381 RInterface<TTraits::TakeFirstParameter_t<decltype(upcastNode)>> upcastInterface(fProxiedPtr, *fLoopManager,
3382 fColRegister);
3383 // build a string equivalent to
3384 // "(RInterface<nodetype*>*)(this)->Cache<Ts...>(*(ColumnNames_t*)(&columnList))"
3385 RInterface<RLoopManager> resRDF(std::make_shared<ROOT::Detail::RDF::RLoopManager>(0));
3386 cacheCall << "*reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RLoopManager>*>("
3388 << ") = reinterpret_cast<ROOT::RDF::RInterface<ROOT::Detail::RDF::RNodeBase>*>("
3390
3392
3393 const auto validColumnNames =
3395 const auto colTypes =
3396 GetValidatedArgTypes(validColumnNames, fColRegister, nullptr, GetDataSource(), "Cache", /*vector2RVec=*/false);
3397 for (const auto &colType : colTypes)
3398 cacheCall << colType << ", ";
3399 if (!columnListWithoutSizeColumns.empty())
3400 cacheCall.seekp(-2, cacheCall.cur); // remove the last ",
3401 cacheCall << ">(*reinterpret_cast<std::vector<std::string>*>(" // vector<string> should be ColumnNames_t
3403
3404 // book the code to jit with the RLoopManager and trigger the event loop
3405 fLoopManager->ToJitExec(cacheCall.str());
3406 fLoopManager->Jit();
3407
3408 return resRDF;
3409 }
3410
3411 ////////////////////////////////////////////////////////////////////////////
3412 /// \brief Save selected columns in memory.
3413 /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' at the beginning and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns.
3414 /// \return a `RDataFrame` that wraps the cached dataset.
3415 ///
3416 /// The existing columns are matched against the regular expression. If the string provided
3417 /// is empty, all columns are selected. See the previous overloads for more information.
3419 {
3422 // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Cache those
3424 std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns),
3425 [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; });
3427 columnNames.reserve(definedColumns.size() + dsColumns.size());
3431 return Cache(selectedColumns);
3432 }
3433
3434 ////////////////////////////////////////////////////////////////////////////
3435 /// \brief Save selected columns in memory.
3436 /// \param[in] columnList columns to be cached in memory.
3437 /// \return a `RDataFrame` that wraps the cached dataset.
3438 ///
3439 /// See the previous overloads for more information.
3440 RInterface<RLoopManager> Cache(std::initializer_list<std::string> columnList)
3441 {
3443 return Cache(selectedColumns);
3444 }
3445
3446
3447 // clang-format off
3448 ////////////////////////////////////////////////////////////////////////////
3449 /// \brief Execute a user-defined function on each entry (*instant action*).
3450 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
3451 /// \param[in] columns Names of the columns/branches in input to the user function.
3452 ///
3453 /// The callable `f` is invoked once per entry. This is an *instant action*:
3454 /// upon invocation, an event loop as well as execution of all scheduled actions
3455 /// is triggered.
3456 /// Users are responsible for the thread-safety of this callable when executing
3457 /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT).
3458 ///
3459 /// ### Example usage:
3460 /// ~~~{.cpp}
3461 /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"});
3462 /// ~~~
3463 // clang-format on
3464 template <typename F>
3465 void Foreach(F f, const ColumnNames_t &columns = {})
3466 {
3467 using arg_types = typename TTraits::CallableTraits<decltype(f)>::arg_types_nodecay; // parameter types of f as reported by CallableTraits (the "nodecay" variant)
3468 using ret_type = typename TTraits::CallableTraits<decltype(f)>::ret_type; // return type of f
3469 ForeachSlot(RDFInternal::AddSlotParameter<ret_type>(f, arg_types()), columns); // delegates to ForeachSlot; AddSlotParameter presumably adapts f to accept the leading slot argument ForeachSlot expects -- confirm in RDFInternal
3470 }
3471
3472 // clang-format off
3473 ////////////////////////////////////////////////////////////////////////////
3474 /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*).
3475 /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations.
3476 /// \param[in] columns Names of the columns/branches in input to the user function.
3477 ///
3478 /// Same as `Foreach`, but the user-defined function takes an extra
3479 /// `unsigned int` as its first parameter, the *processing slot index*.
3480 /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`,
3481 /// for each thread of execution.
3482 /// This is meant as a helper in writing thread-safe `Foreach`
3483 /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`.
3484 /// The user-defined processing callable is able to follow different
3485 /// *streams of processing* indexed by the first parameter.
3486 /// `ForeachSlot` works just as well with single-thread execution: in that
3487 /// case `slot` will always be `0`.
3488 ///
3489 /// ### Example usage:
3490 /// ~~~{.cpp}
3491 /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"});
3492 /// ~~~
3493 // clang-format on
3494 template <typename F>
3495 void ForeachSlot(F f, const ColumnNames_t &columns = {})
3496 {
3498 constexpr auto nColumns = ColTypes_t::list_size;
3499
3502
3503 using Helper_t = RDFInternal::ForeachSlotHelper<F>;
3505
3506 auto action = std::make_unique<Action_t>(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister);
3507
3508 fLoopManager->Run();
3509 }
3510
3511 /// \}
3512 // End of doxygen group for immediate actions
3513 // ----------------------------------------------------------------------------------------
3514
3515 /// \brief Returns the names of the filters created.
3516 /// \return the container of filters names.
3517 ///
3518 /// If called on a root node, the names of all the filters in the computation graph are
3519 /// returned. For any other node, only the names of the filters upstream of that node.
3520 /// Filters without a name are reported as "Unnamed Filter".
3521 /// This is not an action nor a transformation, just a query to the RDataFrame object.
3522 ///
3523 /// ### Example usage:
3524 /// ~~~{.cpp}
3525 /// auto filtNames = d.GetFilterNames();
3526 /// for (auto &&filtName : filtNames) std::cout << filtName << std::endl;
3527 /// ~~~
3528 ///
3529 std::vector<std::string> GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); } // forwards to the internal helper, passing the object proxied by this interface
3530
3531 /// \name User-defined Actions (lazy)
3532 /// Pass user-defined functions to be applied to the data and create results.
3533 /// These actions are lazy, i.e., they only run once a result is actually requested.
3534 /// \{
3535
3536 // clang-format off
3537 ////////////////////////////////////////////////////////////////////////////
3538 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3539 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3540 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3541 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3542 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3543 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3544 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3545 /// \param[in] aggIdentity The aggregator variable of each thread is initialized to this value (or is default-constructed if the parameter is omitted)
3546 /// \return the result of the aggregation wrapped in a RResultPtr.
3547 ///
3548 /// An aggregator callable takes two values, an aggregator variable and a column value. The aggregator variable is
3549 /// initialized to aggIdentity or default-constructed if aggIdentity is omitted.
3550 /// This action calls the aggregator callable for each processed entry, passing in the aggregator variable and
3551 /// the value of the column columnName.
3552 /// If the signature is `U(U,T)` the aggregator variable is then copy-assigned the result of the execution of the callable.
3553 /// Otherwise the signature of aggregator must be `void(U&,T)`.
3554 ///
3555 /// The merger callable is used to merge the partial accumulation results of each processing thread. It is only called in multi-thread executions.
3556 /// If its signature is `U(U,U)` the aggregator variables of each thread are merged two by two.
3557 /// If its signature is `void(std::vector<U>& a)` it is assumed that it merges all aggregators in a[0].
3558 ///
3559 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3560 ///
3561 /// Example usage:
3562 /// ~~~{.cpp}
3563 /// auto aggregator = [](double acc, double x) { return acc * x; };
3564 /// ROOT::EnableImplicitMT();
3565 /// // If multithreading is enabled, the aggregator function will be called by multiple threads
3566 /// // and will produce a vector of partial accumulators.
3567 /// // The merger function performs the final aggregation of these partial results.
3568 /// auto merger = [](std::vector<double> &accumulators) {
3569 /// for (auto i : ROOT::TSeqU(1u, accumulators.size())) {
3570 /// accumulators[0] *= accumulators[i];
3571 /// }
3572 /// };
3573 ///
3574 /// // The accumulator is initialized at this value by every thread.
3575 /// double initValue = 1.;
3576 ///
3577 /// // Multiplies all elements of the column "x"
3578 /// auto result = d.Aggregate(aggregator, merger, "x", initValue);
3579 /// ~~~
3580 // clang-format on
3582 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3583 typename ArgTypesNoDecay = typename TTraits::CallableTraits<AccFun>::arg_types_nodecay,
3584 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3585 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3587 {
3588 RDFInternal::CheckAggregate<R, MergeFun>(ArgTypesNoDecay());
3589 const auto columns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
3590
3593
3594 auto accObjPtr = std::make_shared<U>(aggIdentity);
3595 using Helper_t = RDFInternal::AggregateHelper<AccFun, MergeFun, R, T, U>;
3597 auto action = std::make_unique<Action_t>(
3598 Helper_t(std::move(aggregator), std::move(merger), accObjPtr, fLoopManager->GetNSlots()), validColumnNames,
3600 return MakeResultPtr(accObjPtr, *fLoopManager, std::move(action));
3601 }
3602
3603 // clang-format off
3604 ////////////////////////////////////////////////////////////////////////////
3605 /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot.
3606 /// \tparam AccFun The type of the aggregator callable. Automatically deduced.
3607 /// \tparam U The type of the aggregator variable. Must be default-constructible, copy-constructible and copy-assignable. Automatically deduced.
3608 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3609 /// \param[in] aggregator A callable with signature `U(U,T)` or `void(U&,T)`, where T is the type of the column, U is the type of the aggregator variable
3610 /// \param[in] merger A callable with signature `U(U,U)` or `void(std::vector<U>&)` used to merge the results of the accumulations of each thread
3611 /// \param[in] columnName The column to be aggregated. If omitted, the first default column is used instead.
3612 /// \return the result of the aggregation wrapped in a RResultPtr.
3613 ///
3614 /// See previous Aggregate overload for more information.
3615 // clang-format on
3617 typename ArgTypes = typename TTraits::CallableTraits<AccFun>::arg_types,
3618 typename U = TTraits::TakeFirstParameter_t<ArgTypes>,
3619 typename T = TTraits::TakeFirstParameter_t<TTraits::RemoveFirstParameter_t<ArgTypes>>>
3621 {
3622 static_assert(
3623 std::is_default_constructible<U>::value,
3624 "aggregated object cannot be default-constructed. Please provide an initialisation value (aggIdentity)");
3625 return Aggregate(std::move(aggregator), std::move(merger), columnName, U());
3626 }
3627
3628 // clang-format off
3629 ////////////////////////////////////////////////////////////////////////////
3630 /// \brief Book execution of a custom action using a user-defined helper object.
3631 /// \tparam FirstColumn The type of the first column used by this action. Inferred together with OtherColumns if not present.
3632 /// \tparam OtherColumns A list of the types of the other columns used by this action
3633 /// \tparam Helper The type of the user-defined helper. See below for the required interface it should expose.
3634 /// \param[in] helper The Action Helper to be scheduled.
3635 /// \param[in] columns The names of the columns on which the helper acts.
3636 /// \return the result of the helper wrapped in a RResultPtr.
3637 ///
3638 /// This method books a custom action for execution. The behavior of the action is completely dependent on the
3639 /// Helper object provided by the caller. The required interface for the helper is described below (more
3640 /// methods than the ones required can be present, e.g. a constructor that takes the number of worker threads is usually useful):
3641 ///
3642 /// ### Mandatory interface
3643 ///
3644 /// * `Helper` must publicly inherit from `ROOT::Detail::RDF::RActionImpl<Helper>`
3645 /// * `Helper::Result_t`: public alias for the type of the result of this action helper. `Result_t` must be default-constructible.
3646 /// * `Helper(Helper &&)`: a move-constructor is required. Copy-constructors are discouraged.
3647 /// * `std::shared_ptr<Result_t> GetResultPtr() const`: return a shared_ptr to the result of this action (of type
3648 /// Result_t). The RResultPtr returned by Book will point to this object. Note that this method can be called
3649 /// _before_ Initialize(), because the RResultPtr is constructed before the event loop is started.
3650 /// * `void Initialize()`: this method is called once before starting the event-loop. Useful for setup operations.
3651 /// It must reset the state of the helper to the expected state at the beginning of the event loop: the same helper,
3652 /// or copies of it, might be used for multiple event loops (e.g. in the presence of systematic variations).
3653 /// * `void InitTask(TTreeReader *, unsigned int slot)`: each working thread shall call this method during the event
3654 /// loop, before processing a batch of entries. The pointer passed as argument, if not null, will point to the TTreeReader
3655 /// that RDataFrame has set up to read the task's batch of entries. It is passed to the helper to allow certain advanced optimizations;
3656 /// it should not usually serve any purpose for the Helper. This method is often a no-op for simple helpers.
3657 /// * `void Exec(unsigned int slot, ColumnTypes...columnValues)`: each working thread shall call this method
3658 /// during the event-loop, possibly concurrently. No two threads will ever call Exec with the same 'slot' value:
3659 /// this parameter is there to facilitate writing thread-safe helpers. The other arguments will be the values of
3660 /// the requested columns for the particular entry being processed.
3661 /// * `void Finalize()`: this method is called at the end of the event loop. Commonly used to finalize the contents of the result.
3662 /// * `std::string GetActionName()`: it returns a string identifier for this type of action that RDataFrame will use in
3663 /// diagnostics, SaveGraph(), etc.
3664 ///
3665 /// ### Optional methods
3666 ///
3667 /// If these methods are implemented they enable extra functionality as per the description below.
3668 ///
3669 /// * `Result_t &PartialUpdate(unsigned int slot)`: if present, it must return the value of the partial result of this action for the given 'slot'.
3670 /// Different threads might call this method concurrently, but will do so with different 'slot' numbers.
3671 /// RDataFrame leverages this method to implement RResultPtr::OnPartialResult().
3672 /// * `ROOT::RDF::SampleCallback_t GetSampleCallback()`: if present, it must return a callable with the
3673 /// appropriate signature (see ROOT::RDF::SampleCallback_t) that will be invoked at the beginning of the processing
3674 /// of every sample, as in DefinePerSample().
3675 /// * `Helper MakeNew(void *newResult, std::string_view variation = "nominal")`: if implemented, it enables varying
3676 /// the action's result with VariationsFor(). It takes a type-erased new result that can be safely cast to a
3677 /// `std::shared_ptr<Result_t> *` (a pointer to shared pointer) and should be used as the action's output result.
3678 /// The function optionally takes the name of the current variation which could be useful in customizing its behaviour.
3679 ///
3680 /// In case Book is called without specifying column types as template arguments, corresponding typed code will be just-in-time compiled
3681 /// by RDataFrame. In that case the Helper class needs to be known to the ROOT interpreter.
3682 ///
3683 /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see RResultPtr.
3684 ///
3685 /// ### Examples
3686 /// See [this tutorial](https://root.cern/doc/master/df018__customActions_8C.html) for an example implementation of an action helper.
3687 ///
3688 /// It is also possible to inspect the code used by built-in RDataFrame actions at ActionHelpers.hxx.
3689 ///
3690 // clang-format on
3691 template <typename FirstColumn = RDFDetail::RInferredType, typename... OtherColumns, typename Helper>
3693 {
3694 using HelperT = std::decay_t<Helper>;
3695 // TODO add more static sanity checks on Helper
3697 static_assert(std::is_base_of<AH, HelperT>::value && std::is_convertible<HelperT *, AH *>::value,
3698 "Action helper of type T must publicly inherit from ROOT::Detail::RDF::RActionImpl<T>");
3699
3700 auto hPtr = std::make_shared<HelperT>(std::forward<Helper>(helper));
3701 auto resPtr = hPtr->GetResultPtr();
3702
3703 if (std::is_same<FirstColumn, RDFDetail::RInferredType>::value && columns.empty()) {
3705 } else {
3706 return CreateAction<RDFInternal::ActionTags::Book, FirstColumn, OtherColumns...>(columns, resPtr, hPtr,
3707 fProxiedPtr, columns.size());
3708 }
3709 }
3710
3711
3712 // clang-format off
3713 ////////////////////////////////////////////////////////////////////////////
3714 /// \brief Execute a user-defined reduce operation on the values of a column.
3715 /// \tparam F The type of the reduce callable. Automatically deduced.
3716 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3717 /// \param[in] f A callable with signature `T(T,T)`
3718 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
3719 /// \return the reduced quantity wrapped in a ROOT::RDF::RResultPtr.
3720 ///
3721 /// A reduction takes two values of a column and merges them into one (e.g.
3722 /// by summing them, taking the maximum, etc). This action performs the
3723 /// specified reduction operation on all processed column values, returning
3724 /// a single value of the same type. The callable f must satisfy the general
3725 /// requirements of a *processing function* besides having signature `T(T,T)`
3726 /// where `T` is the type of column columnName.
3727 ///
3728 /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a
3729 /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific
3730 /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this
3731 /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce`
3732 /// overload.
3733 ///
3734 /// ### Example usage:
3735 /// ~~~{.cpp}
3736 /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol");
3737 /// ~~~
3738 ///
3739 /// This action is *lazy*: upon invocation of this method the calculation is
3740 /// booked but not executed. Also see RResultPtr.
3741 // clang-format on
3743 RResultPtr<T> Reduce(F f, std::string_view columnName = "")
3744 {
3745 static_assert( // this overload builds the initial value with T(), so T must be default-constructible
3746 std::is_default_constructible<T>::value,
3747 "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)");
3748 return Reduce(std::move(f), columnName, T()); // delegate to the overload taking an explicit initial value, passing a default-constructed T
3749 }
3750
3751 ////////////////////////////////////////////////////////////////////////////
3752 /// \brief Execute a user-defined reduce operation on the values of a column.
3753 /// \tparam F The type of the reduce callable. Automatically deduced.
3754 /// \tparam T The type of the column to apply the reduction to. Automatically deduced.
3755 /// \param[in] f A callable with signature `T(T,T)`
3756 /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead.
3757 /// \param[in] redIdentity The reduced object of each thread is initialized to this value.
3758 /// \return the reduced quantity wrapped in a RResultPtr.
3759 ///
3760 /// ### Example usage:
3761 /// ~~~{.cpp}
3762 /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42);
3763 /// ~~~
3764 /// See the description of the first Reduce overload for more information.
3766 RResultPtr<T> Reduce(F f, std::string_view columnName, const T &redIdentity)
3767 {
3768 return Aggregate(f, f, columnName, redIdentity); // f is passed to Aggregate twice (first and second callable argument); redIdentity is the initial value
3769 }
3770
3771 /// \}
3772 // End of the doxygen group for user-defined actions
3773
3774private:
3776 std::enable_if_t<std::is_default_constructible<RetType>::value, RInterface<Proxied>>
3777 DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
3778 {
3779 if (where.compare(0, 8, "Redefine") != 0) { // not a Redefine
3783 } else {
3787 }
3788
3789 using ArgTypes_t = typename TTraits::CallableTraits<F>::arg_types;
3791 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::Slot>::value, ArgTypes_t>::type;
3793 std::is_same<DefineType, RDFDetail::ExtraArgsForDefine::SlotAndEntry>::value, ColTypesTmp_t>::type;
3794
3795 constexpr auto nColumns = ColTypes_t::list_size;
3796
3799
3800 // Declare return type to the interpreter, for future use by jitted actions
3802 if (retTypeName.empty()) {
3803 // The type is not known to the interpreter.
3804 // We must not error out here, but if/when this column is used in jitted code
3806 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3807 }
3808
3810 auto newColumn = std::make_shared<NewCol_t>(name, retTypeName, std::forward<F>(expression), validColumnNames,
3812
3814 newCols.AddDefine(std::move(newColumn));
3815
3817
3818 return newInterface;
3819 }
3820
3821 // This overload is chosen when the callable passed to Define or DefineSlot returns a type that is not default-constructible (e.g. void).
3822 // It simply fires a compile-time error. This is preferable to a static_assert in the main `Define` overload because
3823 // this way compilation of `Define` has no way to continue after emitting the error.
3825 bool IsFStringConv = std::is_convertible<F, std::string>::value,
3826 bool IsRetTypeDefConstr = std::is_default_constructible<RetType>::value>
3827 std::enable_if_t<!IsFStringConv && !IsRetTypeDefConstr, RInterface<Proxied>>
3828 DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
3829 {
3830 static_assert(std::is_default_constructible<typename TTraits::CallableTraits<F>::ret_type>::value,
3831 "Error in `Define`: type returned by expression is not default-constructible");
3832 return *this; // never reached
3833 }
3834
3835 ////////////////////////////////////////////////////////////////////////////
3836 /// \brief Implementation of cache.
3837 template <typename... ColTypes, std::size_t... S>
3839 {
3841
3842 // Check at compile time that the columns types are copy constructible
3843 constexpr bool areCopyConstructible =
3844 RDFInternal::TEvalAnd<std::is_copy_constructible<ColTypes>::value...>::value;
3845 static_assert(areCopyConstructible, "Columns of a type which is not copy constructible cannot be cached yet.");
3846
3848
3849 auto colHolders = std::make_tuple(Take<ColTypes>(columnListWithoutSizeColumns[S])...);
3850 auto ds = std::make_unique<RLazyDS<ColTypes...>>(
3851 std::make_pair(columnListWithoutSizeColumns[S], std::get<S>(colHolders))...);
3852
3853 RInterface<RLoopManager> cachedRDF(std::make_shared<RLoopManager>(std::move(ds), columnListWithoutSizeColumns));
3854
3855 return cachedRDF;
3856 }
3857
3858 template <bool IsSingleColumn, typename F>
3860 VaryImpl(const std::vector<std::string> &colNames, F &&expression, const ColumnNames_t &inputColumns,
3861 const std::vector<std::string> &variationTags, std::string_view variationName)
3862 {
3863 using F_t = std::decay_t<F>;
3864 using ColTypes_t = typename TTraits::CallableTraits<F_t>::arg_types;
3865 using RetType = typename TTraits::CallableTraits<F_t>::ret_type;
3866 constexpr auto nColumns = ColTypes_t::list_size;
3867
3869
3872
3874 if (retTypeName.empty()) {
3875 // The type is not known to the interpreter, but we don't want to error out
3876 // here, rather if/when this column is used in jitted code, so we inject a broken but telling type name.
3878 retTypeName = "CLING_UNKNOWN_TYPE_" + demangledType;
3879 }
3880
3881 auto variation = std::make_shared<RDFInternal::RVariation<F_t, IsSingleColumn>>(
3882 colNames, variationName, std::forward<F>(expression), variationTags, retTypeName, fColRegister, *fLoopManager,
3884
3886 newCols.AddVariation(std::move(variation));
3887
3889
3890 return newInterface;
3891 }
3892
3893 RInterface<Proxied> JittedVaryImpl(const std::vector<std::string> &colNames, std::string_view expression,
3894 const std::vector<std::string> &variationTags, std::string_view variationName,
3895 bool isSingleColumn)
3896 {
3897 R__ASSERT(!variationTags.empty() && "Must have at least one variation.");
3898 R__ASSERT(!colNames.empty() && "Must have at least one varied column.");
3899 R__ASSERT(!variationName.empty() && "Must provide a variation name.");
3900
3901 for (auto &colName : colNames) {
3905 }
3907
3908 // when varying multiple columns, they must be different columns
3909 if (colNames.size() > 1) {
3910 std::set<std::string> uniqueCols(colNames.begin(), colNames.end());
3911 if (uniqueCols.size() != colNames.size())
3912 throw std::logic_error("A column name was passed to the same Vary invocation multiple times.");
3913 }
3914
3915 // Cannot vary different input column types, assume the first
3917 auto jittedVariation =
3920
3922 newColRegister.AddVariation(std::move(jittedVariation));
3923
3925
3926 return newInterface;
3927 }
3928
3929 template <typename Helper, typename ActionResultType>
3930 auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &resPtr,
3931 const std::shared_ptr<Helper> &hPtr,
3933 -> decltype(hPtr->Exec(0u), RResultPtr<ActionResultType>{})
3934 {
3936 }
3937
3938 template <typename Helper, typename ActionResultType, typename... Others>
3940 CallCreateActionWithoutColsIfPossible(const std::shared_ptr<ActionResultType> &,
3941 const std::shared_ptr<Helper>& /*hPtr*/,
3942 Others...)
3943 {
3944 throw std::logic_error(std::string("An action was booked with no input columns, but the action requires "
3945 "columns! The action helper type was ") +
3946 typeid(Helper).name());
3947 return {};
3948 }
3949
3950protected:
3951 RInterface(const std::shared_ptr<Proxied> &proxied, RLoopManager &lm,
3954 {
3955 }
3956
3957 const std::shared_ptr<Proxied> &GetProxiedPtr() const { return fProxiedPtr; } // read-only access to the shared pointer held in fProxiedPtr
3958};
3959
3960} // namespace RDF
3961
3962} // namespace ROOT
3963
3964#endif // ROOT_RDF_TINTERFACE
#define f(i)
Definition RSha256.hxx:104
#define h(i)
Definition RSha256.hxx:106
#define e(i)
Definition RSha256.hxx:103
Basic types used by ROOT and required by TInterpreter.
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int)
Definition RtypesCore.h:60
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
#define X(type, name)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
char name[80]
Definition TGX11.cxx:148
Base class for action helpers, see RInterface::Book() for more information.
Implementation of the FilterAvailable and FilterMissing operations.
The head node of a RDF computation graph.
A histogram data structure to bin data along multiple dimensions.
A histogram for aggregation of data along multiple dimensions.
Definition RHist.hxx:67
Helper class that provides the operation graph nodes.
A RDataFrame node that produces a result.
Definition RAction.hxx:53
A binder for user-defined columns, variations and aliases.
std::vector< std::string_view > GenerateColumnNames() const
Return the list of the names of the defined columns (Defines + Aliases).
RDFDetail::RDefineBase * GetDefine(std::string_view colName) const
Return the RDefine for the requested column name, or nullptr.
The dataset specification for RDataFrame.
virtual const std::vector< std::string > & GetColumnNames() const =0
Returns a reference to the collection of the dataset's column names.
The base public interface to the RDataFrame federation of classes.
std::string GetColumnType(std::string_view column)
Return the type of a given column as a string.
ColumnNames_t GetValidatedColumnNames(const unsigned int nColumns, const ColumnNames_t &columns)
ColumnNames_t GetColumnTypeNamesList(const ColumnNames_t &columnList)
std::shared_ptr< ROOT::Detail::RDF::RLoopManager > fLoopManager
The RLoopManager at the root of this computation graph. Never null.
RResultPtr< ActionResultType > CreateAction(const ColumnNames_t &columns, const std::shared_ptr< ActionResultType > &r, const std::shared_ptr< HelperArgType > &helperArg, const std::shared_ptr< RDFNode > &proxiedPtr, const int=-1)
Create RAction object, return RResultPtr for the action. Overload for the case in which all column typ...
RDataSource * GetDataSource() const
void CheckAndFillDSColumns(ColumnNames_t validCols, TTraits::TypeList< ColumnTypes... > typeList)
void CheckIMTDisabled(std::string_view callerName)
ColumnNames_t GetColumnNames()
Returns the names of the available columns.
RDFDetail::RLoopManager * GetLoopManager() const
RDFInternal::RColumnRegister fColRegister
Contains the columns defined up to this node.
The public interface to the RDataFrame federation of classes.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a one-dimensional profile (lazy action).
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr<::TGraph > Graph(std::string_view x="", std::string_view y="")
Fill and return a TGraph object (lazy action).
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::uint64_t nNormalBins, std::pair< double, double > interval, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional RHist with weights (lazy action).
RInterface(const RInterface &)=default
Copy-ctor for RInterface.
RResultPtr< RDFDetail::MaxReturnType_t< T > > Max(std::string_view columnName="")
Return the maximum of processed column values (lazy action).
auto CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &resPtr, const std::shared_ptr< Helper > &hPtr, TTraits::TypeList< RDFDetail::RInferredType >) -> decltype(hPtr->Exec(0u), RResultPtr< ActionResultType >{})
RInterface(RInterface &&)=default
Move-ctor for RInterface.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName="")
Register systematic variations for a single existing column using custom variation tags.
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const std::initializer_list< std::string > &columns)
Append a filter to the call graph.
RInterface< RLoopManager > Cache(std::initializer_list< std::string > columnList)
Save selected columns in memory.
RInterface< Proxied > Vary(std::string_view colName, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, std::string_view expression, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a two-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp="", const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< RDisplay > Display(const ColumnNames_t &columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RResultPtr< RDisplay > Display(std::initializer_list< std::string > columnList, size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface(const std::shared_ptr< RLoopManager > &proxied)
Build a RInterface from a RLoopManager.
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Redefine(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
std::shared_ptr< Proxied > fProxiedPtr
Smart pointer to the graph node encapsulated by this RInterface.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< Proxied > Vary(std::string_view colName, std::string_view expression, std::size_t nVariations, std::string_view variationName="")
Register systematic variations for a single existing column using auto-generated variation tags.
RResultPtr<::TH1D > Histo1D(std::string_view vName)
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int begin, unsigned int end, unsigned int stride=1)
Creates a node that filters entries based on range: [begin, end).
RResultPtr< typename std::decay_t< Helper >::Result_t > Book(Helper &&helper, const ColumnNames_t &columns={})
Book execution of a custom action using a user-defined helper object.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::vector< ROOT::Experimental::RAxisVariant > axes, const ColumnNames_t &columnList)
Fill and return an RHist (lazy action).
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a one-dimensional profile (lazy action).
const std::shared_ptr< Proxied > & GetProxiedPtr() const
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.})
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< T > Reduce(F f, std::string_view columnName="")
Execute a user-defined reduce operation on the values of a column.
RResultPtr< T > Reduce(F f, std::string_view columnName, const T &redIdentity)
Execute a user-defined reduce operation on the values of a column.
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RResultPtr< RDisplay > Display(std::string_view columnNameRegexp="", size_t nRows=5, size_t nMaxCollectionElements=10)
Provides a representation of the columns in the dataset.
RInterface & operator=(const RInterface &)=default
Copy-assignment operator for RInterface.
RInterface< Proxied > VaryImpl(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
RResultPtr<::THnSparseD > HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return a sparse N-dimensional histogram (lazy action).
RInterface< Proxied > Define(std::string_view name, std::string_view expression)
Define a new column.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterAvailable(std::string_view column)
Discard entries with missing values.
std::enable_if_t<!IsFStringConv &&!IsRetTypeDefConstr, RInterface< Proxied > > DefineImpl(std::string_view, F, const ColumnNames_t &, const std::string &)
RInterface< Proxied > Redefine(std::string_view name, std::string_view expression)
Overwrite the value and/or type of an existing column.
std::vector< std::string > GetFilterNames()
Returns the names of the filters created.
RInterface< RLoopManager > Cache(std::string_view columnNameRegexp="")
Save selected columns in memory.
RResultPtr<::TH1D > Histo1D(const TH1DModel &model={"", "", 128u, 0., 0.}, std::string_view vName="")
Fill and return a one-dimensional histogram with the values of a column (lazy action).
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, const std::vector< std::string > &variationTags, std::string_view variationName)
Register systematic variations for multiple existing columns using custom variation tags.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view v3Name, std::string_view wName)
Fill and return a three-dimensional histogram (lazy action).
friend class RDFInternal::GraphDrawing::GraphCreatorHelper
RResultPtr< ROOT::Experimental::RHistEngine< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHistEngine< BinContentType > > h, const ColumnNames_t &columnList)
Fill the provided RHistEngine (lazy action).
RInterface< RLoopManager > CacheImpl(const ColumnNames_t &columnList, std::index_sequence< S... >)
Implementation of cache.
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a two-dimensional profile (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, std::string_view name)
Append a filter to the call graph.
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName="")
Execute a user-defined accumulation operation on the processed column values in each processing slot.
std::enable_if_t< std::is_default_constructible< RetType >::value, RInterface< Proxied > > DefineImpl(std::string_view name, F &&expression, const ColumnNames_t &columns, const std::string &where)
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHist< BinContentType > > h, const ColumnNames_t &columnList)
Fill the provided RHist (lazy action).
RInterface(const std::shared_ptr< Proxied > &proxied, RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister)
RResultPtr< COLL > Take(std::string_view column="")
Return a collection of values of a column (lazy action, returns a std::vector by default).
RInterface< Proxied > Alias(std::string_view alias, std::string_view columnName)
Allow to refer to a column with a different name.
RResultPtr< RDFDetail::MinReturnType_t< T > > Min(std::string_view columnName="")
Return the minimum of processed column values (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RResultPtr< ROOT::Experimental::RHistEngine< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHistEngine< BinContentType > > h, const ColumnNames_t &columnList, std::string_view wName)
Fill the provided RHistEngine with weights (lazy action).
RResultPtr< RCutFlowReport > Report()
Gather filtering statistics.
RResultPtr<::TH3D > Histo3D(const TH3DModel &model)
RResultPtr<::TH3D > Histo3D(const TH3DModel &model, std::string_view v1Name="", std::string_view v2Name="", std::string_view v3Name="")
Fill and return a three-dimensional histogram (lazy action).
RResultPtr<::TH1D > Histo1D(std::string_view vName, std::string_view wName)
Fill and return a one-dimensional histogram with the weighted values of a column (lazy action).
RInterface< Proxied > DefinePerSample(std::string_view name, std::string_view expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > DefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot and the current entry.
RResultPtr< std::decay_t< T > > Fill(T &&model, const ColumnNames_t &columnList)
Return an object of type T on which T::Fill will be called once per event (lazy action).
RInterface< Proxied > DefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column with a value dependent on the processing slot.
RInterface< RDFDetail::RFilterWithMissingValues< Proxied > > FilterMissing(std::string_view column)
Keep only the entries that have missing values.
RResultPtr< TStatistic > Stats(std::string_view value="")
Return a TStatistic object, filled once per event (lazy action).
RInterface< Proxied > JittedVaryImpl(const std::vector< std::string > &colNames, std::string_view expression, const std::vector< std::string > &variationTags, std::string_view variationName, bool isSingleColumn)
RInterface< Proxied > DefaultValueFor(std::string_view column, const T &defaultValue)
In case the value in the given column is missing, provide a default value.
RResultPtr< TStatistic > Stats(std::string_view value, std::string_view weight)
Return a TStatistic object, filled once per event (lazy action).
RResultPtr<::TProfile2D > Profile2D(const TProfile2DModel &model)
Fill and return a two-dimensional profile (lazy action).
RInterface< Proxied > RedefineSlot(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
void Foreach(F f, const ColumnNames_t &columns={})
Execute a user-defined function on each entry (instant action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name="", std::string_view v2Name="")
Fill and return a two-dimensional histogram (lazy action).
RResultPtr< ActionResultType > CallCreateActionWithoutColsIfPossible(const std::shared_ptr< ActionResultType > &, const std::shared_ptr< Helper > &, Others...)
RInterface< Proxied > Define(std::string_view name, F expression, const ColumnNames_t &columns={})
Define a new column.
void ForeachSlot(F f, const ColumnNames_t &columns={})
Execute a user-defined function requiring a processing slot index on each entry (instant action).
RResultPtr<::TGraphAsymmErrors > GraphAsymmErrors(std::string_view x="", std::string_view y="", std::string_view exl="", std::string_view exh="", std::string_view eyl="", std::string_view eyh="")
Fill and return a TGraphAsymmErrors object (lazy action).
RResultPtr< U > Aggregate(AccFun aggregator, MergeFun merger, std::string_view columnName, const U &aggIdentity)
Execute a user-defined accumulation operation on the processed column values in each processing slot.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::shared_ptr< ROOT::Experimental::RHist< BinContentType > > h, const ColumnNames_t &columnList, std::string_view wName)
Fill the provided RHist with weights (lazy action).
RResultPtr<::TProfile > Profile1D(const TProfile1DModel &model)
Fill and return a one-dimensional profile (lazy action).
RResultPtr< RInterface< RLoopManager > > Snapshot(std::string_view treename, std::string_view filename, std::initializer_list< std::string > columnList, const RSnapshotOptions &options=RSnapshotOptions())
Save selected columns to disk, in a new TTree or RNTuple treename in file filename.
RInterface & operator=(RInterface &&)=default
Move-assignment operator for RInterface.
RResultPtr<::TH2D > Histo2D(const TH2DModel &model)
RResultPtr< double > Mean(std::string_view columnName="")
Return the mean of processed column values (lazy action).
RInterface< RDFDetail::RFilter< F, Proxied > > Filter(F f, const ColumnNames_t &columns={}, std::string_view name="")
Append a filter to the call graph.
RInterface< RLoopManager > Cache(const ColumnNames_t &columnList)
Save selected columns in memory.
RInterface< Proxied > DefinePerSample(std::string_view name, F expression)
Define a new column that is updated when the input sample changes.
RInterface< Proxied > Vary(std::initializer_list< std::string > colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RInterface< RDFDetail::RRange< Proxied > > Range(unsigned int end)
Creates a node that filters entries based on range.
RInterface< Proxied > RedefineSlotEntry(std::string_view name, F expression, const ColumnNames_t &columns={})
Overwrite the value and/or type of an existing column.
RInterface< RDFDetail::RJittedFilter > Filter(std::string_view expression, std::string_view name="")
Append a filter to the call graph.
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::uint64_t nNormalBins, std::pair< double, double > interval, std::string_view vName)
Fill and return a one-dimensional RHist (lazy action).
RResultPtr< ROOT::Experimental::RHist< BinContentType > > Hist(std::vector< ROOT::Experimental::RAxisVariant > axes, const ColumnNames_t &columnList, std::string_view wName)
Fill and return an RHist with weights (lazy action).
RResultPtr< ULong64_t > Count()
Return the number of entries processed (lazy action).
RResultPtr<::TH2D > Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName)
Fill and return a weighted two-dimensional histogram (lazy action).
RInterface< Proxied > Vary(const std::vector< std::string > &colNames, F &&expression, const ColumnNames_t &inputColumns, std::size_t nVariations, std::string_view variationName)
Register systematic variations for multiple existing columns using auto-generated variation tags.
RResultPtr<::THnD > HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName="")
Fill and return an N-dimensional histogram (lazy action).
RResultPtr< double > StdDev(std::string_view columnName="")
Return the unbiased standard deviation of processed column values (lazy action).
RResultPtr< RDFDetail::SumReturnType_t< T > > Sum(std::string_view columnName="", const RDFDetail::SumReturnType_t< T > &initValue=RDFDetail::SumReturnType_t< T >{})
Return the sum of processed column values (lazy action).
A RDataSource implementation which is built on top of result proxies.
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
const_iterator begin() const
const_iterator end() const
typename RemoveFirstParameter< T >::type RemoveFirstParameter_t
TDirectory::TContext keeps track of and restores the current directory.
Definition TDirectory.h:89
A TGraph is an object made of two arrays X and Y with npoints each.
Definition TGraph.h:41
@ kAllAxes
Definition TH1.h:126
Statistical variable, defined by its mean and variance (RMS).
Definition TStatistic.h:33
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
void CheckForNoVariations(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister)
Throw if the column has systematic variations attached.
ParsedTreePath ParseTreePath(std::string_view fullTreeName)
const std::type_info & TypeName2TypeID(const std::string &name)
Return the type_info associated to a name.
Definition RDFUtils.cxx:86
void ChangeEmptyEntryRange(const ROOT::RDF::RNode &node, std::pair< ULong64_t, ULong64_t > &&newRange)
std::shared_ptr< RJittedDefine > BookDefinePerSampleJit(std::string_view name, std::string_view expression, RLoopManager &lm, const RColumnRegister &colRegister)
Book the jitting of a DefinePerSample call.
void CheckValidCppVarName(std::string_view var, const std::string &where)
void ChangeSpec(const ROOT::RDF::RNode &node, ROOT::RDF::Experimental::RDatasetSpec &&spec)
Changes the input dataset specification of an RDataFrame.
const std::vector< std::string > & GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds)
Definition RDFUtils.cxx:669
void RemoveDuplicates(ColumnNames_t &columnNames)
std::shared_ptr< RNodeBase > UpcastNode(std::shared_ptr< RNodeBase > ptr)
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:200
void CheckSnapshotOptionsFormatCompatibility(const ROOT::RDF::RSnapshotOptions &opts)
void CheckForDefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is not already there.
std::vector< std::string > GetFilterNames(const std::shared_ptr< RLoopManager > &loopManager)
std::string GetDataSourceLabel(const ROOT::RDF::RNode &node)
std::string PrettyPrintAddr(const void *const addr)
std::shared_ptr< RDFDetail::RJittedFilter > BookFilterJit(std::shared_ptr< RDFDetail::RNodeBase > prevNode, std::string_view name, std::string_view expression, const RColumnRegister &colRegister, TTree *tree, RDataSource *ds)
Book the jitting of a Filter call.
void TriggerRun(ROOT::RDF::RNode node)
Trigger the execution of an RDataFrame computation graph.
void CheckTypesAndPars(unsigned int nTemplateParams, unsigned int nColumnNames)
std::string DemangleTypeIdName(const std::type_info &typeInfo)
bool AtLeastOneEmptyString(const std::vector< std::string_view > strings)
std::pair< std::vector< std::string >, std::vector< std::string > > AddSizeBranches(ROOT::RDF::RDataSource *ds, std::vector< std::string > &&colsWithoutAliases, std::vector< std::string > &&colsWithAliases)
Return copies of colsWithoutAliases and colsWithAliases with size branches for variable-sized array b...
std::string ColumnName2ColumnTypeName(const std::string &colName, TTree *, RDataSource *, RDefineBase *, bool vector2RVec=true)
Return a string containing the type of the given branch.
Definition RDFUtils.cxx:339
void SetTTreeLifeline(ROOT::RDF::RNode &node, std::any lifeline)
void RemoveRNTupleSubfields(ColumnNames_t &columnNames)
std::vector< std::pair< std::uint64_t, std::uint64_t > > GetDatasetGlobalClusterBoundaries(const RNode &node)
Retrieve the cluster boundaries for each cluster in the dataset, across files, with a global offset.
ColumnNames_t FilterArraySizeColNames(const ColumnNames_t &columnNames, const std::string &action)
Take a list of column names, return that list with entries starting by '#' filtered out.
void WarnHist()
Warn once about experimental filling of RHist.
Definition RDFUtils.cxx:55
void CheckForDuplicateSnapshotColumns(const ColumnNames_t &cols)
ColumnNames_t ConvertRegexToColumns(const ColumnNames_t &colNames, std::string_view columnNameRegexp, std::string_view callerName)
void CheckForRedefinition(const std::string &where, std::string_view definedColView, const RColumnRegister &colRegister, const ColumnNames_t &dataSourceColumns)
Throw if column definedColView is already there.
std::shared_ptr< RJittedDefine > BookDefineJit(std::string_view name, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister)
Book the jitting of a Define call.
std::shared_ptr< RJittedVariation > BookVariationJit(const std::vector< std::string > &colNames, std::string_view variationName, const std::vector< std::string > &variationTags, std::string_view expression, RLoopManager &lm, RDataSource *ds, const RColumnRegister &colRegister, bool isSingleColumn, const std::string &varyColType)
Book the jitting of a Vary call.
void ChangeBeginAndEndEntries(const RNode &node, Long64_t begin, Long64_t end)
RInterface<::ROOT::Detail::RDF::RNodeBase > RNode
std::vector< std::string > ColumnNames_t
ROOT type_traits extensions.
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:613
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:669
@ kError
An error.
void DisableImplicitMT()
Disables the implicit multi-threading in ROOT (see EnableImplicitMT).
Definition TROOT.cxx:655
A special bin content type to compute the bin error in weighted filling.
type is TypeList if MustRemove is false, otherwise it is a TypeList with the first type removed
Definition Utils.hxx:156
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:347
A collection of options to steer the creation of the dataset on disk through Snapshot().
A struct which stores some basic parameters of a TH1D.
std::shared_ptr<::TH1D > GetHistogram() const
A struct which stores some basic parameters of a TH2D.
std::shared_ptr<::TH2D > GetHistogram() const
A struct which stores some basic parameters of a TH3D.
std::shared_ptr<::TH3D > GetHistogram() const
A struct which stores some basic parameters of a THnD.
std::shared_ptr<::THnD > GetHistogram() const
A struct which stores some basic parameters of a THnSparseD.
std::shared_ptr<::THnSparseD > GetHistogram() const
A struct which stores some basic parameters of a TProfile.
std::shared_ptr<::TProfile > GetProfile() const
A struct which stores some basic parameters of a TProfile2D.
std::shared_ptr<::TProfile2D > GetProfile() const
Lightweight storage for a collection of types.