Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
df001_introduction.C
Go to the documentation of this file.
1/// \file
2/// \ingroup tutorial_dataframe
3/// \notebook -nodraw
4/// Basic RDataFrame usage.
5///
6/// This tutorial illustrates the basic features of the RDataFrame class,
7/// a utility which allows to interact with data stored in TTrees following
8/// a functional-chain like approach.
9///
10/// \macro_code
11/// \macro_output
12///
13/// \date December 2016
14/// \author Enrico Guiraud (CERN)
15
16// ## Preparation
17
18// A simple helper function to fill a test tree: this makes the example
19// stand-alone.
20void fill_tree(const char *treeName, const char *fileName)
21{
23 int i(0);
24 d.Define("b1", [&i]() { return (double)i; })
25 .Define("b2",
26 [&i]() {
27 auto j = i * i;
28 ++i;
29 return j;
30 })
31 .Snapshot(treeName, fileName);
32}
33
35{
36
37 // We prepare an input tree to run on
38 auto fileName = "df001_introduction.root";
39 auto treeName = "myTree";
40 fill_tree(treeName, fileName);
41
42 // We read the tree from the file and create a RDataFrame, a class that
43 // allows us to interact with the data contained in the tree.
44 // We select a default column, a *branch* to adopt ROOT jargon, which will
45 // be looked at if none is specified by the user when dealing with filters
46 // and actions.
47 ROOT::RDataFrame d(treeName, fileName, {"b1"});
48
49 // ## Operations on the dataframe
50 // We now review some *actions* which can be performed on the data frame.
51 // Actions can be divided into instant actions (e. g. Foreach()) and lazy
52 // actions (e. g. Count()), depending on whether they trigger the event
53 // loop immediately or only when one of the results is accessed for the
54 // first time. Actions that return "something" either return their result
55 // wrapped in a RResultPtr or in a RDataFrame.
56 // But first of all, let us define our cut-flow with two lambda
57 // functions. We can use free functions too.
58 auto cutb1 = [](double b1) { return b1 < 5.; };
59 auto cutb1b2 = [](int b2, double b1) { return b2 % 2 && b1 < 4.; };
60
61 // ### `Count` action
62 // The `Count` allows to retrieve the number of the entries that passed the
63 // filters. Here, we show how the automatic selection of the column kicks
64 // in in case the user specifies none.
65 auto entries1 = d.Filter(cutb1) // <- no column name specified here!
66 .Filter(cutb1b2, {"b2", "b1"})
67 .Count();
68
69 std::cout << *entries1 << " entries passed all filters" << std::endl;
70
71 // Filters can be expressed as strings. The content must be C++ code. The
72 // name of the variables must be the name of the branches. The code is
73 // just-in-time compiled.
74 auto entries2 = d.Filter("b1 < 5.").Count();
75 std::cout << *entries2 << " entries passed the string filter" << std::endl;
76
77 // ### `Min`, `Max` and `Mean` actions
78 // These actions allow to retrieve statistical information about the entries
79 // passing the cuts, if any.
80 auto b1b2_cut = d.Filter(cutb1b2, {"b2", "b1"});
81 auto minVal = b1b2_cut.Min();
82 auto maxVal = b1b2_cut.Max();
83 auto meanVal = b1b2_cut.Mean();
84 auto nonDefmeanVal = b1b2_cut.Mean("b2"); // <- Column is not the default
85 std::cout << "The mean is always included between the min and the max: " << *minVal << " <= " << *meanVal
86 << " <= " << *maxVal << std::endl;
87
88 // ### `Take` action
89 // The `Take` action allows to retrieve all values of the variable stored in a
90 // particular column that passed filters we specified. The values are stored
91 // in a vector by default, but other collections can be chosen.
92 auto b1_cut = d.Filter(cutb1);
93 auto b1Vec = b1_cut.Take<double>();
94 auto b1List = b1_cut.Take<double, std::list<double>>();
95
96 std::cout << "Selected b1 entries" << std::endl;
97 for (auto b1_entry : *b1List)
98 std::cout << b1_entry << " ";
99 std::cout << std::endl;
100 auto b1VecCl = ROOT::GetClass(b1Vec.GetPtr());
101 std::cout << "The type of b1Vec is " << b1VecCl->GetName() << std::endl;
102
103 // ### `Histo1D` action
104 // The `Histo1D` action allows to fill an histogram. It returns a TH1D filled
105 // with values of the column that passed the filters. For the most common
106 // types, the type of the values stored in the column is automatically
107 // guessed.
108 auto hist = d.Filter(cutb1).Histo1D();
109 std::cout << "Filled h " << hist->GetEntries() << " times, mean: " << hist->GetMean() << std::endl;
110
111 // ### `Foreach` action
112 // The most generic action of all: an operation is applied to all entries.
113 // In this case we fill a histogram. In some sense this is a violation of a
114 // purely functional paradigm - C++ allows to do that.
115 TH1F h("h", "h", 12, -1, 11);
116 d.Filter([](int b2) { return b2 % 2 == 0; }, {"b2"}).Foreach([&h](double b1) { h.Fill(b1); });
117
118 std::cout << "Filled h with " << h.GetEntries() << " entries" << std::endl;
119
120 // ## Express your chain of operations with clarity!
121 // We are discussing an example here but it is not hard to imagine much more
122 // complex pipelines of actions acting on data. Those might require code
123 // which is well organised, for example allowing to conditionally add filters
124 // or again to clearly separate filters and actions without the need of
125 // writing the entire pipeline on one line. This can be easily achieved.
126 // We'll show this by re-working the `Count` example:
127 auto cutb1_result = d.Filter(cutb1);
128 auto cutb1b2_result = d.Filter(cutb1b2, {"b2", "b1"});
129 auto cutb1_cutb1b2_result = cutb1_result.Filter(cutb1b2, {"b2", "b1"});
130 // Now we want to count:
131 auto evts_cutb1_result = cutb1_result.Count();
132 auto evts_cutb1b2_result = cutb1b2_result.Count();
133 auto evts_cutb1_cutb1b2_result = cutb1_cutb1b2_result.Count();
134
135 std::cout << "Events passing cutb1: " << *evts_cutb1_result << std::endl
136 << "Events passing cutb1b2: " << *evts_cutb1b2_result << std::endl
137 << "Events passing both: " << *evts_cutb1_cutb1b2_result << std::endl;
138
139 // ## Calculating quantities starting from existing columns
140 // Often, operations need to be carried out on quantities calculated starting
141 // from the ones present in the columns. We'll create in this example a third
142 // column, the values of which are the sum of the *b1* and *b2* ones, entry by
143 // entry. The way in which the new quantity is defined is via a callable.
144 // It is important to note two aspects at this point:
145 // - The value is created on the fly only if the entry passed the existing
146 // filters.
147 // - The newly created column behaves as the one present on the file on disk.
148 // - The operation creates a new value, without modifying anything. De facto,
149 // this is like having a general container at disposal able to accommodate
150 // any value of any type.
151 // Let's dive in an example:
152 auto entries_sum = d.Define("sum", [](double b1, int b2) { return b2 + b1; }, {"b1", "b2"})
153 .Filter([](double sum) { return sum > 4.2; }, {"sum"})
154 .Count();
155 std::cout << *entries_sum << std::endl;
156
157 // Additional columns can be expressed as strings. The content must be C++
158 // code. The name of the variables must be the name of the branches. The code
159 // is just-in-time compiled.
160 auto entries_sum2 = d.Define("sum2", "b1 + b2").Filter("sum2 > 4.2").Count();
161 std::cout << *entries_sum2 << std::endl;
162
163 // It is possible at any moment to read the entry number and the processing
164 // slot number. The latter may change when implicit multithreading is active.
165 // The special columns which provide the entry number and the slot index are
166 // called "rdfentry_" and "rdfslot_" respectively. Their types are an unsigned
167 // 64 bit integer and an unsigned integer.
168 auto printEntrySlot = [](ULong64_t iEntry, unsigned int slot) {
169 std::cout << "Entry: " << iEntry << " Slot: " << slot << std::endl;
170 };
171 d.Foreach(printEntrySlot, {"rdfentry_", "rdfslot_"});
172
173 return 0;
174}
#define d(i)
Definition RSha256.hxx:102
#define h(i)
Definition RSha256.hxx:106
unsigned long long ULong64_t
Definition RtypesCore.h:81
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
1-D histogram with a float per channel (see TH1 documentation)
Definition TH1.h:621
RVec< T > Filter(const RVec< T > &v, F &&f)
Create a new collection with the elements passing the filter expressed by the predicate.
Definition RVec.hxx:2145
TClass * GetClass(T *)
Definition TClass.h:659
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345