Logo ROOT   6.14/05
Reference Guide
df001_introduction.C
Go to the documentation of this file.
1 /// \file
2 /// \ingroup tutorial_dataframe
3 /// \notebook -nodraw
4 /// This tutorial illustrates the basic features of the RDataFrame class,
5 /// a utility which allows one to interact with data stored in TTrees following
6 /// a functional-chain-like approach.
7 ///
8 /// \macro_code
9 ///
10 /// \date December 2016
11 /// \author Enrico Guiraud
12 
13 // ## Preparation
14 
15 // A simple helper function to fill a test tree: this makes the example
16 // stand-alone. It defines two columns, "b1" and "b2", from a shared counter and snapshots them into tree `treeName` of file `fileName`.
17 void fill_tree(const char *treeName, const char *fileName)
18 {
19  ROOT::RDataFrame d(10); // data frame built from the integer 10 (presumably the number of entries to generate -- confirm against the RDataFrame constructor docs)
20  int i(0); // counter captured by reference by both lambdas below
21  d.Define("b1", [&i]() { return (double)i; }) // b1: current counter value, as a double
22  .Define("b2",
23  [&i]() {
24  auto j = i * i; // b2: square of the current counter value
25  ++i; // the counter is advanced only here, so the result depends on b1 being evaluated before b2 for each entry
26  return j;
27  })
28  .Snapshot(treeName, fileName); // store the defined columns under the given tree and file names
29 }
30 
32 {
33 
34  // We prepare an input tree to run on
35  auto fileName = "df001_introduction.root";
36  auto treeName = "myTree";
37  fill_tree(treeName, fileName);
38 
39  // We read the tree from the file and create a RDataFrame, a class that
40  // allows us to interact with the data contained in the tree.
41  // We select a default column, a *branch* to adopt ROOT jargon, which will
42  // be looked at if none is specified by the user when dealing with filters
43  // and actions.
44  ROOT::RDataFrame d(treeName, fileName, {"b1"});
45 
46  // ## Operations on the dataframe
47  // We now review some *actions* which can be performed on the data frame.
48  // All actions but Foreach return an RResultPtr<T>. The series of
49  // operations on the data frame is not executed until one of those pointers
50  // is accessed. If the Foreach action is invoked, the execution is immediate.
51  // But first of all, let us now define our cut-flow with two lambda
52  // functions. We can use free functions too.
53  auto cutb1 = [](double b1) { return b1 < 5.; };
54  auto cutb1b2 = [](int b2, double b1) { return b2 % 2 && b1 < 4.; };
55 
56  // ### `Count` action
57  // The `Count` action allows one to retrieve the number of entries that passed the
58  // filters. Here we show how the automatic selection of the column kicks
59  // in when the user specifies none.
60  auto entries1 = d.Filter(cutb1) // <- no column name specified here!
61  .Filter(cutb1b2, {"b2", "b1"})
62  .Count();
63 
64  std::cout << *entries1 << " entries passed all filters" << std::endl;
65 
66  // Filters can be expressed as strings. The content must be C++ code. The
67  // names of the variables must be the names of the branches. The code is
68  // just-in-time compiled.
69  auto entries2 = d.Filter("b1 < 5.").Count();
70  std::cout << *entries2 << " entries passed the string filter" << std::endl;
71 
72  // ### `Min`, `Max` and `Mean` actions
73  // These actions allow to retrieve statistical information about the entries
74  // passing the cuts, if any.
75  auto b1b2_cut = d.Filter(cutb1b2, {"b2", "b1"});
76  auto minVal = b1b2_cut.Min();
77  auto maxVal = b1b2_cut.Max();
78  auto meanVal = b1b2_cut.Mean();
79  auto nonDefmeanVal = b1b2_cut.Mean("b2"); // <- Column is not the default
80  std::cout << "The mean is always included between the min and the max: " << *minVal << " <= " << *meanVal
81  << " <= " << *maxVal << std::endl;
82 
83  // ### `Take` action
84  // The `Take` action allows one to retrieve all values of the variable stored in a
85  // particular column that passed the filters we specified. The values are stored
86  // in a std::vector by default, but other collections can be chosen.
87  auto b1_cut = d.Filter(cutb1);
88  auto b1Vec = b1_cut.Take<double>();
89  auto b1List = b1_cut.Take<double, std::list<double>>();
90 
91  std::cout << "Selected b1 entries" << std::endl;
92  for (auto b1_entry : *b1List)
93  std::cout << b1_entry << " ";
94  std::cout << std::endl;
95  auto b1VecCl = TClass::GetClass(typeid(*b1Vec));
96  std::cout << "The type of b1Vec is " << b1VecCl->GetName() << std::endl;
97 
98  // ### `Histo1D` action
99  // The `Histo1D` action allows one to fill a histogram. It returns a TH1D filled
100  // with values of the column that passed the filters. For the most common
101  // types, the type of the values stored in the column is automatically
102  // guessed.
103  auto hist = d.Filter(cutb1).Histo1D();
104  std::cout << "Filled h " << hist->GetEntries() << " times, mean: " << hist->GetMean() << std::endl;
105 
106  // ### `Foreach` action
107  // The most generic action of all: an operation is applied to all entries.
108  // In this case we fill a histogram. In some sense this is a violation of a
109  // purely functional paradigm - C++ allows to do that.
110  TH1F h("h", "h", 12, -1, 11);
111  d.Filter([](int b2) { return b2 % 2 == 0; }, {"b2"}).Foreach([&h](double b1) { h.Fill(b1); });
112 
113  std::cout << "Filled h with " << h.GetEntries() << " entries" << std::endl;
114 
115  // ## Express your chain of operations with clarity!
116  // We are discussing an example here but it is not hard to imagine much more
117  // complex pipelines of actions acting on data. Those might require code
118  // which is well organised, for example allowing to conditionally add filters
119  // or again to clearly separate filters and actions without the need of
120  // writing the entire pipeline on one line. This can be easily achieved.
121  // We'll show this re-working the `Count` example:
122  auto cutb1_result = d.Filter(cutb1);
123  auto cutb1b2_result = d.Filter(cutb1b2, {"b2", "b1"});
124  auto cutb1_cutb1b2_result = cutb1_result.Filter(cutb1b2, {"b2", "b1"});
125  // Now we want to count:
126  auto evts_cutb1_result = cutb1_result.Count();
127  auto evts_cutb1b2_result = cutb1b2_result.Count();
128  auto evts_cutb1_cutb1b2_result = cutb1_cutb1b2_result.Count();
129 
130  std::cout << "Events passing cutb1: " << *evts_cutb1_result << std::endl
131  << "Events passing cutb1b2: " << *evts_cutb1b2_result << std::endl
132  << "Events passing both: " << *evts_cutb1_cutb1b2_result << std::endl;
133 
134  // ## Calculating quantities starting from existing columns
135  // Often, operations need to be carried out on quantities calculated starting
136  // from the ones present in the columns. We'll create in this example a third
137  // column the values of which are the sum of the *b1* and *b2* ones, entry by
138  // entry. The way in which the new quantity is defined is via a callable.
139  // It is important to note three aspects at this point:
140  // - The value is created on the fly only if the entry passed the existing
141  // filters.
142  // - The newly created column behaves like the ones present in the file on disk.
143  // - The operation creates a new value, without modifying anything. De facto,
144  // this is like having a general container at disposal able to accommodate
145  // any value of any type.
146  // Let's dive in an example:
147  auto entries_sum = d.Define("sum", [](double b1, int b2) { return b2 + b1; }, {"b1", "b2"})
148  .Filter([](double sum) { return sum > 4.2; }, {"sum"})
149  .Count();
150  std::cout << *entries_sum << std::endl;
151 
152  // Additional columns can be expressed as strings. The content must be C++
153  // code. The names of the variables must be the names of the branches. The code
154  // is just-in-time compiled.
155  auto entries_sum2 = d.Define("sum2", "b1 + b2").Filter("sum2 > 4.2").Count();
156  std::cout << *entries_sum2 << std::endl;
157 
158  // It is possible at any moment to read the entry number and the processing
159  // slot number. The latter may change when implicit multithreading is active.
160  // The special columns which provide the entry number and the slot index are
161  // called "tdfentry_" and "tdfslot_" respectively. Their types are an unsigned
162  // 64 bit integer and an unsigned integer.
163  auto printEntrySlot = [](ULong64_t iEntry, unsigned int slot) {
164  std::cout << "Entry: " << iEntry << " Slot: " << slot << std::endl;
165  };
166  d.Foreach(printEntrySlot, {"tdfentry_", "tdfslot_"});
167 
168  return 0;
169 }
static long int sum(long int i)
Definition: Factory.cxx:2258
1-D histogram with a float per channel (see TH1 documentation)
Definition: TH1.h:567
RVec< T > Filter(const RVec< T > &v, F &&f)
Create a new collection with the elements passing the filter expressed by the predicate.
Definition: RVec.hxx:636
#define h(i)
Definition: RSha256.hxx:106
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees, CSV's and other data formats.
Definition: RDataFrame.hxx:42
#define d(i)
Definition: RSha256.hxx:102
unsigned long long ULong64_t
Definition: RtypesCore.h:70
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition: TClass.cxx:2887