Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
df036_missingBranches.py
Go to the documentation of this file.
1# \file
2# \ingroup tutorial_dataframe
3# \notebook -nodraw
4#
5# This example shows how to process a dataset where entries might be
6# incomplete due to one or more missing branches in one or more of the files
7# in the dataset. It shows usage of the FilterAvailable and DefaultValueFor
8# RDataFrame functionalities to act upon the missing entries.
9#
10# \macro_code
11# \macro_output
12#
13# \date September 2024
14# \author Vincenzo Eduardo Padulano (CERN)
15import os
16import ROOT
17import numpy
18
19
class DatasetContext:
    """Create the three-file dataset used by the tutorial and remove it afterwards.

    Instantiating the class writes three ROOT files (relative paths listed in
    ``filenames``), each holding one TTree with ``nentries`` entries. For the
    entry index ``i`` running from 1 to ``nentries``:

    * file 1 / ``tree_1``: branch ``x`` stores ``i`` and branch ``y`` stores ``2 * i``
    * file 2 / ``tree_2``: only branch ``y``, storing ``3 * i``
    * file 3 / ``tree_3``: only branch ``x``, storing ``4 * i``

    When used as a context manager, the files are deleted on exit.
    """

    filenames = [
        "df036_missingBranches_py_file_1.root",
        "df036_missingBranches_py_file_2.root",
        "df036_missingBranches_py_file_3.root",
    ]
    treenames = ["tree_1", "tree_2", "tree_3"]
    nentries = 5

    def __init__(self):
        """Write the three files, one TTree each, with a different set of branches."""
        # For every file: branch name -> factor applied to the entry index.
        # Dictionary order is the order in which the branches are created.
        layouts = [
            {"x": 1, "y": 2},
            {"y": 3},
            {"x": 4},
        ]
        for filename, treename, multipliers in zip(self.filenames, self.treenames, layouts):
            self._write_tree(filename, treename, multipliers)

    def _write_tree(self, filename, treename, multipliers):
        """Write one file with a TTree whose branch ``b`` stores ``multipliers[b] * i``.

        Args:
            filename: path of the ROOT file to (re)create.
            treename: name (and title) of the TTree written into the file.
            multipliers: mapping from branch name to the integer factor applied
                to the entry index ``i`` (1 to ``nentries``, inclusive).
        """
        with ROOT.TFile(filename, "RECREATE"):
            tree = ROOT.TTree(treename, treename)

            # The "/I" leaf type is a 32-bit signed integer, so the buffer that
            # the branch reads from must be int32 as well. numpy's default
            # ``int`` is 64-bit on most platforms, which only works by accident
            # of byte order for small values.
            buffers = {
                branch: numpy.array([0], dtype=numpy.int32) for branch in multipliers
            }
            for branch, buffer in buffers.items():
                tree.Branch(branch, buffer, branch + "/I")

            for i in range(1, self.nentries + 1):
                for branch, factor in multipliers.items():
                    buffers[branch][0] = factor * i
                tree.Fill()

            tree.Write()

    def __enter__(self):
        """Enable using the class as a context manager."""
        return self

    def __exit__(self, *_):
        """
        Enable using the class as a context manager. At the end of the context,
        remove the files created.

        A file that is already missing is skipped so that the remaining files
        are still removed. Exceptions raised inside the context are not
        suppressed.
        """
        for filename in self.filenames:
            try:
                os.remove(filename)
            except FileNotFoundError:
                # Already gone: nothing to clean up for this one.
                pass
79
80
def df036_missingBranches(dataset: DatasetContext):
    """Show three ways of dealing with branches missing from part of a dataset.

    The files and trees listed in ``dataset.filenames`` and
    ``dataset.treenames`` are chained together and processed with RDataFrame.
    Three tables of up to 15 rows are booked and then printed, each preceded
    by a line describing the example.
    """
    # The input dataset contains three files, with one TTree each. The first
    # has branches (x, y), the second only y, the third only x. While the
    # TChain moves from one tree to the next, a different branch is missing.
    events = ROOT.TChain()
    for path, tree in zip(dataset.filenames, dataset.treenames):
        events.Add(f"{path}?#{tree}")

    frame = ROOT.RDataFrame(events)

    # Value used in place of a missing branch
    fallback = ROOT.std.numeric_limits[int].min()
    max_rows = 15

    # Example 1: provide a default value for all missing branches
    all_defaulted = frame.DefaultValueFor("x", fallback)
    all_defaulted = all_defaulted.DefaultValueFor("y", fallback)
    table_all_defaulted = all_defaulted.Display(columnList=("x", "y"), nRows=max_rows)

    # Example 2: provide a default value for branch y, but skip events where
    # branch x is missing
    x_required = frame.DefaultValueFor("y", fallback)
    x_required = x_required.FilterAvailable("x")
    table_x_required = x_required.Display(columnList=("x", "y"), nRows=max_rows)

    # Example 3: only keep events where branch y is missing and display values
    # for branch x
    y_absent = frame.FilterMissing("y")
    table_y_absent = y_absent.Display(columnList=("x",), nRows=max_rows)

    # All three tables are booked before the first one is printed.
    reports = (
        ("Example 1: provide a default value for all missing branches", table_all_defaulted),
        (
            "Example 2: provide a default value for branch y, but skip events where branch x is missing",
            table_x_required,
        ),
        (
            "Example 3: only keep events where branch y is missing and display values for branch x",
            table_y_absent,
        ),
    )
    for title, table in reports:
        print(title)
        table.Print()
118
119
if __name__ == "__main__":
    # Create the input files, run the tutorial on them, and let the context
    # manager delete the files once the tutorial has finished.
    with DatasetContext() as tutorial_dataset:
        df036_missingBranches(tutorial_dataset)
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, ...