Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
df103_NanoAODHiggsAnalysis.py
Go to the documentation of this file.
1## \file
2## \ingroup tutorial_dataframe
3## \notebook -draw
4## An example of complex analysis with RDataFrame: reconstructing the Higgs boson.
5##
6## This tutorial is a simplified but yet complex example of an analysis reconstructing the Higgs boson decaying to two Z
7## bosons from events with four leptons. The data and simulated events are taken from CERN OpenData representing a
8## subset of the data recorded in 2012 with the CMS detector at the LHC. The tutorials follows the Higgs to four leptons
9## analysis published on CERN Open Data portal ([10.7483/OPENDATA.CMS.JKB8.RR42](http://opendata.cern.ch/record/5500)).
10## The resulting plots show the invariant mass of the selected four lepton systems in different decay modes (four muons,
11## four electrons and two of each kind) and in a combined plot indicating the decay of the Higgs boson with a mass of
12## about 125 GeV.
13##
14## The following steps are performed for each sample with data and simulated events in order to reconstruct the Higgs
15## boson from the selected muons and electrons:
16## 1. Select interesting events with multiple cuts on event properties, e.g., number of leptons, kinematics of the
17## leptons and quality of the tracks.
18## 2. Reconstruct two Z bosons of which only one on the mass shell from the selected events and apply additional cuts on
19## the reconstructed objects.
20## 3. Reconstruct the Higgs boson from the remaining Z boson candidates and calculate its invariant mass.
21##
22## Another aim of this version of the tutorial is to show a way to blend C++ and Python code. All the functions that
23## make computations on data to define new columns or filter existing ones in a precise way, better suited to be written
24## in C++, have been moved to a header that is then declared to the ROOT C++ interpreter. The functions that instead
25## create nodes of the computational graph (e.g. Filter, Define) remain inside the main Python script.
26##
27## The tutorial has the fast mode enabled by default, which reads the data from already skimmed
28## datasets with a total size of only 51MB. If the fast mode is disabled, the tutorial runs over
29## the full dataset with a size of 12GB.
30##
31## \macro_image
32## \macro_code
33## \macro_output
34##
35## \date July 2019
36## \authors Stefan Wunsch (KIT, CERN), Vincenzo Eduardo Padulano (UniMiB, CERN)
37
38import ROOT
39import os
40
41# Enable multi-threading
42ROOT.ROOT.EnableImplicitMT()
43
44# Include necessary header
45higgs_header_path = os.path.join(os.sep, str(ROOT.gROOT.GetTutorialDir()) + os.sep, "analysis" + os.sep,
46 "dataframe" + os.sep, "df103_NanoAODHiggsAnalysis_python.h")
47
48ROOT.gInterpreter.Declare('#include "{}"'.format(higgs_header_path))
49
50
51# Python functions
52def reco_higgs_to_2el2mu(df):
53 """Reconstruct Higgs from two electrons and two muons"""
54 # Filter interesting events
55 df_base = selection_2el2mu(df)
56 # Compute masses of Z systems
57 df_z_mass = df_base.Define("Z_mass", "compute_z_masses_2el2mu(Electron_pt, Electron_eta, Electron_phi,"
58 " Electron_mass, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
59 # Cut on mass of Z candidates
60 df_z_cut = filter_z_candidates(df_z_mass)
61 # Reconstruct H mass
62 df_h_mass = df_z_cut.Define("H_mass", "compute_higgs_mass_2el2mu(Electron_pt, Electron_eta, Electron_phi,"
63 " Electron_mass, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
64
65 return df_h_mass
66
67
68def selection_2el2mu(df):
69 """Select interesting events with two electrons and two muons"""
70 df_ge2el2mu = df.Filter("nElectron>=2 && nMuon>=2", "At least two electrons and two muons")
71 df_eta = df_ge2el2mu.Filter("All(abs(Electron_eta)<2.5) && All(abs(Muon_eta)<2.4)", "Eta cuts")
72
73 df_pt = df_eta.Filter("pt_cuts(Muon_pt, Electron_pt)", "Pt cuts")
74
75 df_dr = df_pt.Filter("dr_cuts(Muon_eta, Muon_phi, Electron_eta, Electron_phi)", "Dr cuts")
76
77 df_iso = df_dr.Filter("All(abs(Electron_pfRelIso03_all)<0.40) && All(abs(Muon_pfRelIso04_all)<0.40)",
78 "Require good isolation")
79 df_el_ip3d = df_iso.Define("Electron_ip3d_el", "sqrt(Electron_dxy*Electron_dxy + Electron_dz*Electron_dz)")
80 df_el_sip3d = df_el_ip3d.Define("Electron_sip3d_el",
81 "Electron_ip3d_el/sqrt(Electron_dxyErr*Electron_dxyErr + "
82 "Electron_dzErr*Electron_dzErr)")
83 df_el_track = df_el_sip3d.Filter("All(Electron_sip3d_el<4) && All(abs(Electron_dxy)<0.5) &&"
84 " All(abs(Electron_dz)<1.0)",
85 "Electron track close to primary vertex with small uncertainty")
86 df_mu_ip3d = df_el_track.Define("Muon_ip3d_mu", "sqrt(Muon_dxy*Muon_dxy + Muon_dz*Muon_dz)")
87
88 df_mu_sip3d = df_mu_ip3d.Define("Muon_sip3d_mu",
89 "Muon_ip3d_mu/sqrt(Muon_dxyErr*Muon_dxyErr + Muon_dzErr*Muon_dzErr)")
90 df_mu_track = df_mu_sip3d.Filter("All(Muon_sip3d_mu<4) && All(abs(Muon_dxy)<0.5) && All(abs(Muon_dz)<1.0)",
91 "Muon track close to primary vertex with small uncertainty")
92 df_2p2n = df_mu_track.Filter("Sum(Electron_charge)==0 && Sum(Muon_charge)==0",
93 "Two opposite charged electron and muon pairs")
94
95 return df_2p2n
96
97
98def reco_higgs_to_4mu(df):
99 """Reconstruct Higgs from four muons"""
100 # Filter interesting events
101 df_base = selection_4mu(df)
102
103 # Reconstruct Z systems
104 df_z_idx = df_base.Define("Z_idx", "reco_zz_to_4l(Muon_pt, Muon_eta, Muon_phi, Muon_mass, Muon_charge)")
105
106 # Cut on distance between muons building Z systems
107 df_z_dr = df_z_idx.Filter("filter_z_dr(Z_idx, Muon_eta, Muon_phi)", "Delta R separation of muons building Z system")
108
109 # Compute masses of Z systems
110 df_z_mass = df_z_dr.Define("Z_mass", "compute_z_masses_4l(Z_idx, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
111
112 # Cut on mass of Z candidates
113 df_z_cut = filter_z_candidates(df_z_mass)
114
115 # Reconstruct H mass
116 df_h_mass = df_z_cut.Define("H_mass", "compute_higgs_mass_4l(Z_idx, Muon_pt, Muon_eta, Muon_phi, Muon_mass)")
117
118 return df_h_mass
119
120
121def selection_4mu(df):
122 """Select interesting events with four muons"""
123 df_ge4m = df.Filter("nMuon>=4", "At least four muons")
124
125 df_iso = df_ge4m.Filter("All(abs(Muon_pfRelIso04_all)<0.40)", "Require good isolation")
126 df_kin = df_iso.Filter("All(Muon_pt>5) && All(abs(Muon_eta)<2.4)", "Good muon kinematics")
127 df_ip3d = df_kin.Define("Muon_ip3d", "sqrt(Muon_dxy*Muon_dxy + Muon_dz*Muon_dz)")
128 df_sip3d = df_ip3d.Define("Muon_sip3d", "Muon_ip3d/sqrt(Muon_dxyErr*Muon_dxyErr + Muon_dzErr*Muon_dzErr)")
129 df_pv = df_sip3d.Filter("All(Muon_sip3d<4) && All(abs(Muon_dxy)<0.5) && All(abs(Muon_dz)<1.0)",
130 "Track close to primary vertex with small uncertainty")
131 df_2p2n = df_pv.Filter("nMuon==4 && Sum(Muon_charge==1)==2 && Sum(Muon_charge==-1)==2",
132 "Two positive and two negative muons")
133
134 return df_2p2n
135
136
137def filter_z_candidates(df):
138 """Apply selection on reconstructed Z candidates"""
139 df_z1_cut = df.Filter("Z_mass[0] > 40 && Z_mass[0] < 120", "Mass of first Z candidate in [40, 120]")
140 df_z2_cut = df_z1_cut.Filter("Z_mass[1] > 12 && Z_mass[1] < 120", "Mass of second Z candidate in [12, 120]")
141
142 return df_z2_cut
143
144
145def reco_higgs_to_4el(df):
146 """Reconstruct Higgs from four electrons"""
147 # Filter interesting events
148 df_base = selection_4el(df)
149
150 # Reconstruct Z systems
151 df_z_idx = df_base.Define("Z_idx",
152 "reco_zz_to_4l(Electron_pt, Electron_eta, Electron_phi, Electron_mass, Electron_charge)")
153
154 # Cut on distance between Electrons building Z systems
155 df_z_dr = df_z_idx.Filter("filter_z_dr(Z_idx, Electron_eta, Electron_phi)",
156 "Delta R separation of Electrons building Z system")
157
158 # Compute masses of Z systems
159 df_z_mass = df_z_dr.Define("Z_mass",
160 "compute_z_masses_4l(Z_idx, Electron_pt, Electron_eta, Electron_phi, Electron_mass)")
161
162 # Cut on mass of Z candidates
163 df_z_cut = filter_z_candidates(df_z_mass)
164
165 # Reconstruct H mass
166 df_h_mass = df_z_cut.Define("H_mass",
167 "compute_higgs_mass_4l(Z_idx, Electron_pt, Electron_eta, Electron_phi, Electron_mass)")
168
169 return df_h_mass
170
171
172def selection_4el(df):
173 """Select interesting events with four electrons"""
174 df_ge4el = df.Filter("nElectron>=4", "At least our electrons")
175 df_iso = df_ge4el.Filter("All(abs(Electron_pfRelIso03_all)<0.40)", "Require good isolation")
176 df_kin = df_iso.Filter("All(Electron_pt>7) && All(abs(Electron_eta)<2.5)", "Good Electron kinematics")
177 df_ip3d = df_kin.Define("Electron_ip3d", "sqrt(Electron_dxy*Electron_dxy + Electron_dz*Electron_dz)")
178 df_sip3d = df_ip3d.Define("Electron_sip3d",
179 "Electron_ip3d/sqrt(Electron_dxyErr*Electron_dxyErr + Electron_dzErr*Electron_dzErr)")
180 df_pv = df_sip3d.Filter("All(Electron_sip3d<4) && All(abs(Electron_dxy)<0.5) && All(abs(Electron_dz)<1.0)",
181 "Track close to primary vertex with small uncertainty")
182 df_2p2n = df_pv.Filter("nElectron==4 && Sum(Electron_charge==1)==2 && Sum(Electron_charge==-1)==2",
183 "Two positive and two negative electrons")
184
185 return df_2p2n
186
187
188def plot(sig, bkg, data, x_label, filename):
189 """
190 Plot invariant mass for signal and background processes from simulated
191 events overlay the measured data.
192 """
193 # Canvas and general style options
194 ROOT.gStyle.SetTextFont(42)
195 d = ROOT.TCanvas("", "", 800, 700)
196 d.SetLeftMargin(0.15)
197
198 # Get signal and background histograms and stack them to show Higgs signal
199 # on top of the background process
200 h_bkg = bkg.Clone()
201 h_cmb = sig.Clone()
202
203 h_cmb.Add(h_bkg)
204 h_cmb.SetTitle("")
205 h_cmb.GetXaxis().SetTitle(x_label)
206 h_cmb.GetXaxis().SetTitleSize(0.04)
207 h_cmb.GetYaxis().SetTitle("N_{Events}")
208 h_cmb.GetYaxis().SetTitleSize(0.04)
209 h_cmb.SetLineColor(ROOT.kRed)
210 h_cmb.SetLineWidth(2)
211 h_cmb.SetMaximum(18)
212 h_cmb.SetStats(False)
213
214 h_bkg.SetLineWidth(2)
215 h_bkg.SetFillStyle(1001)
216 h_bkg.SetLineColor(ROOT.kBlack)
217 h_bkg.SetFillColor(ROOT.kAzure - 9)
218
219 # Get histogram of data points
220 h_data = data.Clone()
221 h_data.SetLineWidth(1)
222 h_data.SetMarkerStyle(20)
223 h_data.SetMarkerSize(1.0)
224 h_data.SetMarkerColor(ROOT.kBlack)
225 h_data.SetLineColor(ROOT.kBlack)
226
227 # Draw histograms
228 h_cmb.Draw("HIST")
229 h_bkg.Draw("HIST SAME")
230 h_data.Draw("PE1 SAME")
231
232 # Add legend
233 legend = ROOT.TLegend(0.62, 0.70, 0.82, 0.88)
234 legend.SetFillColor(0)
235 legend.SetBorderSize(0)
236 legend.SetTextSize(0.03)
237 legend.AddEntry(h_data, "Data", "pe")
238 legend.AddEntry(h_bkg, "ZZ", "f")
239 legend.AddEntry(h_cmb, "m_{H} = 125 GeV", "f")
240 legend.Draw()
241
242 # Add header
243 cms_label = ROOT.TLatex()
244 cms_label.SetTextSize(0.04)
245 cms_label.DrawLatexNDC(0.16, 0.92, "#bf{CMS Open Data}")
246 header = ROOT.TLatex()
247 header.SetTextSize(0.03)
248 header.DrawLatexNDC(0.63, 0.92, "#sqrt{s} = 8 TeV, L_{int} = 11.6 fb^{-1}")
249
250 # Save plot
251 d.SaveAs(filename)
252
253 # Make sure canvas and objects remains existing after the macro execution
254 ROOT.SetOwnership(d, False)
255 ROOT.SetOwnership(h_cmb, False)
256 ROOT.SetOwnership(h_data, False)
257 ROOT.SetOwnership(h_bkg, False)
258 ROOT.SetOwnership(legend, False)
259
260
262 # In fast mode, take samples from */cms_opendata_2012_nanoaod_skimmed/*, which has
263 # the preselections from the selection_* functions already applied.
264 path = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/"
265 run_fast = True # Run on skimmed data, set to False to run on full dataset
266 if run_fast: path = "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod_skimmed/"
267
268 # Create dataframes for signal, background and data samples
269
270 # Signal: Higgs -> 4 leptons
271 df_sig_4l = ROOT.RDataFrame("Events", path + "SMHiggsToZZTo4L.root")
272
273 # Background: ZZ -> 4 leptons
274 # Note that additional background processes from the original paper
275 # with minor contribution were left out for this
276 # tutorial.
277 df_bkg_4mu = ROOT.RDataFrame("Events", path + "ZZTo4mu.root")
278 df_bkg_4el = ROOT.RDataFrame("Events", path + "ZZTo4e.root")
279 df_bkg_2el2mu = ROOT.RDataFrame("Events", path + "ZZTo2e2mu.root")
280
281 # CMS data taken in 2012 (11.6 fb^-1 integrated luminosity)
282 df_data_doublemu = ROOT.RDataFrame("Events", (path + f for f in ["Run2012B_DoubleMuParked.root", "Run2012C_DoubleMuParked.root"]))
283 df_data_doubleel = ROOT.RDataFrame("Events", (path + f for f in ["Run2012B_DoubleElectron.root", "Run2012C_DoubleElectron.root"]))
284
285 # Number of bins for all histograms
286 nbins = 36
287
288 # Weights
289 luminosity = 11580.0 # Integrated luminosity of the data samples
290
291 xsec_ZZTo4mu = 0.077 # ZZ->4mu: Standard Model cross-section
292 nevt_ZZTo4mu = 1499064.0 # ZZ->4mu: Number of simulated events
293
294 xsec_ZZTo4el = 0.077 # ZZ->4el: Standard Model cross-section
295 nevt_ZZTo4el = 1499093.0 # ZZ->4el: Number of simulated events
296
297 xsec_ZZTo2el2mu = 0.18 # ZZ->2el2mu: Standard Model cross-section
298 nevt_ZZTo2el2mu = 1497445.0 # ZZ->2el2mu: Number of simulated events
299
300 xsec_SMHiggsToZZTo4L = 0.0065 # H->4l: Standard Model cross-section
301 nevt_SMHiggsToZZTo4L = 299973.0 # H->4l: Number of simulated events
302 scale_ZZTo4l = 1.386 # ZZ->4l: Scale factor for ZZ to four leptons
303
304 weight_sig_4mu = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
305 weight_bkg_4mu = luminosity * xsec_ZZTo4mu * scale_ZZTo4l / nevt_ZZTo4mu
306
307 weight_sig_4el = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
308 weight_bkg_4el = luminosity * xsec_ZZTo4el * scale_ZZTo4l / nevt_ZZTo4el
309
310 weight_sig_2el2mu = luminosity * xsec_SMHiggsToZZTo4L / nevt_SMHiggsToZZTo4L
311 weight_bkg_2el2mu = luminosity * xsec_ZZTo2el2mu * scale_ZZTo4l / nevt_ZZTo2el2mu
312
313 # Reconstruct Higgs to 4 muons
314 df_sig_4mu_reco = reco_higgs_to_4mu(df_sig_4l)
315
316 df_h_sig_4mu = df_sig_4mu_reco.Define("weight", "{}".format(weight_sig_4mu))\
317 .Histo1D(("h_sig_4mu", "", nbins, 70, 180), "H_mass", "weight")
318
319 df_bkg_4mu_reco = reco_higgs_to_4mu(df_bkg_4mu)
320
321 df_h_bkg_4mu = df_bkg_4mu_reco.Define("weight", "{}".format(weight_bkg_4mu))\
322 .Histo1D(("h_bkg_4mu", "", nbins, 70, 180), "H_mass", "weight")
323
324 df_data_4mu_reco = reco_higgs_to_4mu(df_data_doublemu)
325
326 df_h_data_4mu = df_data_4mu_reco.Define("weight", "1.0")\
327 .Histo1D(("h_data_4mu", "", nbins, 70, 180), "H_mass", "weight")
328
329 # Reconstruct Higgs to 4 electrons
330 df_sig_4el_reco = reco_higgs_to_4el(df_sig_4l)
331
332 df_h_sig_4el = df_sig_4el_reco.Define("weight", "{}".format(weight_sig_4el))\
333 .Histo1D(("h_sig_4el", "", nbins, 70, 180), "H_mass", "weight")
334
335 df_bkg_4el_reco = reco_higgs_to_4el(df_bkg_4el)
336
337 df_h_bkg_4el = df_bkg_4el_reco.Define("weight", "{}".format(weight_bkg_4el))\
338 .Histo1D(("h_bkg_4el", "", nbins, 70, 180), "H_mass", "weight")
339
340 df_data_4el_reco = reco_higgs_to_4el(df_data_doubleel)
341
342 df_h_data_4el = df_data_4el_reco.Define("weight", "1.0")\
343 .Histo1D(("h_data_4el", "", nbins, 70, 180), "H_mass", "weight")
344
345 # Reconstruct Higgs to 2 electrons and 2 muons
346 df_sig_2el2mu_reco = reco_higgs_to_2el2mu(df_sig_4l)
347
348 df_h_sig_2el2mu = df_sig_2el2mu_reco.Define("weight", "{}".format(weight_sig_2el2mu))\
349 .Histo1D(("h_sig_2el2mu", "", nbins, 70, 180), "H_mass", "weight")
350
351 df_bkg_2el2mu_reco = reco_higgs_to_2el2mu(df_bkg_2el2mu)
352
353 df_h_bkg_2el2mu = df_bkg_2el2mu_reco.Define("weight", "{}".format(weight_bkg_2el2mu))\
354 .Histo1D(("h_bkg_2el2mu", "", nbins, 70, 180), "H_mass", "weight")
355
356 df_data_2el2mu_reco = reco_higgs_to_2el2mu(df_data_doublemu)
357
358 df_h_data_2el2mu = df_data_2el2mu_reco.Define("weight", "1.0")\
359 .Histo1D(("h_data_2el2mu_doublemu", "", nbins, 70, 180), "H_mass", "weight")
360
361 # RunGraphs allows to run the event loops of the separate RDataFrame graphs
362 # concurrently. This results in an improved usage of the available resources
363 # if each separate RDataFrame can not utilize all available resources, e.g.,
364 # because not enough data is available.
365 ROOT.RDF.RunGraphs([df_h_sig_4mu, df_h_bkg_4mu, df_h_data_4mu,
366 df_h_sig_4el, df_h_bkg_4el, df_h_data_4el,
367 df_h_sig_2el2mu, df_h_bkg_2el2mu, df_h_data_2el2mu])
368
369 # Get histograms (does not rerun the event loop)
370 signal_4mu = df_h_sig_4mu.GetValue()
371 background_4mu = df_h_bkg_4mu.GetValue()
372 data_4mu = df_h_data_4mu.GetValue()
373
374 signal_4el = df_h_sig_4el.GetValue()
375 background_4el = df_h_bkg_4el.GetValue()
376 data_4el = df_h_data_4el.GetValue()
377
378 signal_2el2mu = df_h_sig_2el2mu.GetValue()
379 background_2el2mu = df_h_bkg_2el2mu.GetValue()
380 data_2el2mu = df_h_data_2el2mu.GetValue()
381
382 # Make plots
383 plot(signal_4mu, background_4mu, data_4mu, "m_{4#mu} (GeV)", "higgs_4mu.pdf")
384 plot(signal_4el, background_4el, data_4el, "m_{4e} (GeV)", "higgs_4el.pdf")
385 plot(signal_2el2mu, background_2el2mu, data_2el2mu, "m_{2e2#mu} (GeV)", "higgs_2el2mu.pdf")
386
387 # Combined plots
388 # If this was done before plotting the others, calling the `Add` function
389 # on the `signal_4mu` histogram would modify the underlying `TH1D` object.
390 # Thus, the histogram with the 4 muons reconstruction would be lost,
391 # instead resulting in the same plot as the aggregated histograms.
392 h_sig_4l = signal_4mu
393 h_sig_4l.Add(signal_4el)
394 h_sig_4l.Add(signal_2el2mu)
395
396 h_bkg_4l = background_4mu
397 h_bkg_4l.Add(background_4el)
398 h_bkg_4l.Add(background_2el2mu)
399
400 h_data_4l = data_4mu
401 h_data_4l.Add(data_4el)
402 h_data_4l.Add(data_2el2mu)
403
404 # Plot aggregated histograms
405 plot(h_sig_4l, h_bkg_4l, h_data_4l, "m_{4l} (GeV)", "higgs_4l.pdf")
406
407
408if __name__ == "__main__":
winID h TVirtualViewer3D TVirtualGLPainter char TVirtualGLPainter plot
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t format
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
unsigned int RunGraphs(std::vector< RResultHandle > handles)
Trigger the event loop of multiple RDataFrames concurrently.