Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
rf409_NumPyPandasToRooFit.py
Go to the documentation of this file.
1## \file
2## \ingroup tutorial_roofit
3## \notebook
4## Convert between NumPy arrays or Pandas DataFrames and RooDataSets.
5##
6## This tutorials first how to export a RooDataSet to NumPy arrays or a Pandas
7## DataFrame, and then it shows you how to create a RooDataSet from a Pandas
8## DataFrame.
9##
10## \macro_code
11## \macro_output
12##
13## \date November 2021
14## \author Jonas Rembser
15
16import ROOT
17
18import numpy as np
19
20
21# The number of events that we use for the datasets created in this tutorial.
22n_events = 10000
23
24
25# Creating a RooDataSet and exporting it to the Python ecosystem
26# --------------------------------------------------------------
27
28# Define the observable.
29x = ROOT.RooRealVar("x", "x", -10, 10)
30
31# Define a Gaussian model with its parameters.
32mean = ROOT.RooRealVar("mean", "mean of gaussian", 1, -10, 10)
33sigma = ROOT.RooRealVar("sigma", "width of gaussian", 1, 0.1, 10)
34gauss = ROOT.RooGaussian("gauss", "gaussian PDF", x, mean, sigma)
35
36# Create a RooDataSet.
37data = gauss.generate(ROOT.RooArgSet(x), 10000)
38
39# Use RooDataSet.to_numpy() to export dataset to a dictionary of NumPy arrays.
40# Real values will be of type `double`, categorical values of type `int`.
41arrays = data.to_numpy()
42
43# We can verify that the mean and standard deviation matches our model specification.
44print("Mean of numpy array:", np.mean(arrays["x"]))
45print("Standard deviation of numpy array:", np.std(arrays["x"]))
46
47# It is also possible to create a Pandas DataFrame directly from the numpy arrays:
48df = data.to_pandas()
49
50# Now you can use the DataFrame e.g. for plotting. You can even combine this
51# with the RooAbsReal.bins PyROOT function, which returns the binning from
52# RooFit as a numpy array!
53try:
54 import matplotlib.pyplot as plt
55
56 df.hist(column="x", bins=x.bins())
57except Exception:
58 print(
59 'Skipping `df.hist(column="x", bins=x.bins())` because matplotlib could not be imported or was not able to display the plot.'
60 )
61
62del data
63del arrays
64del df
65
66
67# Creating a dataset with NumPy and importing it to a RooDataSet
68# --------------------------------------------------------------
69
70# Now we create some Gaussian toy data with numpy, this time with a different
71# mean.
72x_arr = np.random.normal(-1.0, 1.0, (n_events,))
73
74# Import the data to a RooDataSet, passing a dictionary of arrays and the
75# corresponding RooRealVars just like you would pass to the RooDataSet
76# constructor.
77data = ROOT.RooDataSet.from_numpy({"x": x_arr}, [x])
78
79# Let's fit the Gaussian to the data. The mean is updated accordingly.
80fit_result = gauss.fitTo(data, PrintLevel=-1, Save=True)
81fit_result.Print()
82
83# We can now plot the model and the dataset with RooFit.
84xframe = x.frame(Title="Gaussian pdf")
85data.plotOn(xframe)
86gauss.plotOn(xframe)
87
88# Draw RooFit plot on a canvas.
89c = ROOT.TCanvas("rf409_NumPyPandasToRooFit", "rf409_NumPyPandasToRooFit", 800, 400)
90xframe.Draw()
91c.SaveAs("rf409_NumPyPandasToRooFit.png")
92
93
94# Exporting a RooDataHist to NumPy arrays for histogram counts and bin edges
95# --------------------------------------------------------------------------
96
97
98def print_histogram_output(histogram_output):
99 counts, bin_edges = histogram_output
100 print(np.array(counts, dtype=int))
101 print(bin_edges[0])
102
103
104# Create a binned clone of the dataset to show RooDataHist to NumPy export.
105datahist = data.binnedClone()
106
107# You can also export a RooDataHist to numpy arrays with
108# RooDataHist.to_numpy(). As output, you will get a multidimensional array with
109# the histogram counts and a list of arrays with bin edges. This is comparable
110# to the output of numpy.histogram (or numpy.histogramdd for the
111# multidimensional case).
112counts, bin_edges = datahist.to_numpy()
113
114print("Counts and bin edges from RooDataHist.to_numpy:")
115print_histogram_output((counts, bin_edges))
116
117# Let's compare the output to the counts and bin edges we get with
118# numpy.histogramdd when we pass it the original samples:
119print("Counts and bin edges from np.histogram:")
120print_histogram_output(np.histogramdd([x_arr], bins=[x.bins()]))
121
122# The array values should be the same!
123
124
125# Importing a RooDataHist from NumPy arrays with histogram counts and bin edges
126# -----------------------------------------------------------------------------
127
128# There is also a `RooDataHist.from_numpy` function, again with an interface
129# inspired by `numpy.histogramdd`. You need to pass at least the histogram
130# counts and the list of variables. The binning is optional: the default
131# binning of the RooRealVars is used if not explicitly specified.
132datahist_new_1 = ROOT.RooDataHist.from_numpy(counts, [x])
133
134print("RooDataHist imported with default binning and exported back to numpy:")
135print_histogram_output(datahist_new_1.to_numpy())
136
137
138# It's also possible to pass custom bin edges to `RooDataHist.from_numpy`, just
139# like you pass them to `numpy.histogramdd` when you get the counts to fill the
140# RooDataHist with:
141bins = [np.linspace(-10, 10, 21)]
142counts, _ = np.histogramdd([x_arr], bins=bins)
143datahist_new_2 = ROOT.RooDataHist.from_numpy(counts, [x], bins=bins)
144
145print("RooDataHist imported with linspace binning and exported back to numpy:")
146print_histogram_output(datahist_new_2.to_numpy())
147
148# Alternatively, you can specify only the number of bins and the range if your
149# binning is uniform. This is preferred over passing the full list of bin
150# edges, because RooFit will know that the binning is uniform and do some
151# optimizations.
152bins = [20]
153ranges = [(-10, 10)]
154counts, _ = np.histogramdd([x_arr], bins=bins, range=ranges)
155datahist_new_3 = ROOT.RooDataHist.from_numpy(counts, [x], bins=bins, ranges=ranges)
156
157print("RooDataHist imported with uniform binning and exported back to numpy:")
158print_histogram_output(datahist_new_3.to_numpy())