Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
ntpl011_global_temperatures.C
Go to the documentation of this file.
1/// \file
2/// \ingroup tutorial_ntuple
3/// \notebook
4/// This ROOT 7 example demonstrates how to use RNTuple in combination with ROOT 6 features like RDataFrame and
5/// visualizations. It ingests climate data and creates a model with fields like AverageTemperature. Then it uses
6/// RDataFrame to process and filter the climate data for average temperature per city by season. Then it does the same
7/// for average temperature per city for the years between 1993-2002, and 2003-2013. Finally, the tutorial visualizes
8/// this processed data through histograms.
9///
10/// TODO(jblomer): re-enable once issues are fixed (\macro_image (rcanvas_js))
11/// \macro_code
12///
13/// \date 2021-02-26
14/// \author John Yoon
15
16// NOTE: The RNTuple classes are experimental at this point.
17// Functionality and interface are still subject to changes.
18// During ROOT setup, configure the following flags:
19// `-DCMAKE_CXX_STANDARD=17 -Droot7=ON -Dwebgui=ON`
20
21#include <ROOT/RDataFrame.hxx>
22#include <ROOT/RNTupleDS.hxx>
23#include <ROOT/RNTupleModel.hxx>
25#include <ROOT/RCanvas.hxx>
26#include <ROOT/RColor.hxx>
29#include <ROOT/RRawFile.hxx>
30#include <TH1D.h>
31#include <TLegend.h>
32#include <TSystem.h>
33
34#include <algorithm>
35#include <cassert>
36#include <cstdio>
37#include <fstream>
38#include <iostream>
39#include <memory>
40#include <string>
41#include <sstream>
42#include <stdexcept>
43#include <utility>
44#include <chrono>
45
46using Clock = std::chrono::high_resolution_clock;
47using RRawFile = ROOT::Internal::RRawFile;
48using namespace ROOT::Experimental;
49
50// Helper function to handle histogram pointer ownership.
51std::shared_ptr<TH1D> GetDrawableHist(ROOT::RDF::RResultPtr<TH1D> &h)
52{
53 auto result = std::shared_ptr<TH1D>(static_cast<TH1D *>(h.GetPtr()->Clone()));
54 result->SetDirectory(nullptr);
55 return result;
56}
57
58// Climate data is downloadable at the following URL:
59// https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data
60// The original data set is from http://berkeleyearth.org/archive/data/
61// License CC BY-NC-SA 4.0
62constexpr const char *kRawDataUrl = "http://root.cern./files/tutorials/GlobalLandTemperaturesByCity.csv";
63constexpr const char *kNTupleFileName = "GlobalLandTemperaturesByCity.root";
64
65void Ingest()
66{
67 int nRecords = 0;
68 int nSkipped = 0;
69 std::cout << "Converting " << kRawDataUrl << " to " << kNTupleFileName << std::endl;
70
71 auto t1 = Clock::now();
72
73 // Create a unique pointer to an empty data model.
74 auto model = RNTupleModel::Create();
75 // To define the data model, create fields with a given C++ type and name. Fields are roughly TTree branches.
76 // MakeField returns a shared pointer to a memory location to fill the ntuple with data.
77 auto fieldYear = model->MakeField<std::uint32_t>("Year");
78 auto fieldMonth = model->MakeField<std::uint32_t>("Month");
79 auto fieldDay = model->MakeField<std::uint32_t>("Day");
80 auto fieldAvgTemp = model->MakeField<float>("AverageTemperature");
81 auto fieldTempUncrty = model->MakeField<float>("AverageTemperatureUncertainty");
82 auto fieldCity = model->MakeField<std::string>("City");
83 auto fieldCountry = model->MakeField<std::string>("Country");
84 auto fieldLat = model->MakeField<float>("Latitude");
85 auto fieldLong = model->MakeField<float>("Longitude");
86
87 // Hand-over the data model to a newly created ntuple of name "globalTempData", stored in kNTupleFileName.
88 // In return, get a unique pointer to a fillable ntuple (first compress the file).
89 auto ntuple = RNTupleWriter::Recreate(std::move(model), "GlobalTempData", kNTupleFileName);
90
91 // Download data in 4MB blocks
92 RRawFile::ROptions options;
93 options.fBlockSize = 4'000'000;
94
95 auto file = RRawFile::Create(kRawDataUrl, options);
96 std::string record;
97 constexpr int kMaxCharsPerLine = 128;
98 while (file->Readln(record)) {
99 if (record.length() >= kMaxCharsPerLine)
100 throw std::runtime_error("record too long: " + record);
101
102 // Parse lines of the form:
103 // 1743-11-01,6.068,1.7369999999999999,Ã…rhus,Denmark,57.05N,10.33E
104 // and skip records with empty fields.
105 std::replace(record.begin(), record.end(), ',', ' ');
106 char country[kMaxCharsPerLine];
107 char city[kMaxCharsPerLine];
108 int nFields =
109 sscanf(record.c_str(), "%u-%u-%u %f %f %s %s %fN %fE", fieldYear.get(), fieldMonth.get(), fieldDay.get(),
110 fieldAvgTemp.get(), fieldTempUncrty.get(), country, city, fieldLat.get(), fieldLong.get());
111 if (nFields != 9) {
112 nSkipped++;
113 continue;
114 }
115 *fieldCountry = country;
116 *fieldCity = city;
117
118 ntuple->Fill();
119
120 if (++nRecords % 1000000 == 0)
121 std::cout << " ... converted " << nRecords << " records" << std::endl;
122 }
123
124 // Display the total time to process the data.
125 std::cout << nSkipped << " records skipped" << std::endl;
126 std::cout << nRecords << " records processed" << std::endl;
127
128 auto t2 = Clock::now();
129 std::cout << std::endl
130 << "Processing Time: " << std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count() << " seconds\n"
131 << std::endl;
132}
133
134// Every data result that we want to get is declared first, and it is only upon their declaration that
135// they are actually used. This stems from motivations relating to efficiency and optimization.
136void Analyze()
137{
138 // Create a RDataframe by wrapping around NTuple.
139 ROOT::RDataFrame df("GlobalTempData", kNTupleFileName);
140 df.Display()->Print();
141
142 // Declare the minimum and maximum temperature from the dataset.
143 auto min_value = df.Min("AverageTemperature");
144 auto max_value = df.Max("AverageTemperature");
145
146 // Functions to filter by each season from date formatted "1944-12-01."
147 auto fnWinter = [](int month) { return month == 12 || month == 1 || month == 2; };
148 auto fnSpring = [](int month) { return month == 3 || month == 4 || month == 5; };
149 auto fnSummer = [](int month) { return month == 6 || month == 7 || month == 8; };
150 auto fnFall = [](int month) { return month == 9 || month == 10 || month == 11; };
151
152 // Create a RDataFrame per season.
153 auto dfWinter = df.Filter(fnWinter, {"Month"});
154 auto dfSpring = df.Filter(fnSpring, {"Month"});
155 auto dfSummer = df.Filter(fnSummer, {"Month"});
156 auto dfFall = df.Filter(fnFall, {"Month"});
157
158 // Get the count for each season.
159 auto winterCount = dfWinter.Count();
160 auto springCount = dfSpring.Count();
161 auto summerCount = dfSummer.Count();
162 auto fallCount = dfFall.Count();
163
164 // Functions to filter for the time period between 2003-2013, and 1993-2002.
165 auto fn1993_to_2002 = [](int year) { return year >= 1993 && year <= 2002; };
166 auto fn2003_to_2013 = [](int year) { return year >= 2003 && year <= 2013; };
167
168 // Create a RDataFrame for decades 1993_to_2002 & 2003_to_2013.
169 auto df1993_to_2002 = df.Filter(fn1993_to_2002, {"Year"});
170 auto df2003_to_2013 = df.Filter(fn2003_to_2013, {"Year"});
171
172 // Get the count for each decade.
173 auto decade_1993_to_2002_Count = *df1993_to_2002.Count();
174 auto decade_2003_to_2013_Count = *df2003_to_2013.Count();
175
176 // Configure histograms for each season.
177 auto fallHistResultPtr =
178 dfFall.Histo1D({"Fall Average Temp", "Average Temperature by Season", 100, -40, 40}, "AverageTemperature");
179 auto winterHistResultPtr =
180 dfWinter.Histo1D({"Winter Average Temp", "Average Temperature by Season", 100, -40, 40}, "AverageTemperature");
181 auto springHistResultPtr =
182 dfSpring.Histo1D({"Spring Average Temp", "Average Temperature by Season", 100, -40, 40}, "AverageTemperature");
183 auto summerHistResultPtr =
184 dfSummer.Histo1D({"Summer Average Temp", "Average Temperature by Season", 100, -40, 40}, "AverageTemperature");
185
186 // Configure histograms for each decade.
187 auto hist_1993_to_2002_ResultPtr = df1993_to_2002.Histo1D(
188 {"1993_to_2002 Average Temp", "Average Temperature: 1993_to_2002 vs. 2003_to_2013", 100, -40, 40},
189 "AverageTemperature");
190 auto hist_2003_to_2013_ResultPtr = df2003_to_2013.Histo1D(
191 {"2003_to_2013 Average Temp", "Average Temperature: 1993_to_2002 vs. 2003_to_2013", 100, -40, 40},
192 "AverageTemperature");
193
194 //____________________________________________________________________________________
195
196 // Display the minimum and maximum temperature values.
197 std::cout << std::endl << "The Minimum temperature is: " << *min_value << std::endl;
198 std::cout << "The Maximum temperature is: " << *max_value << std::endl;
199
200 // Display the count for each season.
201 std::cout << std::endl << "The count for Winter: " << *winterCount << std::endl;
202 std::cout << "The count for Spring: " << *springCount << std::endl;
203 std::cout << "The count for Summer: " << *summerCount << std::endl;
204 std::cout << "The count for Fall: " << *fallCount << std::endl;
205
206 // Display the count for each decade.
207 std::cout << std::endl << "The count for 1993_to_2002: " << decade_1993_to_2002_Count << std::endl;
208 std::cout << "The count for 2003_to_2013: " << decade_2003_to_2013_Count << std::endl;
209
210 // Transform histogram in order to address ROOT 7 v 6 version compatibility
211 auto fallHist = GetDrawableHist(fallHistResultPtr);
212 auto winterHist = GetDrawableHist(winterHistResultPtr);
213 auto springHist = GetDrawableHist(springHistResultPtr);
214 auto summerHist = GetDrawableHist(summerHistResultPtr);
215
216 // Set an orange histogram for fall.
217 fallHist->SetLineColor(kOrange);
218 fallHist->SetLineWidth(6);
219 // Set a blue histogram for winter.
220 winterHist->SetLineColor(kBlue);
221 winterHist->SetLineWidth(6);
222 // Set a green histogram for spring.
223 springHist->SetLineColor(kGreen);
224 springHist->SetLineWidth(6);
225 // Set a red histogram for summer.
226 summerHist->SetLineColor(kRed);
227 summerHist->SetLineWidth(6);
228
229 // Transform histogram in order to address ROOT 7 v 6 version compatibility
230 auto hist_1993_to_2002 = GetDrawableHist(hist_1993_to_2002_ResultPtr);
231 auto hist_2003_to_2013 = GetDrawableHist(hist_2003_to_2013_ResultPtr);
232
233 // Set a violet histogram for 1993_to_2002.
234 hist_1993_to_2002->SetLineColor(kViolet);
235 hist_1993_to_2002->SetLineWidth(6);
236 // Set a spring-green histogram for 2003_to_2013.
237 hist_2003_to_2013->SetLineColor(kSpring);
238 hist_2003_to_2013->SetLineWidth(6);
239
240 // Create a canvas to display histograms for average temperature by season.
241 auto canvas = RCanvas::Create("Average Temperature by Season");
242 canvas->Draw<TObjectDrawable>(fallHist, "L");
243 canvas->Draw<TObjectDrawable>(winterHist, "L");
244 canvas->Draw<TObjectDrawable>(springHist, "L");
245 canvas->Draw<TObjectDrawable>(summerHist, "L");
246
247 // Create a legend for the seasons canvas.
248 auto legend = std::make_shared<TLegend>(0.15, 0.65, 0.53, 0.85);
249 legend->AddEntry(fallHist.get(), "fall", "l");
250 legend->AddEntry(winterHist.get(), "winter", "l");
251 legend->AddEntry(springHist.get(), "spring", "l");
252 legend->AddEntry(summerHist.get(), "summer", "l");
253 canvas->Draw<TObjectDrawable>(legend, "L");
254 canvas->Show();
255
256 // Create a canvas to display histograms for average temperature for 1993_to_2002 & 2003_to_2013.
257 auto canvas2 = RCanvas::Create("Average Temperature: 1993_to_2002 vs. 2003_to_2013");
258 canvas2->Draw<TObjectDrawable>(hist_1993_to_2002, "L");
259 canvas2->Draw<TObjectDrawable>(hist_2003_to_2013, "L");
260
261 // Create a legend for the two decades canvas.
262 auto legend2 = std::make_shared<TLegend>(0.1, 0.7, 0.48, 0.9);
263 legend2->AddEntry(hist_1993_to_2002.get(), "1993_to_2002", "l");
264 legend2->AddEntry(hist_2003_to_2013.get(), "2003_to_2013", "l");
265 canvas2->Draw<TObjectDrawable>(legend2, "L");
266 canvas2->Show();
267}
268
269void ntpl011_global_temperatures()
270{
271 // if NOT zero (the file does NOT already exist), then Ingest
272 if (gSystem->AccessPathName(kNTupleFileName) != 0) {
273 Ingest();
274 }
275 Analyze();
276}
#define h(i)
Definition RSha256.hxx:106
@ kRed
Definition Rtypes.h:66
@ kOrange
Definition Rtypes.h:67
@ kGreen
Definition Rtypes.h:66
@ kBlue
Definition Rtypes.h:66
@ kViolet
Definition Rtypes.h:67
@ kSpring
Definition Rtypes.h:67
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
R__EXTERN TSystem * gSystem
Definition TSystem.h:561
Provides v7 drawing facilities for TObject types (TGraph, TH1, TH2, etc).
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
Smart pointer for the return type of actions.
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
1-D histogram with a double per channel (see TH1 documentation)
Definition TH1.h:670
virtual Bool_t AccessPathName(const char *path, EAccessMode mode=kFileExists)
Returns FALSE if one can access a file using the specified access mode.
Definition TSystem.cxx:1296
On construction, an ROptions parameter can customize the RRawFile behavior.
Definition RRawFile.hxx:49
size_t fBlockSize
Read at least fBlockSize bytes at a time. A value of zero turns off I/O buffering.
Definition RRawFile.hxx:54
auto * t1
Definition textangle.C:20