Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleInspector.hxx
Go to the documentation of this file.
1/// \file ROOT/RNTupleInspector.hxx
2/// \ingroup NTuple ROOT7
3/// \author Florine de Geus <florine.de.geus@cern.ch>
4/// \date 2023-01-09
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2023, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#ifndef ROOT7_RNTupleInspector
17#define ROOT7_RNTupleInspector
18
19#include <ROOT/RError.hxx>
20#include <ROOT/RNTuple.hxx>
22
23#include <TFile.h>
24#include <TH1D.h>
25
26#include <cstdlib>
27#include <memory>
28#include <regex>
29#include <vector>
30
31namespace ROOT {
32namespace Experimental {
33
36
37// clang-format off
38/**
39\class ROOT::Experimental::RNTupleInspector
40\ingroup NTuple
41\brief Inspect on-disk and storage-related information of an RNTuple.
42
43The RNTupleInspector can be used for studying an RNTuple in terms of its storage efficiency. It provides information on
44the level of the RNTuple itself, on the (sub)field level and on the column level.
45
46Example usage:
47
48~~~ {.cpp}
49#include <ROOT/RNTuple.hxx>
50#include <ROOT/RNTupleInspector.hxx>
51
52#include <iostream>
53
54using ROOT::Experimental::RNTuple;
55using ROOT::Experimental::RNTupleInspector;
56
57auto file = TFile::Open("data.rntuple");
58auto rntuple = file->Get<RNTuple>("NTupleName");
59auto inspector = RNTupleInspector::Create(rntuple);
60
61std::cout << "The compression factor is " << inspector->GetCompressionFactor()
62 << " using compression settings " << inspector->GetCompressionSettings()
63 << std::endl;
64~~~
65*/
66// clang-format on
68public:
69 /////////////////////////////////////////////////////////////////////////////
70 /// \brief Holds column-level storage information.
71 ///
72 /// The RColumnInfo class provides storage information for an individual column. This information is either
73 /// collected during the construction of the RNTupleInspector object, or can be accessed using
74 /// the RColumnDescriptor that belongs to this column.
76 private:
78 std::uint64_t fCompressedSize = 0;
79 std::uint32_t fElementSize = 0;
80 std::uint64_t fNElements = 0;
81
82 public:
83 RColumnInfo(const RColumnDescriptor &colDesc, std::uint64_t onDiskSize, std::uint32_t elemSize,
84 std::uint64_t nElems)
85 : fColumnDescriptor(colDesc), fCompressedSize(onDiskSize), fElementSize(elemSize), fNElements(nElems){};
86 ~RColumnInfo() = default;
87
89 std::uint64_t GetCompressedSize() const { return fCompressedSize; }
90 std::uint64_t GetUncompressedSize() const { return fElementSize * fNElements; }
91 std::uint64_t GetElementSize() const { return fElementSize; }
92 std::uint64_t GetNElements() const { return fNElements; }
94 };
95
96 /////////////////////////////////////////////////////////////////////////////
97 /// \brief Holds field-level storage information.
98 ///
99 /// The RFieldTreeInfo class provides storage information for a field **and** its subfields. This information is
100 /// either collected during the construction of the RNTupleInspector object, or can be accessed using
101 /// the RFieldDescriptor that belongs to this field.
103 private:
105 std::uint64_t fCompressedSize = 0;
106 std::uint64_t fUncompressedSize = 0;
107
108 public:
109 RFieldTreeInfo(const RFieldDescriptor &fieldDesc, std::uint64_t onDiskSize, std::uint64_t inMemSize)
110 : fRootFieldDescriptor(fieldDesc), fCompressedSize(onDiskSize), fUncompressedSize(inMemSize){};
111 ~RFieldTreeInfo() = default;
112
114 std::uint64_t GetCompressedSize() const { return fCompressedSize; }
115 std::uint64_t GetUncompressedSize() const { return fUncompressedSize; }
116 };
117
118private:
119 std::unique_ptr<TFile> fSourceFile;
120 std::unique_ptr<Detail::RPageSource> fPageSource;
121 std::unique_ptr<RNTupleDescriptor> fDescriptor;
123 std::uint64_t fCompressedSize = 0;
124 std::uint64_t fUncompressedSize = 0;
125
126 std::map<int, RColumnInfo> fColumnInfo;
127 std::map<int, RFieldTreeInfo> fFieldTreeInfo;
128
129 RNTupleInspector(std::unique_ptr<Detail::RPageSource> pageSource);
130
131 /////////////////////////////////////////////////////////////////////////////
132 /// \brief Gather column-level and RNTuple-level information.
133 ///
134 /// \note This method is called when the RNTupleInspector is initially created. This means that anything unexpected
135 /// about the RNTuple itself (e.g. inconsistent compression settings across clusters) will be detected here.
136 /// Therefore, any related exceptions will be thrown on creation of the inspector.
137 void CollectColumnInfo();
138
139 /////////////////////////////////////////////////////////////////////////////
140 /// \brief Recursively gather field-level information.
141 ///
142 /// \param[in] fieldId The ID of the field from which to start the recursive traversal. Typically this is the "zero
143 /// ID", i.e. the logical parent of all top-level fields.
144 ///
145 /// \return The RFieldTreeInfo for the provided field ID.
146 ///
147 /// This method is called when the RNTupleInspector is initially created.
149
150 /////////////////////////////////////////////////////////////////////////////
151 /// \brief Get the columns that make up the given field, including its subfields.
152 ///
153 /// \param [in] fieldId The ID of the field for which to collect the columns.
154 ///
155 /// \return A vector containing the IDs of all columns for the provided field ID.
156 std::vector<DescriptorId_t> GetColumnsByFieldId(DescriptorId_t fieldId) const;
157
158public:
159 RNTupleInspector(const RNTupleInspector &other) = delete;
163 ~RNTupleInspector() = default;
164
165 /////////////////////////////////////////////////////////////////////////////
166 /// \brief Create a new RNTupleInspector.
167 ///
168 /// \param[in] sourceNTuple A pointer to the RNTuple to be inspected.
169 ///
170 /// \return A pointer to the newly created RNTupleInspector.
171 ///
172 /// \note When this factory method is called, all required static information is collected from the RNTuple's
173 /// fields and underlying columns at once. This means that when any inconsistencies are encountered (e.g.
174 /// inconsistent compression across clusters), it will throw an error here.
175 static std::unique_ptr<RNTupleInspector> Create(RNTuple *sourceNTuple);
176
177 /////////////////////////////////////////////////////////////////////////////
178 /// \brief Create a new RNTupleInspector.
179 ///
180 /// \param[in] ntupleName The name of the RNTuple to be inspected.
181 /// \param[in] storage The path or URI to the RNTuple to be inspected.
182 ///
183 /// \see Create(RNTuple *sourceNTuple)
184 static std::unique_ptr<RNTupleInspector> Create(std::string_view ntupleName, std::string_view storage);
185
186 /////////////////////////////////////////////////////////////////////////////
187 /// \brief Create a new RNTupleInspector.
188 ///
189 /// \param[in] pageSource The RPageSource object belonging to the RNTuple to be inspected.
190 ///
191 /// \see Create(RNTuple *sourceNTuple)
192 static std::unique_ptr<RNTupleInspector> Create(std::unique_ptr<Detail::RPageSource> pageSource);
193
194 /////////////////////////////////////////////////////////////////////////////
195 /// \brief Get the descriptor for the RNTuple being inspected.
196 ///
197 /// \return A static copy of the RNTupleDescriptor belonging to the inspected RNTuple.
198 RNTupleDescriptor *GetDescriptor() const { return fDescriptor.get(); }
199
200 /////////////////////////////////////////////////////////////////////////////
201 /// \brief Get the compression settings of the RNTuple being inspected.
202 ///
203 /// \return The integer representation (\f$algorithm * 100 + level\f$, where \f$algorithm\f$ follows
204 /// ROOT::RCompressionSetting::EAlgorithm::EValues) of the compression settings used for the inspected RNTuple.
205 ///
206 /// \note Here, we assume that the compression settings are consistent across all clusters and columns. If this is
207 /// not the case, an exception will be thrown when RNTupleInspector::Create is called.
209
210 /////////////////////////////////////////////////////////////////////////////
211 /// \brief Get a string describing compression settings of the RNTuple being inspected.
212 ///
213 /// \return A string describing the compression used for the inspected RNTuple. The format of the string is
214 /// `"A (level L)"`, where `A` is the name of the compression algorithm and `L` the compression level.
215 ///
216 /// \note Here, we assume that the compression settings are consistent across all clusters and columns. If this is
217 /// not the case, an exception will be thrown when RNTupleInspector::Create is called.
218 std::string GetCompressionSettingsAsString() const;
219
220 /////////////////////////////////////////////////////////////////////////////
221 /// \brief Get the compressed, on-disk size of the RNTuple being inspected.
222 ///
223 /// \return The compressed size of the inspected RNTuple, in bytes, excluding the size of the header and footer.
224 std::uint64_t GetCompressedSize() const { return fCompressedSize; }
225
226 /////////////////////////////////////////////////////////////////////////////
227 /// \brief Get the uncompressed total size of the RNTuple being inspected.
228 ///
229 /// \return The uncompressed size of the inspected RNTuple, in bytes, excluding the size of the header and footer.
230 std::uint64_t GetUncompressedSize() const { return fUncompressedSize; }
231
232 /////////////////////////////////////////////////////////////////////////////
233 /// \brief Get the compression factor of the RNTuple being inspected.
234 ///
235 /// \return The compression factor of the inspected RNTuple.
236 ///
237 /// The compression factor shows how well the data present in the RNTuple is compressed by the compression settings
238 /// that were used. The compression factor is calculated as \f$size_{uncompressed} / size_{compressed}\f$.
239 float GetCompressionFactor() const { return (float)fUncompressedSize / (float)fCompressedSize; }
240
241 /////////////////////////////////////////////////////////////////////////////
242 /// \brief Get storage information for a given column.
243 ///
244 /// \param[in] physicalColumnId The physical ID of the column for which to get the information.
245 ///
246 /// \return The storage information for the provided column.
247 const RColumnInfo &GetColumnInfo(DescriptorId_t physicalColumnId) const;
248
249 /////////////////////////////////////////////////////////////////////////////
250 /// \brief Get the number of columns of a given type present in the RNTuple.
251 ///
252 /// \param[in] colType The column type to count, as defined by ROOT::Experimental::EColumnType.
253 ///
254 /// \return The number of columns present in the inspected RNTuple of the provided type.
255 size_t GetColumnCountByType(EColumnType colType) const;
256
257 /////////////////////////////////////////////////////////////////////////////
258 /// \brief Get the IDs of all columns with the given type.
259 ///
260 /// \param[in] colType The column type to collect, as defined by ROOT::Experimental::EColumnType.
261 ///
262 /// \return A vector containing the physical IDs of columns of the provided type.
263 const std::vector<DescriptorId_t> GetColumnsByType(EColumnType colType);
264
265 /////////////////////////////////////////////////////////////////////////////
266 /// \brief Print storage information per column type.
267 ///
268 /// \param[in] format Whether to print the information as a (markdown-parseable) table or in CSV format.
269 /// \param[in] output Where to write the output to. Default is `stdout`.
270 ///
271 /// The output includes for each column type its count, the total number of elements, the compressed size and the
272 /// uncompressed size.
273 ///
274 /// **Example: printing the column type information of an RNTuple as a table**
275 /// ~~~ {.cpp}
276 /// #include <ROOT/RNTupleInspector.hxx>
277 /// using ROOT::Experimental::RNTupleInspector;
278 /// using ROOT::Experimental::ENTupleInspectorPrintFormat;
279 ///
280 /// auto inspector = RNTupleInspector::Create("myNTuple", "some/file.root");
281 /// inspector->PrintColumnTypeInfo();
282 /// ~~~
283 /// Output:
284 /// ~~~
285 /// column type | count | # elements | compressed bytes | uncompressed bytes
286 /// ----------------|---------|-----------------|-------------------|--------------------
287 /// SplitIndex64 | 2 | 150 | 72 | 1200
288 /// SplitReal32 | 4 | 300 | 189 | 1200
289 /// SplitUInt32 | 3 | 225 | 123 | 900
290 /// ~~~
291 ///
292 /// **Example: printing the column type information of an RNTuple in CSV format**
293 /// ~~~ {.cpp}
294 /// #include <ROOT/RNTupleInspector.hxx>
295 /// using ROOT::Experimental::RNTupleInspector;
296 /// using ROOT::Experimental::ENTupleInspectorPrintFormat;
297 ///
298 /// auto inspector = RNTupleInspector::Create("myNTuple", "some/file.root");
299 /// inspector->PrintColumnTypeInfo(ENTupleInspectorPrintFormat::kCSV);
300 /// ~~~
301 /// Output:
302 /// ~~~
303 /// columnType,count,nElements,compressedSize,uncompressedSize
304 /// SplitIndex64,2,150,72,1200
305 /// SplitReal32,4,300,189,1200
306 /// SplitUInt32,3,225,123,900
307 /// ~~~
309 std::ostream &output = std::cout);
310
311 /////////////////////////////////////////////////////////////////////////////
312 /// \brief Get a histogram showing information for each column type present.
313 ///
314 /// \param[in] histKind Which type of information should be returned.
315 /// \param[in] histName The name of the histogram. An empty string means a default name will be used.
316 /// \param[in] histTitle The title of the histogram. An empty string means a default title will be used.
317 ///
318 /// \return A pointer to a `TH1D` containing the specified kind of information.
319 ///
320 /// Get a histogram showing the count, number of elements, size on disk, or size in memory for each column
321 /// type present in the inspected RNTuple.
322 std::unique_ptr<TH1D> GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName = "",
323 std::string_view histTitle = "");
324
325 /////////////////////////////////////////////////////////////////////////////
326 /// \brief Get storage information for a given (sub)field by ID.
327 ///
328 /// \param[in] fieldId The ID of the (sub)field for which to get the information.
329 ///
330 /// \return The storage information for the provided (sub)field.
331 const RFieldTreeInfo &GetFieldTreeInfo(DescriptorId_t fieldId) const;
332
333 /////////////////////////////////////////////////////////////////////////////
334 /// \brief Get storage information for a given (sub)field by name.
335 ///
336 /// \param[in] fieldName The name of the (sub)field for which to get the information.
337 ///
338 /// \return The storage information for the provided (sub)field.
339 const RFieldTreeInfo &GetFieldTreeInfo(std::string_view fieldName) const;
340
341 /////////////////////////////////////////////////////////////////////////////
342 /// \brief Get the number of fields of a given type or class present in the RNTuple.
343 ///
344 /// \param[in] typeNamePattern The type or class name to count. May contain regular expression patterns for grouping
345 /// multiple kinds of types or classes.
346 /// \param[in] searchInSubFields If set to `false`, only top-level fields will be considered.
347 ///
348 /// \return The number of fields that match the provided type.
349 size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubFields = true) const;
350
351 /////////////////////////////////////////////////////////////////////////////
352 /// \brief Get the number of fields of a given type or class present in the RNTuple.
353 ///
354 /// \see GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubFields) const
355 size_t GetFieldCountByType(std::string_view typeNamePattern, bool searchInSubFields = true) const
356 {
357 return GetFieldCountByType(std::regex{std::string(typeNamePattern)}, searchInSubFields);
358 }
359
360 /////////////////////////////////////////////////////////////////////////////
361 /// \brief Get the IDs of (sub-)fields whose name matches the given string.
362 ///
363 /// \param[in] fieldNamePattern The name of the field(s) to get. Because field names are unique by design,
364 /// providing a single field name will return a vector containing just the ID of that field. However, regular
365 /// expression patterns are supported in order to get the IDs of all fields whose names follow a certain structure.
366 /// \param[in] searchInSubFields If set to `false`, only top-level fields will be considered.
367 ///
368 /// \return A vector containing the IDs of fields that match the provided name.
369 const std::vector<DescriptorId_t>
370 GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubFields = true) const;
371
372 /////////////////////////////////////////////////////////////////////////////
373 /// \brief Get the IDs of (sub-)fields whose name matches the given string.
374 ///
375 /// \see GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubFields) const
376 const std::vector<DescriptorId_t> GetFieldsByName(std::string_view fieldNamePattern, bool searchInSubFields = true)
377 {
378 return GetFieldsByName(std::regex{std::string(fieldNamePattern)}, searchInSubFields);
379 }
380};
381} // namespace Experimental
382} // namespace ROOT
383
384#endif // ROOT7_RNTupleInspector
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t format
Meta-data stored for every column of an ntuple.
Meta-data stored for every field of an ntuple.
The on-storage meta-data of an ntuple.
Holds column-level storage information.
const RColumnDescriptor & GetDescriptor() const
RColumnInfo(const RColumnDescriptor &colDesc, std::uint64_t onDiskSize, std::uint32_t elemSize, std::uint64_t nElems)
RFieldTreeInfo(const RFieldDescriptor &fieldDesc, std::uint64_t onDiskSize, std::uint64_t inMemSize)
Inspect on-disk and storage-related information of an RNTuple.
const RFieldTreeInfo & GetFieldTreeInfo(DescriptorId_t fieldId) const
Get storage information for a given (sub)field by ID.
float GetCompressionFactor() const
Get the compression factor of the RNTuple being inspected.
const std::vector< DescriptorId_t > GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubFields=true) const
Get the IDs of (sub-)fields whose name matches the given string.
RNTupleInspector & operator=(RNTupleInspector &&other)=delete
RNTupleInspector(const RNTupleInspector &other)=delete
std::uint64_t GetCompressedSize() const
Get the compressed, on-disk size of the RNTuple being inspected.
std::map< int, RColumnInfo > fColumnInfo
std::uint64_t GetUncompressedSize() const
Get the uncompressed total size of the RNTuple being inspected.
size_t GetColumnCountByType(EColumnType colType) const
Get the number of columns of a given type present in the RNTuple.
RNTupleInspector(RNTupleInspector &&other)=delete
std::map< int, RFieldTreeInfo > fFieldTreeInfo
std::string GetCompressionSettingsAsString() const
Get a string describing compression settings of the RNTuple being inspected.
RNTupleDescriptor * GetDescriptor() const
Get the descriptor for the RNTuple being inspected.
static std::unique_ptr< RNTupleInspector > Create(RNTuple *sourceNTuple)
Create a new RNTupleInspector.
size_t GetFieldCountByType(std::string_view typeNamePattern, bool searchInSubFields=true) const
Get the number of fields of a given type or class present in the RNTuple.
const std::vector< DescriptorId_t > GetColumnsByType(EColumnType colType)
Get the IDs of all columns with the given type.
void PrintColumnTypeInfo(ENTupleInspectorPrintFormat format=ENTupleInspectorPrintFormat::kTable, std::ostream &output=std::cout)
Print storage information per column type.
const std::vector< DescriptorId_t > GetFieldsByName(std::string_view fieldNamePattern, bool searchInSubFields=true)
Get the IDs of (sub-)fields whose name matches the given string.
RFieldTreeInfo CollectFieldTreeInfo(DescriptorId_t fieldId)
Recursively gather field-level information.
std::unique_ptr< RNTupleDescriptor > fDescriptor
RNTupleInspector & operator=(const RNTupleInspector &other)=delete
std::vector< DescriptorId_t > GetColumnsByFieldId(DescriptorId_t fieldId) const
Get the columns that make up the given field, including its subfields.
void CollectColumnInfo()
Gather column-level and RNTuple-level information.
int GetCompressionSettings() const
Get the compression settings of the RNTuple being inspected.
size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubFields=true) const
Get the number of fields of a given type or class present in the RNTuple.
const RColumnInfo & GetColumnInfo(DescriptorId_t physicalColumnId) const
Get storage information for a given column.
std::unique_ptr< TH1D > GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName="", std::string_view histTitle="")
Get a histogram showing information for each column type present.
std::unique_ptr< Detail::RPageSource > fPageSource
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:512
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
static void output()