Logo ROOT  
Reference Guide
RNTupleDescriptor.hxx
Go to the documentation of this file.
1/// \file ROOT/RNTupleDescriptor.hxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2018-07-19
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#ifndef ROOT7_RNTupleDescriptor
17#define ROOT7_RNTupleDescriptor
18
19#include <ROOT/RColumnModel.hxx>
20#include <ROOT/RNTupleUtil.hxx>
21#include <ROOT/RStringView.hxx>
22
23#include <chrono>
24#include <memory>
25#include <ostream>
26#include <vector>
27#include <string>
28#include <unordered_map>
29
30namespace ROOT {
31namespace Experimental {
32
33class RNTupleDescriptorBuilder;
34class RNTupleModel;
35
36// clang-format off
37/**
38\class ROOT::Experimental::RFieldDescriptor
39\ingroup NTuple
40\brief Meta-data stored for every field of an ntuple
41*/
42// clang-format on
45
46private:
48 /// The version of the C++-type-to-column translation mechanics
50 /// The version of the C++ type itself
52 /// The leaf name, not including parent fields
53 std::string fFieldName;
54 /// Free text set by the user
55 std::string fFieldDescription;
56 /// The C++ type that was used when writing the field
57 std::string fTypeName;
58 /// The number of elements per entry for fixed-size arrays
59 std::uint64_t fNRepetitions;
60 /// The structural information carried by this field in the data model tree
62 /// Establishes sub field relationships, such as classes and collections
64 /// The pointers in the other direction from parent to children. They are serialized, too, to keep the
65 /// order of sub fields.
66 std::vector<DescriptorId_t> fLinkIds;
67
68public:
69 /// In order to handle changes to the serialization routine in future ntuple versions
70 static constexpr std::uint16_t kFrameVersionCurrent = 0;
71 static constexpr std::uint16_t kFrameVersionMin = 0;
72
73 RFieldDescriptor() = default;
74 RFieldDescriptor(const RFieldDescriptor &other) = delete;
78
79 bool operator==(const RFieldDescriptor &other) const;
80
81 DescriptorId_t GetId() const { return fFieldId; }
84 std::string GetFieldName() const { return fFieldName; }
85 std::string GetFieldDescription() const { return fFieldDescription; }
86 std::string GetTypeName() const { return fTypeName; }
87 std::uint64_t GetNRepetitions() const { return fNRepetitions; }
90 const std::vector<DescriptorId_t> &GetLinkIds() const { return fLinkIds; }
91};
92
93
94// clang-format off
95/**
96\class ROOT::Experimental::RColumnDescriptor
97\ingroup NTuple
98\brief Meta-data stored for every column of an ntuple
99*/
100// clang-format on
103
104private:
106 /// Versions can change, e.g., when new column types are added
108 /// Contains the column type and whether it is sorted
110 /// Every column belongs to one and only one field
112 /// A field can be serialized into several columns, which are numbered from zero to $n$
113 std::uint32_t fIndex;
114
115public:
116 /// In order to handle changes to the serialization routine in future ntuple versions
117 static constexpr std::uint16_t kFrameVersionCurrent = 0;
118 static constexpr std::uint16_t kFrameVersionMin = 0;
119
120 RColumnDescriptor() = default;
121 RColumnDescriptor(const RColumnDescriptor &other) = delete;
125
126 bool operator==(const RColumnDescriptor &other) const;
127
128 DescriptorId_t GetId() const { return fColumnId; }
130 RColumnModel GetModel() const { return fModel; }
131 std::uint32_t GetIndex() const { return fIndex; }
133};
134
135
136// clang-format off
137/**
138\class ROOT::Experimental::RClusterDescriptor
139\ingroup NTuple
140\brief Meta-data for a set of ntuple clusters
141
142The cluster descriptor might carry information of only a subset of available clusters, for instance if multiple
143files are chained and not all of them have been processed yet.
144*/
145// clang-format on
148
149public:
150 /// Generic information about the physical location of data. Values depend on the concrete storage type. E.g.,
151 /// for a local file fUrl might be unused and fPosition might be a file offset. Objects on storage can be compressed
152 /// and therefore we need to store their actual size.
153 struct RLocator {
154 std::int64_t fPosition = 0;
155 std::uint32_t fBytesOnStorage = 0;
156 std::string fUrl;
157
158 bool operator==(const RLocator &other) const {
159 return fPosition == other.fPosition && fBytesOnStorage == other.fBytesOnStorage && fUrl == other.fUrl;
160 }
161 };
162
163 /// The window of element indexes of a particular column in a particular cluster
166 /// A 64bit element index
168 /// A 32bit value for the number of column elements in the cluster
170 /// The usual format for ROOT compression settings (see Compression.h).
171 /// The pages of a particular column in a particular cluster are all compressed with the same settings.
172 std::int64_t fCompressionSettings = 0;
173
174 // TODO(jblomer): we perhaps want to store summary information, such as average, min/max, etc.
175 // Should this be done on the field level?
176
177 bool operator==(const RColumnRange &other) const {
178 return fColumnId == other.fColumnId && fFirstElementIndex == other.fFirstElementIndex &&
180 }
181
182 bool Contains(NTupleSize_t index) const {
183 return (fFirstElementIndex <= index && (fFirstElementIndex + fNElements) > index);
184 }
185 };
186
187 /// Records the partition of data into pages for a particular column in a particular cluster
188 struct RPageRange {
189 /// We do not need to store the element size / uncompressed page size because we know to which column
190 /// the page belongs
191 struct RPageInfo {
192 /// The sum of the elements of all the pages must match the corresponding fNElements field in fColumnRanges
194 /// The meaning of fLocator depends on the storage backend.
196
197 bool operator==(const RPageInfo &other) const {
198 return fNElements == other.fNElements && fLocator == other.fLocator;
199 }
200 };
201
202 RPageRange() = default;
203 RPageRange(const RPageRange &other) = delete;
204 RPageRange &operator =(const RPageRange &other) = delete;
205 RPageRange(RPageRange &&other) = default;
206 RPageRange &operator =(RPageRange &&other) = default;
207
209 std::vector<RPageInfo> fPageInfos;
210
211 bool operator==(const RPageRange &other) const {
212 return fColumnId == other.fColumnId && fPageInfos == other.fPageInfos;
213 }
214 };
215
216private:
218 /// Future versions of the cluster descriptor might add more meta-data, e.g. a semantic checksum
220 /// Clusters can be swapped by adjusting the entry offsets
223 /// For pre-fetching / caching an entire contiguous cluster
225
226 std::unordered_map<DescriptorId_t, RColumnRange> fColumnRanges;
227 std::unordered_map<DescriptorId_t, RPageRange> fPageRanges;
228
229public:
230 /// In order to handle changes to the serialization routine in future ntuple versions
231 static constexpr std::uint16_t kFrameVersionCurrent = 0;
232 static constexpr std::uint16_t kFrameVersionMin = 0;
233
239
240 bool operator==(const RClusterDescriptor &other) const;
241
242 DescriptorId_t GetId() const { return fClusterId; }
246 RLocator GetLocator() const { return fLocator; }
247 const RColumnRange &GetColumnRange(DescriptorId_t columnId) const { return fColumnRanges.at(columnId); }
248 const RPageRange &GetPageRange(DescriptorId_t columnId) const { return fPageRanges.at(columnId); }
249};
250
251
252// clang-format off
253/**
254\class ROOT::Experimental::RNTupleDescriptor
255\ingroup NTuple
256\brief The on-storage meta-data of an ntuple
257
258Represents the on-disk (on storage) information about an ntuple. The meta-data consists of a header and one or
259several footers. The header carries the ntuple schema, i.e. the fields and the associated columns and their
260relationships. The footer(s) carry information about one or several clusters. For every cluster, a footer stores
261its location and size, and for every column the range of element indexes as well as a list of pages and page
262locations.
263
264The descriptor provides machine-independent (de-)serialization of headers and footers, and it provides lookup routines
265for ntuple objects (pages, clusters, ...). It is supposed to be usable by all RPageStorage implementations.
266
267The serialization does not use standard ROOT streamers in order to not let it depend on libCore. The serialization uses
268the concept of frames: header, footer, and substructures have a preamble with version numbers and the size of the
269written struct. This allows for forward and backward compatibility when the meta-data evolves.
270*/
271// clang-format on
274
275private:
276 /// The ntuple name needs to be unique in a given storage location (file)
277 std::string fName;
278 /// Free text from the user
279 std::string fDescription;
280 /// The origin of the data
281 std::string fAuthor;
282 /// The party currently responsible for storing the data
283 std::string fCustodian;
284 /// The time stamp of the ntuple data (immutable)
285 std::chrono::system_clock::time_point fTimeStampData;
286 /// The time stamp of writing the data to storage, which gets updated when re-written
287 std::chrono::system_clock::time_point fTimeStampWritten;
288 /// The version evolves with the ntuple summary meta-data
290 /// Every NTuple gets a unique identifier
292 /// Column sets that are created as derived sets from existing NTuples share the same group id.
293 /// NTuples in the same group have the same number of entries and are supposed to contain associated data.
295
296 std::unordered_map<DescriptorId_t, RFieldDescriptor> fFieldDescriptors;
297 std::unordered_map<DescriptorId_t, RColumnDescriptor> fColumnDescriptors;
298 /// May contain only a subset of all the available clusters, e.g. the clusters of the current file
299 /// from a chain of files
300 std::unordered_map<DescriptorId_t, RClusterDescriptor> fClusterDescriptors;
301
302public:
303 /// In order to handle changes to the serialization routine in future ntuple versions
304 static constexpr std::uint16_t kFrameVersionCurrent = 0;
305 static constexpr std::uint16_t kFrameVersionMin = 0;
306 /// The preamble is sufficient to get the length of the header
307 static constexpr unsigned int kNBytesPreamble = 8;
308 /// The last few bytes after the footer store the length of footer and header
309 static constexpr unsigned int kNBytesPostscript = 16;
310
311 RNTupleDescriptor() = default;
312 RNTupleDescriptor(const RNTupleDescriptor &other) = delete;
316
317 bool operator ==(const RNTupleDescriptor &other) const;
318
319 /// We deliberately do not use ROOT's built-in serialization in order to allow for use of RNTuple's without libCore
320 /// Serializes the global ntuple information as well as the column and field schemata
321 /// Returns the number of bytes and fills buffer if it is not nullptr.
322 /// TODO(jblomer): instead of runtime testing for nullptr, there should be a template for the case where
323 /// only the size of the buffer is required.
324 std::uint32_t SerializeHeader(void* buffer) const;
325 /// Serializes cluster meta data. Returns the number of bytes and fills buffer if it is not nullptr.
326 std::uint32_t SerializeFooter(void* buffer) const;
327 /// Given kNBytesPostscript bytes, extract the header and footer lengths in bytes
328 static void LocateMetadata(const void *postscript, std::uint32_t &szHeader, std::uint32_t &szFooter);
329
330 const RFieldDescriptor& GetFieldDescriptor(DescriptorId_t fieldId) const { return fFieldDescriptors.at(fieldId); }
332 return fColumnDescriptors.at(columnId);
333 }
335 return fClusterDescriptors.at(clusterId);
336 }
337 std::string GetName() const { return fName; }
338 std::string GetDescription() const { return fDescription; }
339 std::string GetAuthor() const { return fAuthor; }
340 std::string GetCustodian() const { return fCustodian; }
341 std::chrono::system_clock::time_point GetTimeStampData() const { return fTimeStampData; }
342 std::chrono::system_clock::time_point GetTimeStampWritten() const { return fTimeStampWritten; }
344 RNTupleUuid GetOwnUuid() const { return fOwnUuid; }
346
347 std::size_t GetNFields() const { return fFieldDescriptors.size(); }
348 std::size_t GetNColumns() const { return fColumnDescriptors.size(); }
349 std::size_t GetNClusters() const { return fClusterDescriptors.size(); }
350
351 // The number of entries as seen with the currently loaded cluster meta-data; there might be more
354
356 /// Searches for a top-level field
358 DescriptorId_t FindColumnId(DescriptorId_t fieldId, std::uint32_t columnIndex) const;
360
361 /// Re-create the C++ model from the stored meta-data
362 std::unique_ptr<RNTupleModel> GenerateModel() const;
363 void PrintInfo(std::ostream &output) const;
364};
365
366
367// clang-format off
368/**
369\class ROOT::Experimental::RNTupleDescriptorBuilder
370\ingroup NTuple
371\brief A helper class for piece-wise construction of an RNTupleDescriptor
372
373Used by RPageStorage implementations in order to construct the RNTupleDescriptor from the various header parts.
374*/
375// clang-format on
377private:
379
380public:
381 bool IsValid() const { return true; /* TODO(jblomer) */}
382 const RNTupleDescriptor& GetDescriptor() const { return fDescriptor; }
384
385 void SetNTuple(const std::string_view name, const std::string_view description, const std::string_view author,
386 const RNTupleVersion &version, const RNTupleUuid &uuid);
387
388 void AddField(DescriptorId_t fieldId, const RNTupleVersion &fieldVersion, const RNTupleVersion &typeVersion,
389 std::string_view fieldName, std::string_view typeName, std::uint64_t nRepetitions,
390 ENTupleStructure structure);
391 void AddFieldLink(DescriptorId_t fieldId, DescriptorId_t linkId);
392
393 void AddColumn(DescriptorId_t columnId, DescriptorId_t fieldId,
394 const RNTupleVersion &version, const RColumnModel &model, std::uint32_t index);
395
396 void SetFromHeader(void* headerBuffer);
397
398 void AddCluster(DescriptorId_t clusterId, RNTupleVersion version,
399 NTupleSize_t firstEntryIndex, ClusterSize_t nEntries);
403
404 void AddClustersFromFooter(void* footerBuffer);
405};
406
407} // namespace Experimental
408} // namespace ROOT
409
410#endif
char name[80]
Definition: TGX11.cxx:109
Meta-data for a set of ntuple clusters.
std::unordered_map< DescriptorId_t, RPageRange > fPageRanges
RNTupleVersion fVersion
Future versions of the cluster descriptor might add more meta-data, e.g. a semantic checksum.
RClusterDescriptor(RClusterDescriptor &&other)=default
RLocator fLocator
For pre-fetching / caching an entire contiguous cluster.
static constexpr std::uint16_t kFrameVersionMin
RClusterDescriptor(const RClusterDescriptor &other)=delete
const RPageRange & GetPageRange(DescriptorId_t columnId) const
NTupleSize_t fFirstEntryIndex
Clusters can be swapped by adjusting the entry offsets.
const RColumnRange & GetColumnRange(DescriptorId_t columnId) const
RClusterDescriptor & operator=(const RClusterDescriptor &other)=delete
std::unordered_map< DescriptorId_t, RColumnRange > fColumnRanges
bool operator==(const RClusterDescriptor &other) const
static constexpr std::uint16_t kFrameVersionCurrent
In order to handle changes to the serialization routine in future ntuple versions.
Meta-data stored for every column of an ntuple.
RColumnDescriptor(const RColumnDescriptor &other)=delete
static constexpr std::uint16_t kFrameVersionCurrent
In order to handle changes to the serialization routine in future ntuple versions.
RColumnDescriptor(RColumnDescriptor &&other)=default
DescriptorId_t fFieldId
Every column belongs to one and only one field.
RColumnDescriptor & operator=(const RColumnDescriptor &other)=delete
RColumnModel fModel
Contains the column type and whether it is sorted.
static constexpr std::uint16_t kFrameVersionMin
RNTupleVersion fVersion
Versions can change, e.g., when new column types are added.
std::uint32_t fIndex
A field can be serialized into several columns, which are numbered from zero to $n$.
bool operator==(const RColumnDescriptor &other) const
Holds the static meta-data of a column in a tree.
Meta-data stored for every field of an ntuple.
std::vector< DescriptorId_t > fLinkIds
The pointers in the other direction from parent to children.
RNTupleVersion fFieldVersion
The version of the C++-type-to-column translation mechanics.
std::string fFieldDescription
Free text set by the user.
static constexpr std::uint16_t kFrameVersionMin
std::string fFieldName
The leaf name, not including parent fields.
const std::vector< DescriptorId_t > & GetLinkIds() const
RFieldDescriptor(const RFieldDescriptor &other)=delete
DescriptorId_t fParentId
Establishes sub field relationships, such as classes and collections.
RNTupleVersion fTypeVersion
The version of the C++ type itself.
bool operator==(const RFieldDescriptor &other) const
ENTupleStructure fStructure
The structural information carried by this field in the data model tree.
RFieldDescriptor & operator=(const RFieldDescriptor &other)=delete
RFieldDescriptor(RFieldDescriptor &&other)=default
std::string fTypeName
The C++ type that was used when writing the field.
std::uint64_t fNRepetitions
The number of elements per entry for fixed-size arrays.
static constexpr std::uint16_t kFrameVersionCurrent
In order to handle changes to the serialization routine in future ntuple versions.
A helper class for piece-wise construction of an RNTupleDescriptor.
void AddCluster(DescriptorId_t clusterId, RNTupleVersion version, NTupleSize_t firstEntryIndex, ClusterSize_t nEntries)
void AddFieldLink(DescriptorId_t fieldId, DescriptorId_t linkId)
void AddColumn(DescriptorId_t columnId, DescriptorId_t fieldId, const RNTupleVersion &version, const RColumnModel &model, std::uint32_t index)
const RNTupleDescriptor & GetDescriptor() const
void SetClusterLocator(DescriptorId_t clusterId, RClusterDescriptor::RLocator locator)
void AddClusterColumnRange(DescriptorId_t clusterId, const RClusterDescriptor::RColumnRange &columnRange)
void AddField(DescriptorId_t fieldId, const RNTupleVersion &fieldVersion, const RNTupleVersion &typeVersion, std::string_view fieldName, std::string_view typeName, std::uint64_t nRepetitions, ENTupleStructure structure)
void SetNTuple(const std::string_view name, const std::string_view description, const std::string_view author, const RNTupleVersion &version, const RNTupleUuid &uuid)
void AddClusterPageRange(DescriptorId_t clusterId, RClusterDescriptor::RPageRange &&pageRange)
The on-storage meta-data of an ntuple.
std::unordered_map< DescriptorId_t, RClusterDescriptor > fClusterDescriptors
May contain only a subset of all the available clusters, e.g.
RNTupleUuid fGroupUuid
Column sets that are created as derived sets from existing NTuples share the same group id.
std::unique_ptr< RNTupleModel > GenerateModel() const
Re-create the C++ model from the stored meta-data.
std::chrono::system_clock::time_point fTimeStampWritten
The time stamp of writing the data to storage, which gets updated when re-written.
std::uint32_t SerializeHeader(void *buffer) const
We deliberately do not use ROOT's built-in serialization in order to allow for use of RNTuple's witho...
std::chrono::system_clock::time_point GetTimeStampData() const
std::unordered_map< DescriptorId_t, RColumnDescriptor > fColumnDescriptors
RNTupleDescriptor(RNTupleDescriptor &&other)=default
std::string fName
The ntuple name needs to be unique in a given storage location (file)
std::uint32_t SerializeFooter(void *buffer) const
Serializes cluster meta data. Returns the number of bytes and fills buffer if it is not nullptr.
std::chrono::system_clock::time_point GetTimeStampWritten() const
const RClusterDescriptor & GetClusterDescriptor(DescriptorId_t clusterId) const
RNTupleDescriptor(const RNTupleDescriptor &other)=delete
std::string fAuthor
The origin of the data.
std::unordered_map< DescriptorId_t, RFieldDescriptor > fFieldDescriptors
static constexpr std::uint16_t kFrameVersionMin
RNTupleDescriptor & operator=(RNTupleDescriptor &&other)=default
RNTupleVersion fVersion
The version evolves with the ntuple summary meta-data.
bool operator==(const RNTupleDescriptor &other) const
DescriptorId_t FindFieldId(std::string_view fieldName, DescriptorId_t parentId) const
const RColumnDescriptor & GetColumnDescriptor(DescriptorId_t columnId) const
const RFieldDescriptor & GetFieldDescriptor(DescriptorId_t fieldId) const
std::string fCustodian
The current responsible for storing the data.
DescriptorId_t FindColumnId(DescriptorId_t fieldId, std::uint32_t columnIndex) const
NTupleSize_t GetNElements(DescriptorId_t columnId) const
RNTupleDescriptor & operator=(const RNTupleDescriptor &other)=delete
static constexpr unsigned int kNBytesPreamble
The preamble is sufficient to get the length of the header.
static void LocateMetadata(const void *postscript, std::uint32_t &szHeader, std::uint32_t &szFooter)
Given kNBytesPostscript bytes, extract the header and footer lengths in bytes.
std::string fDescription
Free text from the user.
static constexpr std::uint16_t kFrameVersionCurrent
In order to handle changes to the serialization routine in future ntuple versions.
static constexpr unsigned int kNBytesPostscript
The last few bytes after the footer store the length of footer and header.
RNTupleUuid fOwnUuid
Every NTuple gets a unique identifier.
void PrintInfo(std::ostream &output) const
std::chrono::system_clock::time_point fTimeStampData
The time stamp of the ntuple data (immutable)
DescriptorId_t FindClusterId(DescriptorId_t columnId, NTupleSize_t index) const
For forward and backward compatibility, attach version information to the constituents of the file f...
basic_string_view< char > string_view
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
Definition: RNTupleUtil.hxx:42
ENTupleStructure
The fields in the ntuple model tree can carry different structural information about the type system.
Definition: RNTupleUtil.hxx:32
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
Definition: RNTupleUtil.hxx:78
constexpr NTupleSize_t kInvalidNTupleIndex
Definition: RNTupleUtil.hxx:43
std::string RNTupleUuid
Every NTuple is identified by a UUID. TODO(jblomer): should this be a TUUID?
constexpr ClusterSize_t kInvalidClusterIndex(std::uint32_t(-1))
constexpr DescriptorId_t kInvalidDescriptorId
Definition: RNTupleUtil.hxx:79
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Definition: StringConv.hxx:21
The window of element indexes of a particular column in a particular cluster.
std::int64_t fCompressionSettings
The usual format for ROOT compression settings (see Compression.h).
NTupleSize_t fFirstElementIndex
A 64bit element index.
ClusterSize_t fNElements
A 32bit value for the number of column elements in the cluster.
bool operator==(const RColumnRange &other) const
Generic information about the physical location of data.
We do not need to store the element size / uncompressed page size because we know to which column the...
RLocator fLocator
The meaning of fLocator depends on the storage backend.
ClusterSize_t fNElements
The sum of the elements of all the pages must match the corresponding fNElements field in fColumnRang...
Records the partition of data into pages for a particular column in a particular cluster.
bool operator==(const RPageRange &other) const
RPageRange(const RPageRange &other)=delete
RPageRange & operator=(const RPageRange &other)=delete
Wrap the 32bit integer in a struct in order to avoid template specialization clash with std::uint32_t...
Definition: RNTupleUtil.hxx:45
static void output(int code)
Definition: gifencode.c:226