Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RPageStorage.hxx
Go to the documentation of this file.
1/// \file ROOT/RPageStorage.hxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2018-07-19
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#ifndef ROOT7_RPageStorage
17#define ROOT7_RPageStorage
18
21#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPage.hxx>
24#include <ROOT/RStringView.hxx>
25
26#include <atomic>
27#include <cstddef>
28#include <functional>
29#include <memory>
30#include <unordered_set>
31
32namespace ROOT {
33namespace Experimental {
34
35class RNTupleModel;
36// TODO(jblomer): factory methods to create tree sinks and sources outside Detail namespace
37
38namespace Detail {
39
40class RCluster;
41class RColumn;
42class RPagePool;
43class RFieldBase;
44class RNTupleMetrics;
45
46enum class EPageStorageType {
47 kSink, ///< Reported by page sinks, i.e. storage used for writing an ntuple (see RPageSink)
48 kSource, ///< Reported by page sources, i.e. storage used for reading an ntuple (see RPageSource)
49};
50
51// clang-format off
52/**
53\class ROOT::Experimental::Detail::RPageStorage
54\ingroup NTuple
55\brief Common functionality of an ntuple storage for both reading and writing
56
57The RPageStorage provides access to a storage container that keeps the bits of pages and clusters comprising
58an ntuple. Concrete implementations can use a TFile, a raw file, an object store, and so on.
59*/
60// clang-format on
62public:
63 /// The interface of a task scheduler to schedule page (de)compression tasks
65 public:
66 virtual ~RTaskScheduler() = default;
67 /// Start a new set of tasks
68 virtual void Reset() = 0;
69 /// Take a callable that represents a task
70 virtual void AddTask(const std::function<void(void)> &taskFunc) = 0;
71 /// Blocks until all scheduled tasks finished
72 virtual void Wait() = 0;
73 };
74
75protected:
76 std::string fNTupleName;
78
79public:
80 explicit RPageStorage(std::string_view name);
81 RPageStorage(const RPageStorage &other) = delete;
82 RPageStorage& operator =(const RPageStorage &other) = delete;
83 RPageStorage(RPageStorage &&other) = default;
85 virtual ~RPageStorage();
86
87 /// Whether the concrete implementation is a sink or a source
89
92 const RColumn *fColumn = nullptr;
93
94 /// Returns true for a valid column handle; fColumn and fId should always either both
95 /// be valid or both be invalid.
96 operator bool() const { return fId != kInvalidDescriptorId && fColumn; }
97 };
98 /// The column handle identifies a column with the current open page storage
100
101 /// Register a new column. When reading, the column must exist in the ntuple on disk corresponding to the meta-data.
102 /// When writing, every column can only be attached once.
103 virtual ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column) = 0;
104 /// Unregisters a column. A page source decreases the reference counter for the corresponding active column.
105 /// For a page sink, dropping columns is currently a no-op.
106 virtual void DropColumn(ColumnHandle_t columnHandle) = 0;
107
108 /// Every page store needs to be able to free pages it handed out. But sinks and sources have different means
109 /// of allocating pages.
110 virtual void ReleasePage(RPage &page) = 0;
111
112 /// Returns an empty metrics. Page storage implementations usually have their own metrics.
113 virtual RNTupleMetrics &GetMetrics();
114
115 void SetTaskScheduler(RTaskScheduler *taskScheduler) { fTaskScheduler = taskScheduler; }
116};
117
118// clang-format off
119/**
120\class ROOT::Experimental::Detail::RPageSink
121\ingroup NTuple
122\brief Abstract interface to write data into an ntuple
123
124The page sink takes the list of columns and afterwards a series of page commits and cluster commits.
125The user is responsible for committing clusters at a consistent point, i.e. when all pages corresponding to data
126up to the given entry number are committed.
127*/
128// clang-format on
129class RPageSink : public RPageStorage {
130protected:
132
133 /// Building the ntuple descriptor while writing is done in the same way for all the storage sink implementations.
134 /// Field, column, cluster ids and page indexes per cluster are issued sequentially starting with 0.
139 /// Keeps track of the number of elements in the currently open cluster. Indexed by column id.
140 std::vector<RClusterDescriptor::RColumnRange> fOpenColumnRanges;
141 /// Keeps track of the written pages in the currently open cluster. Indexed by column id.
142 std::vector<RClusterDescriptor::RPageRange> fOpenPageRanges;
144
145 virtual void CreateImpl(const RNTupleModel &model) = 0; // storage-specific part of Create(); called by Create() after the descriptor update
146 virtual RClusterDescriptor::RLocator CommitPageImpl(ColumnHandle_t columnHandle, const RPage &page) = 0; // presumably the storage-specific part of CommitPage() -- confirm in the implementation
148 virtual void CommitDatasetImpl() = 0; // presumably the storage-specific part of CommitDataset() -- confirm in the implementation
149
150public:
151 RPageSink(std::string_view ntupleName, const RNTupleWriteOptions &options);
152
153 RPageSink(const RPageSink&) = delete; // page sinks cannot be copied ...
154 RPageSink& operator=(const RPageSink&) = delete;
155 RPageSink(RPageSink&&) = default; // ... but they can be moved
157 virtual ~RPageSink();
158
159 /// Guess the concrete derived page sink from the file name (location).
160 static std::unique_ptr<RPageSink> Create(std::string_view ntupleName, std::string_view location,
161 const RNTupleWriteOptions &options = RNTupleWriteOptions());
163
164 ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column) final;
165 void DropColumn(ColumnHandle_t /*columnHandle*/) final {} // intentionally empty: dropping a column is a no-op for a sink
166
167 /// Physically creates the storage container to hold the ntuple (e.g., keys in a TFile or an S3 bucket).
168 /// To do so, Create() calls CreateImpl() after updating the descriptor.
169 /// Create() associates column handles to the columns referenced by the model.
170 void Create(RNTupleModel &model);
171 /// Write a page to the storage. The column must have been added before.
172 void CommitPage(ColumnHandle_t columnHandle, const RPage &page);
173 /// Finalize the current cluster and create a new one for the following data.
174 void CommitCluster(NTupleSize_t nEntries);
175 /// Finalize the current cluster and the entire data set.
177
178 /// Get a new, empty page for the given column that can be filled with up to nElements. If nElements is zero,
179 /// the page sink picks an appropriate size.
180 virtual RPage ReservePage(ColumnHandle_t columnHandle, std::size_t nElements = 0) = 0;
181};
182
183// clang-format off
184/**
185\class ROOT::Experimental::Detail::RPageSource
186\ingroup NTuple
187\brief Abstract interface to read data from an ntuple
188
189The page source is initialized with the columns of interest. Pages from those columns can then be
190mapped into memory. The page source also gives access to the ntuple's meta-data.
191*/
192// clang-format on
193class RPageSource : public RPageStorage {
194public:
195 /// Derived from the model (fields) that are actually being requested at a given point in time.
196 using ColumnSet_t = std::unordered_set<DescriptorId_t>;
197
198protected:
201 /// The active columns are implicitly defined by the model fields or views.
203
205 // Only called if a task scheduler is set. No-op by default.
206 virtual void UnzipClusterImpl(RCluster * /* cluster */)
207 { }
208
209public:
210 RPageSource(std::string_view ntupleName, const RNTupleReadOptions &fOptions);
211 RPageSource(const RPageSource&) = delete; // page sources cannot be copied; use Clone() to open the same storage again
215 virtual ~RPageSource();
216 /// Guess the concrete derived page source from the file name (location).
217 static std::unique_ptr<RPageSource> Create(std::string_view ntupleName, std::string_view location,
218 const RNTupleReadOptions &options = RNTupleReadOptions());
219 /// Open the same storage multiple times, e.g. for reading in multiple threads.
220 virtual std::unique_ptr<RPageSource> Clone() const = 0;
221
223 const RNTupleDescriptor &GetDescriptor() const { return fDescriptor; } // read-only access to the fDescriptor member
224 ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column) final;
225 void DropColumn(ColumnHandle_t columnHandle) final; // defined out of line, unlike the empty RPageSink::DropColumn()
226
227 /// Open the physical storage container for the tree.
232
233 /// Allocates and fills a page that contains the index-th element.
234 virtual RPage PopulatePage(ColumnHandle_t columnHandle, NTupleSize_t globalIndex) = 0;
235 /// Another version of PopulatePage that allows specifying cluster-relative indexes.
236 virtual RPage PopulatePage(ColumnHandle_t columnHandle, const RClusterIndex &clusterIndex) = 0;
237
238 /// Populates all the pages of the given cluster id and columns; it is possible that some columns do not
239 /// contain any pages. The page source may load more columns than the minimal necessary set from `columns`.
240 /// To indicate which columns have been loaded, LoadCluster() must mark them with SetColumnAvailable().
241 /// That includes the ones from the `columns` that don't have pages; otherwise subsequent requests
242 /// for the cluster would assume an incomplete cluster and trigger loading again.
243 /// LoadCluster() is typically called from the I/O thread of a cluster pool, i.e. the method runs
244 /// concurrently to other methods of the page source.
245 virtual std::unique_ptr<RCluster> LoadCluster(DescriptorId_t clusterId, const ColumnSet_t &columns) = 0;
246
247 /// Parallel decompression and unpacking of the pages in the given cluster. The unzipped pages are supposed
248 /// to be preloaded in a page pool attached to the source. The method is triggered by the cluster pool's
249 /// unzip thread. It is an optional optimization; the method can safely do nothing. In particular, the
250 /// actual implementation will only run if a task scheduler is set. In practice, a task scheduler is set
251 /// if implicit multi-threading is turned on.
252 void UnzipCluster(RCluster *cluster);
253};
254
255} // namespace Detail
256
257} // namespace Experimental
258} // namespace ROOT
259
260#endif
char name[80]
Definition TGX11.cxx:110
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:154
A collection of Counter objects with a name, a unit, and a description.
Abstract interface to write data into an ntuple.
virtual RClusterDescriptor::RLocator CommitPageImpl(ColumnHandle_t columnHandle, const RPage &page)=0
void CommitDataset()
Finalize the current cluster and the entire data set.
RPageSink(const RPageSink &)=delete
std::vector< RClusterDescriptor::RPageRange > fOpenPageRanges
Keeps track of the written pages in the currently open cluster. Indexed by column id.
RPageSink & operator=(RPageSink &&)=default
void CommitPage(ColumnHandle_t columnHandle, const RPage &page)
Write a page to the storage. The column must have been added before.
virtual RPage ReservePage(ColumnHandle_t columnHandle, std::size_t nElements=0)=0
Get a new, empty page for the given column that can be filled with up to nElements.
RNTupleDescriptorBuilder fDescriptorBuilder
RPageSink & operator=(const RPageSink &)=delete
void DropColumn(ColumnHandle_t) final
Unregisters a column.
void CommitCluster(NTupleSize_t nEntries)
Finalize the current cluster and create a new one for the following data.
static std::unique_ptr< RPageSink > Create(std::string_view ntupleName, std::string_view location, const RNTupleWriteOptions &options=RNTupleWriteOptions())
Guess the concrete derived page sink from the file name (location)
DescriptorId_t fLastFieldId
Building the ntuple descriptor while writing is done in the same way for all the storage sink impleme...
virtual void CreateImpl(const RNTupleModel &model)=0
virtual RClusterDescriptor::RLocator CommitClusterImpl(NTupleSize_t nEntries)=0
EPageStorageType GetType() final
Whether the concrete implementation is a sink or a source.
std::vector< RClusterDescriptor::RColumnRange > fOpenColumnRanges
Keeps track of the number of elements in the currently open cluster. Indexed by column id.
ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column) final
Register a new column.
Abstract interface to read data from an ntuple.
virtual std::unique_ptr< RPageSource > Clone() const =0
Open the same storage multiple times, e.g. for reading in multiple threads.
void Attach()
Open the physical storage container for the tree.
virtual std::unique_ptr< RCluster > LoadCluster(DescriptorId_t clusterId, const ColumnSet_t &columns)=0
Populates all the pages of the given cluster id and columns; it is possible that some columns do not ...
virtual RPage PopulatePage(ColumnHandle_t columnHandle, NTupleSize_t globalIndex)=0
Allocates and fills a page that contains the index-th element.
void DropColumn(ColumnHandle_t columnHandle) final
Unregisters a column.
virtual RPage PopulatePage(ColumnHandle_t columnHandle, const RClusterIndex &clusterIndex)=0
Another version of PopulatePage that allows specifying cluster-relative indexes.
ColumnSet_t fActiveColumns
The active columns are implicitly defined by the model fields or views.
RPageSource & operator=(RPageSource &&)=default
std::unordered_set< DescriptorId_t > ColumnSet_t
Derived from the model (fields) that are actually being requested at a given point in time.
ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column) final
Register a new column.
NTupleSize_t GetNElements(ColumnHandle_t columnHandle)
RPageSource(const RPageSource &)=delete
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const RNTupleReadOptions &options=RNTupleReadOptions())
Guess the concrete derived page source from the file name (location)
virtual RNTupleDescriptor AttachImpl()=0
RPageSource(RPageSource &&)=default
virtual void UnzipClusterImpl(RCluster *)
EPageStorageType GetType() final
Whether the concrete implementation is a sink or a source.
const RNTupleDescriptor & GetDescriptor() const
RPageSource & operator=(const RPageSource &)=delete
void UnzipCluster(RCluster *cluster)
Parallel decompression and unpacking of the pages in the given cluster.
ColumnId_t GetColumnId(ColumnHandle_t columnHandle)
The interface of a task scheduler to schedule page (de)compression tasks.
virtual void Reset()=0
Start a new set of tasks.
virtual void Wait()=0
Blocks until all scheduled tasks finished.
virtual void AddTask(const std::function< void(void)> &taskFunc)=0
Take a callable that represents a task.
Common functionality of an ntuple storage for both reading and writing.
RPageStorage(const RPageStorage &other)=delete
RPageStorage(RPageStorage &&other)=default
RColumnHandle ColumnHandle_t
The column handle identifies a column with the current open page storage.
virtual EPageStorageType GetType()=0
Whether the concrete implementation is a sink or a source.
void SetTaskScheduler(RTaskScheduler *taskScheduler)
virtual ColumnHandle_t AddColumn(DescriptorId_t fieldId, const RColumn &column)=0
Register a new column.
virtual void DropColumn(ColumnHandle_t columnHandle)=0
Unregisters a column.
virtual void ReleasePage(RPage &page)=0
Every page store needs to be able to free pages it handed out.
virtual RNTupleMetrics & GetMetrics()
Returns an empty metrics. Page storage implementations usually have their own metrics.
RPageStorage & operator=(const RPageStorage &other)=delete
A page is a slice of a column that is mapped into memory.
Definition RPage.hxx:41
Addresses a column element or field item relative to a particular cluster, instead of a global NTuple...
A column is a storage-backed array of a simple, fixed-size type, from which pages can be mapped into ...
A field translates read and write calls from/to underlying columns to/from tree values.
A helper class for piece-wise construction of an RNTupleDescriptor.
The on-storage meta-data of an ntuple.
The RNTupleModel encapsulates the schema of an ntuple.
Common user-tunable settings for reading ntuples.
Common user-tunable settings for storing ntuples.
std::uint64_t NTupleSize_t
Integer type long enough to hold the maximum number of entries in a column.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
std::int64_t ColumnId_t
Uniquely identifies a physical column within the scope of the current process, used to tag pages.
constexpr DescriptorId_t kInvalidDescriptorId
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Generic information about the physical location of data.