Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.hxx
Go to the documentation of this file.
1/// \file ROOT/RNTupleImporter.hxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#ifndef ROOT7_RNTuplerImporter
17#define ROOT7_RNTuplerImporter
18
19#include <ROOT/REntry.hxx>
20#include <ROOT/RError.hxx>
21#include <ROOT/RField.hxx>
23#include <ROOT/RNTupleModel.hxx>
26#include <string_view>
27
28#include <TFile.h>
29#include <TTree.h>
30
31#include <cstdlib>
32#include <functional>
33#include <map>
34#include <memory>
35#include <vector>
36
37class TLeaf;
38
39namespace ROOT {
40namespace Experimental {
41
42// clang-format off
43/**
44\class ROOT::Experimental::RNTupleImporter
45\ingroup NTuple
46\brief Converts a TTree into an RNTuple
47
48Example usage (see the ntpl008_import.C tutorial for a full example):
49
50~~~ {.cpp}
51#include <ROOT/RNTupleImporter.hxx>
52using ROOT::Experimental::RNTupleImporter;
53
54auto importer = RNTupleImporter::Create("data.root", "TreeName", "output.root");
55// As required: importer->SetNTupleName(), importer->SetWriteOptions(), ...
56importer->Import();
57~~~
58
59The output file is created if it does not exist, otherwise the ntuple is added to the existing file.
60Note that input file and output file can be identical if the ntuple is stored under a different name than the tree
61(use `SetNTupleName()`).
62
63By default, the RNTuple is compressed with zstd, independent of the input compression. The compression settings
64(and other output parameters) can be changed by `SetWriteOptions()`. For example, to compress the imported RNTuple
65using lz4 (with compression level 4) instead:
66
67~~~ {.cpp}
68auto writeOptions = importer->GetWriteOptions();
69writeOptions.SetCompression(404);
70importer->SetWriteOptions(writeOptions);
71~~~
72
73Most RNTuple fields have a type identical to the corresponding TTree input branch. Exceptions are
74 - C string branches are translated to `std::string` fields
75 - C style arrays are translated to `std::array<...>` fields
76 - Leaf lists are translated to untyped records
77 - Leaf count arrays are translated to anonymous collections with generic names (`_collection0`, `_collection1`, etc.).
78 In order to keep field names and branch names aligned, RNTuple projects the members of these collections and
79 their collection counters to the input branch names. For instance, the following input leaves:
80~~~
81Int_t njets
82float jet_pt[njets]
83float jet_eta[njets]
84~~~
85 will be converted to the following RNTuple schema:
86~~~
87 _collection0 (untyped collection)
88 |- float jet_pt
89 |- float jet_eta
90 std::size_t (RNTupleCardinality) njets (projected from _collection0 without subfields)
91 ROOT::RVec<float> jet_pt (projected from _collection0.jet_pt)
92 ROOT::RVec<float> jet_eta (projected from _collection0.jet_eta)
93~~~
94 These projections are meta-data only operations and don't involve duplicating the data.
95
96Current limitations of the importer:
97 - No support for trees containing TClonesArray collections
98 - Due to RNTuple currently storing data fully split, "don't split" markers are ignored
99 - Some types are not available in RNTuple. Please refer to the
100 [RNTuple specification](https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md) for
101 an overview of all types currently supported.
102*/
103// clang-format on
105public:
106 /// Used to make adjustments to the fields of the output model.
107 using FieldModifier_t = std::function<void(RFieldBase &)>;
108
109 /// Used to report the status of the import every ~50MB (compressed) and once more at the end.
111 public:
112 virtual ~RProgressCallback() = default;
/// Function-call operator: forwards both arguments, unchanged, to the pure virtual Call().
113 void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)
114 {
115 Call(nbytesWritten, neventsWritten);
116 }
117 virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0;
118 virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0;
119 };
120
121private:
123 RImportBranch() = default;
124 RImportBranch(const RImportBranch &other) = delete;
125 RImportBranch(RImportBranch &&other) = default;
126 RImportBranch &operator=(const RImportBranch &other) = delete;
128 std::string fBranchName; ///< Top-level branch name from the input TTree
129 std::unique_ptr<unsigned char[]> fBranchBuffer; ///< The destination of SetBranchAddress() for `fBranchName`
130 };
131
133 RImportField() = default;
134 ~RImportField() = default;
135 RImportField(const RImportField &other) = delete;
136 RImportField(RImportField &&other) = default;
137 RImportField &operator=(const RImportField &other) = delete;
139
140 /// The field is kept during schema preparation and transferred to the fModel before the writing starts
141 RFieldBase *fField = nullptr;
142 std::unique_ptr<RFieldBase::RValue> fValue; ///< Set if a value is generated, only for transformed fields
143 void *fFieldBuffer = nullptr; ///< Usually points to the corresponding RImportBranch::fBranchBuffer but not always
144 bool fIsInUntypedCollection = false; ///< Sub-fields of untyped collections (leaf count arrays in the input)
145 bool fIsClass = false; ///< Field imported from a branch with streamer info (e.g., STL, user-defined class)
146 };
147
148 /// Base class to perform data transformations from TTree branches to RNTuple fields if necessary
150 std::size_t fImportBranchIdx = 0;
151 std::size_t fImportFieldIdx = 0;
152
/// Records the two given indices in fImportBranchIdx and fImportFieldIdx.
/// NOTE(review): presumably these are positions in fImportBranches / fImportFields -- confirm against the
/// implementation file.
153 RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx)
154 : fImportBranchIdx(branchIdx), fImportFieldIdx(fieldIdx)
155 {
156 }
157 virtual ~RImportTransformation() = default;
158 virtual RResult<void> Transform(const RImportBranch &branch, RImportField &field) = 0;
159 virtual void ResetEntry() = 0; // called at the end of an entry
160 };
161
162 /// When the schema is set up and the import started, it needs to be reset before the next Import() call
163 /// can start. This RAII guard ensures that ResetSchema is called.
166
167 explicit RImportGuard(RNTupleImporter &importer) : fImporter(importer) {}
168 RImportGuard(const RImportGuard &) = delete;
173 };
174
175 /// Leaf count arrays require special treatment. They are translated into RNTuple untyped collections.
176 /// This class does the bookkeeping of the sub-schema for these collections.
183 std::unique_ptr<RNTupleModel> fCollectionModel; ///< The model for the collection itself
184 std::shared_ptr<RNTupleCollectionWriter> fCollectionWriter; ///< Used to fill the collection elements per event
185 std::unique_ptr<REntry> fCollectionEntry; ///< Keeps the memory location of the collection members
186 /// The number of elements for the collection for a particular event. Used as a destination for SetBranchAddress()
187 /// of the count leaf
188 std::unique_ptr<Int_t> fCountVal;
189 std::vector<size_t> fImportFieldIndexes; ///< Points to the corresponding fields in fImportFields
190 /// One transformation for every field, to copy the content of the array one by one
191 std::vector<std::unique_ptr<RImportTransformation>> fTransformations;
192 Int_t fMaxLength = 0; ///< Stores count leaf GetMaximum() to create large enough buffers for the array leafs
193 std::string fFieldName; ///< name of the untyped collection, e.g. `_collection0`, `_collection1`, etc.
194 };
195
196 /// Transform a NULL terminated C string branch into an `std::string` field
198 RCStringTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {}
199 ~RCStringTransformation() override = default;
200 RResult<void> Transform(const RImportBranch &branch, RImportField &field) final;
201 void ResetEntry() final {}
202 };
203
204 /// When writing the elements of a leaf count array, moves the data from the input array one-by-one
205 /// to the memory locations of the fields of the corresponding untyped collection.
206 /// TODO(jblomer): write arrays as a whole to RNTuple
208 std::int64_t fNum = 0;
209 RLeafArrayTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {}
210 ~RLeafArrayTransformation() override = default;
211 RResult<void> Transform(const RImportBranch &branch, RImportField &field) final;
212 void ResetEntry() final { fNum = 0; } // sets fNum back to 0; ResetEntry() is called at the end of an entry
213 };
214
215 RNTupleImporter() = default;
216
217 std::unique_ptr<TFile> fSourceFile;
219
220 std::string fDestFileName;
221 std::string fNTupleName;
222 std::unique_ptr<TFile> fDestFile;
224
225 /// Whether or not dot characters in branch names should be converted to underscores. If this option is not set and a
226 /// branch with a '.' is encountered, the importer will throw an exception.
228
229 /// The maximum number of entries to import. When this value is -1 (default), import all entries.
230 std::int64_t fMaxEntries = -1;
231
232 /// If true, nothing is printed to standard output; if false, schema information and progress are printed.
233 bool fIsQuiet = false;
234 std::unique_ptr<RProgressCallback> fProgressCallback;
236
237 std::unique_ptr<RNTupleModel> fModel;
238 std::unique_ptr<REntry> fEntry;
239 std::vector<RImportBranch> fImportBranches;
240 std::vector<RImportField> fImportFields;
241 /// Maps the count leaf to the information about the corresponding untyped collection
242 std::map<std::string, RImportLeafCountCollection> fLeafCountCollections;
243 /// The list of transformations to be performed for every entry
244 std::vector<std::unique_ptr<RImportTransformation>> fImportTransformations;
245
246 ROOT::Experimental::RResult<void> InitDestination(std::string_view destFileName);
247
248 void ResetSchema();
249 /// Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory
250 /// buffers used for reading and writing.
252 void ReportSchema();
253
254public:
255 RNTupleImporter(const RNTupleImporter &other) = delete;
259 ~RNTupleImporter() = default;
260
261 /// Opens the input file for reading and the output file for writing (update).
262 static std::unique_ptr<RNTupleImporter>
263 Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName);
264
265 /// Directly uses the provided tree and opens the output file for writing (update).
266 static std::unique_ptr<RNTupleImporter> Create(TTree *sourceTree, std::string_view destFileName);
267
/// Sets fNTupleName to `name`.
270 void SetNTupleName(const std::string &name) { fNTupleName = name; }
/// Stores `maxEntries` in fMaxEntries.
/// NOTE(review): the parameter is std::uint64_t while fMaxEntries is a std::int64_t whose value -1 means
/// "import all entries". The assignment therefore converts implicitly, and a value above the std::int64_t
/// maximum would typically end up negative -- confirm whether that is intended. The `;` after the closing
/// brace is redundant.
271 void SetMaxEntries(std::uint64_t maxEntries) { fMaxEntries = maxEntries; };
272
273 /// Whereas branch names may contain dots, RNTuple field names may not. By setting this option, dot characters are
274 /// automatically converted into underscores to prevent the importer from throwing an exception.
276
277 /// Whether or not information and progress is printed to stdout.
278 void SetIsQuiet(bool value) { fIsQuiet = value; }
279
280 /// Add custom method to adjust column representations. Will be called for every field of the frozen model
281 /// before it is attached to the page sink
283
284 /// Import works in two steps:
285 /// 1. PrepareSchema() calls SetBranchAddress() on all the TTree branches and creates the corresponding RNTuple
286 /// fields and the model
287 /// 2. An event loop reads every entry from the TTree, applies transformations where necessary, and writes the
288 /// output entry to the RNTuple.
289 void Import();
290}; // class RNTupleImporter
291
292} // namespace Experimental
293} // namespace ROOT
294
295#endif
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t modifier
char name[80]
Definition TGX11.cxx:110
A field translates read and write calls from/to underlying columns to/from tree values.
Used to report the status of the import every ~50MB (compressed) and once more at the end.
virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)
virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
Converts a TTree into an RNTuple.
void SetWriteOptions(RNTupleWriteOptions options)
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
RNTupleImporter & operator=(const RNTupleImporter &other)=delete
std::vector< RImportBranch > fImportBranches
void SetNTupleName(const std::string &name)
RNTupleImporter(const RNTupleImporter &other)=delete
void SetConvertDotsInBranchNames(bool value)
Whereas branch names may contain dots, RNTuple field names may not.
RNTupleImporter & operator=(RNTupleImporter &&other)=delete
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RNTupleImporter(RNTupleImporter &&other)=delete
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
std::function< void(RFieldBase &)> FieldModifier_t
Used to make adjustments to the fields of the output model.
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void SetFieldModifier(FieldModifier_t modifier)
Add custom method to adjust column representations.
void Import()
Import works in two steps:
RNTupleWriteOptions GetWriteOptions() const
bool fIsQuiet
If true, nothing is printed to standard output; if false, schema information and progress are printed.
std::vector< RImportField > fImportFields
void SetIsQuiet(bool value)
Whether or not information and progress is printed to stdout.
void SetMaxEntries(std::uint64_t maxEntries)
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
Common user-tunable settings for storing ntuples.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
A TTree represents a columnar dataset.
Definition TTree.h:79
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Transform a NULL terminated C string branch into an std::string field.
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
RImportBranch(const RImportBranch &other)=delete
RImportBranch & operator=(RImportBranch &&other)=default
RImportBranch & operator=(const RImportBranch &other)=delete
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
RImportBranch(RImportBranch &&other)=default
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
bool fIsClass
Field imported from a branch with streamer info (e.g., STL, user-defined class)
std::unique_ptr< RFieldBase::RValue > fValue
Set if a value is generated, only for transformed fields.
RImportField(RImportField &&other)=default
bool fIsInUntypedCollection
Sub-fields of untyped collections (leaf count arrays in the input)
RImportField & operator=(const RImportField &other)=delete
RImportField & operator=(RImportField &&other)=default
RImportField(const RImportField &other)=delete
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
RImportGuard & operator=(const RImportGuard &)=delete
RImportGuard & operator=(RImportGuard &&)=delete
std::string fFieldName
name of the untyped collection, e.g. _collection0, _collection1, etc.
Int_t fMaxLength
Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
std::vector< size_t > fImportFieldIndexes
Points to the corresponding fields in fImportFields.
std::unique_ptr< RNTupleModel > fCollectionModel
The model for the collection itself.
RImportLeafCountCollection & operator=(const RImportLeafCountCollection &other)=delete
RImportLeafCountCollection(RImportLeafCountCollection &&other)=default
std::vector< std::unique_ptr< RImportTransformation > > fTransformations
One transformation for every field, to copy the content of the array one by one.
RImportLeafCountCollection(const RImportLeafCountCollection &other)=delete
std::shared_ptr< RNTupleCollectionWriter > fCollectionWriter
Used to fill the collection elements per event.
RImportLeafCountCollection & operator=(RImportLeafCountCollection &&other)=default
std::unique_ptr< Int_t > fCountVal
The number of elements for the collection for a particular event.
std::unique_ptr< REntry > fCollectionEntry
Keeps the memory location of the collection members.
Base class to perform data transformations from TTree branches to RNTuple fields if necessary.
virtual RResult< void > Transform(const RImportBranch &branch, RImportField &field)=0
RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx)
When writing the elements of a leaf count array, moves the data from the input array one-by-one to th...
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final