Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.hxx
Go to the documentation of this file.
1/// \file ROOT/RNTupleImporter.hxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#ifndef ROOT7_RNTuplerImporter
17#define ROOT7_RNTuplerImporter
18
19#include <ROOT/REntry.hxx>
20#include <ROOT/RError.hxx>
21#include <ROOT/RField.hxx>
22#include <ROOT/RNTupleModel.hxx>
25#include <string_view>
26
27#include <TFile.h>
28#include <TTree.h>
29
30#include <cstdlib>
31#include <functional>
32#include <map>
33#include <memory>
34#include <vector>
35
36class TLeaf;
37
38namespace ROOT {
39namespace Experimental {
40
41// clang-format off
42/**
43\class ROOT::Experimental::RNTupleImporter
44\ingroup NTuple
45\brief Converts a TTree into an RNTuple
46
47Example usage (see the ntpl008_import.C tutorial for a full example):
48
49~~~ {.cpp}
50#include <ROOT/RNTupleImporter.hxx>
51using ROOT::Experimental::RNTupleImporter;
52
53auto importer = RNTupleImporter::Create("data.root", "TreeName", "output.root");
54// As required: importer->SetNTupleName(), importer->SetWriteOptions(), ...
55importer->Import();
56~~~
57
58The output file is created if it does not exist, otherwise the ntuple is added to the existing file.
59Note that input file and output file can be identical if the ntuple is stored under a different name than the tree
60(use `SetNTupleName()`).
61
62By default, the RNTuple is compressed with zstd, independent of the input compression. The compression settings
63(and other output parameters) can be changed by `SetWriteOptions()`. For example, to compress the imported RNTuple
64using lz4 (with compression level 4) instead:
65
66~~~ {.cpp}
67auto writeOptions = importer->GetWriteOptions();
68writeOptions.SetCompression(404);
69importer->SetWriteOptions(writeOptions);
70~~~
71
72Most RNTuple fields have a type identical to the corresponding TTree input branch. Exceptions are
73 - C string branches are translated to `std::string` fields
74 - C style arrays are translated to `std::array<...>` fields
75 - Leaf lists are translated to untyped records
76 - Leaf count arrays are translated to anonymous collections with generic names (`_collection0`, `_collection1`, etc.).
77 In order to keep field names and branch names aligned, RNTuple projects the members of these collections and
78 its collection counter to the input branch names. For instance, the following input leafs:
79~~~
80Int_t njets
81float jet_pt[njets]
82float jet_eta[njets]
83~~~
84 will be converted to the following RNTuple schema:
85~~~
86 _collection0 (untyped collection)
87 |- float jet_pt
88 |- float jet_eta
89 std::size_t (RNTupleCardinality) njets (projected from _collection0 without subfields)
90 ROOT::RVec<float> jet_pt (projected from _collection0.jet_pt)
91 ROOT::RVec<float> jet_eta (projected from _collection0.jet_eta)
92~~~
93 These projections are meta-data only operations and don't involve duplicating the data.
94
95Current limitations of the importer:
96 - No support for trees containing TClonesArray collections
97 - Due to RNTuple currently storing data fully split, "don't split" markers are ignored
98 - Some types are not available in RNTuple. Please refer to the
99 [RNTuple specification](https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md) for
100 an overview of all types currently supported.
101*/
102// clang-format on
104public:
105 /// Used to make adjustments to the fields of the output model.
106 using FieldModifier_t = std::function<void(RFieldBase &)>;
107
108 /// Used to report every ~100 MB (compressed), and at the end about the status of the import.
110 public:
111 virtual ~RProgressCallback() = default;
/// Function-call operator: forwards both arguments unchanged to Call().
/// \param nbytesWritten passed through to Call() as its first argument
/// \param neventsWritten passed through to Call() as its second argument
112 void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)
113 {
114 Call(nbytesWritten, neventsWritten);
115 }
116 virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0;
117 virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0;
118 };
119
120private:
122 RImportBranch() = default;
123 RImportBranch(const RImportBranch &other) = delete;
124 RImportBranch(RImportBranch &&other) = default;
125 RImportBranch &operator=(const RImportBranch &other) = delete;
127 std::string fBranchName; ///< Top-level branch name from the input TTree
128 std::unique_ptr<unsigned char[]> fBranchBuffer; ///< The destination of SetBranchAddress() for `fBranchName`
129 };
130
132 RImportField() = default;
133 ~RImportField() = default;
134 RImportField(const RImportField &other) = delete;
135 RImportField(RImportField &&other) = default;
136 RImportField &operator=(const RImportField &other) = delete;
138
139 /// The field is kept during schema preparation and transferred to the fModel before the writing starts
140 RFieldBase *fField = nullptr;
141 std::unique_ptr<RFieldBase::RValue> fValue; ///< Set if a value is generated, only for transformed fields
142 void *fFieldBuffer = nullptr; ///< Usually points to the corresponding RImportBranch::fBranchBuffer but not always
143 };
144
145 /// Base class to perform data transformations from TTree branches to RNTuple fields if necessary
147 std::size_t fImportBranchIdx = 0;
148 std::size_t fImportFieldIdx = 0;
149
/// Stores the two indexes in fImportBranchIdx and fImportFieldIdx; the constructor body is empty.
/// NOTE(review): presumably these index into the importer's fImportBranches and fImportFields vectors -- confirm
/// against the implementation.
150 RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx)
151 : fImportBranchIdx(branchIdx), fImportFieldIdx(fieldIdx)
152 {
153 }
154 virtual ~RImportTransformation() = default;
155 virtual RResult<void> Transform(const RImportBranch &branch, RImportField &field) = 0;
156 };
157
158 /// When the schema is set up and the import started, it needs to be reset before the next Import() call
159 /// can start. This RAII guard ensures that ResetSchema is called.
162
/// Initializes fImporter from the given importer. Marked explicit, so an RNTupleImporter is never implicitly
/// converted into a guard.
163 explicit RImportGuard(RNTupleImporter &importer) : fImporter(importer) {}
164 RImportGuard(const RImportGuard &) = delete;
169 };
170
171 /// Leaf count arrays require special treatment. They are translated into untyped collections of untyped records.
172 /// This class does the bookkeeping of the sub-schema for these collections.
179 std::string fFieldName; ///< name of the untyped collection, e.g. `_collection0`, `_collection1`, etc.
180 /// Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
181 /// Uses Int_t because that is the return type of TLeaf::GetMaximum().
183 /// The number of elements for the collection for a particular event. Used as a destination for SetBranchAddress()
184 /// of the count leaf
185 std::unique_ptr<Int_t> fCountVal;
186 /// The leafs of the array as we encounter them traversing the TTree schema.
187 /// Eventually, the fields are moved as leaves to an untyped collection of untyped records that in turn
188 /// is attached to the RNTuple model.
189 std::vector<std::unique_ptr<RFieldBase>> fLeafFields;
190 std::vector<size_t> fLeafBranchIndexes; ///< Points to the corresponding leaf branches in fImportBranches
191 RRecordField *fRecordField = nullptr; ///< Points to the item field of the untyped collection field in the model.
192 std::vector<unsigned char> fFieldBuffer; ///< The collection field memory representation. Bound to the entry.
193 };
194
195 /// Transform a NULL terminated C string branch into an `std::string` field
/// Forwards `b` and `f` to the RImportTransformation base constructor, which takes them as the branch index and
/// the field index, in that order.
197 RCStringTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {}
198 ~RCStringTransformation() override = default;
199 RResult<void> Transform(const RImportBranch &branch, RImportField &field) final;
200 };
201
202 RNTupleImporter() = default;
203
204 std::unique_ptr<TFile> fSourceFile;
206
207 std::string fDestFileName;
208 std::string fNTupleName;
209 std::unique_ptr<TFile> fDestFile;
211
212 /// Whether or not dot characters in branch names should be converted to underscores. If this option is not set and a
213 /// branch with a '.' is encountered, the importer will throw an exception.
215
216 /// The maximum number of entries to import. When this value is -1 (default), import all entries.
217 std::int64_t fMaxEntries = -1;
218
219 /// If true, nothing is printed to standard output; if false, schema information and progress are printed.
220 bool fIsQuiet = false;
221 std::unique_ptr<RProgressCallback> fProgressCallback;
223
224 std::unique_ptr<RNTupleModel> fModel;
225 std::unique_ptr<REntry> fEntry;
226 std::vector<RImportBranch> fImportBranches;
227 std::vector<RImportField> fImportFields;
228 /// Maps the count leaf to the information about the corresponding untyped collection
229 std::map<std::string, RImportLeafCountCollection> fLeafCountCollections;
230 /// The list of transformations to be performed for every entry
231 std::vector<std::unique_ptr<RImportTransformation>> fImportTransformations;
232
233 ROOT::Experimental::RResult<void> InitDestination(std::string_view destFileName);
234
235 void ResetSchema();
236 /// Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory
237 /// buffers used for reading and writing.
239 void ReportSchema();
240
241public:
242 RNTupleImporter(const RNTupleImporter &other) = delete;
246 ~RNTupleImporter() = default;
247
248 /// Opens the input file for reading and the output file for writing (update).
249 static std::unique_ptr<RNTupleImporter>
250 Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName);
251
252 /// Directly uses the provided tree and opens the output file for writing (update).
253 static std::unique_ptr<RNTupleImporter> Create(TTree *sourceTree, std::string_view destFileName);
254
/// Sets fNTupleName to `name`. As the class description notes, choosing an ntuple name different from the tree
/// name allows the input file and the output file to be identical.
257 void SetNTupleName(const std::string &name) { fNTupleName = name; }
258 void SetMaxEntries(std::uint64_t maxEntries) { fMaxEntries = maxEntries; };
259
260 /// Whereas branch names may contain dots, RNTuple field names may not. By setting this option, dot characters are
261 /// automatically converted into underscores to prevent the importer from throwing an exception.
263
264 /// Sets fIsQuiet. Pass true to suppress output to stdout; pass false to have information and progress printed.
265 void SetIsQuiet(bool value) { fIsQuiet = value; }
266
267 /// Add custom method to adjust column representations. Will be called for every field of the frozen model
268 /// before it is attached to the page sink
270
271 /// Import works in two steps:
272 /// 1. PrepareSchema() calls SetBranchAddress() on all the TTree branches and creates the corresponding RNTuple
273 /// fields and the model
274 /// 2. An event loop reads every entry from the TTree, applies transformations where necessary, and writes the
275 /// output entry to the RNTuple.
276 void Import();
277}; // class RNTupleImporter
278
279} // namespace Experimental
280} // namespace ROOT
281
282#endif
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t modifier
char name[80]
Definition TGX11.cxx:110
A field translates read and write calls from/to underlying columns to/from tree values.
Used to report every ~100 MB (compressed), and at the end about the status of the import.
virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)
virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
Converts a TTree into an RNTuple.
void SetWriteOptions(RNTupleWriteOptions options)
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
RNTupleImporter & operator=(const RNTupleImporter &other)=delete
std::vector< RImportBranch > fImportBranches
void SetNTupleName(const std::string &name)
RNTupleImporter(const RNTupleImporter &other)=delete
void SetConvertDotsInBranchNames(bool value)
Whereas branch names may contain dots, RNTuple field names may not.
RNTupleImporter & operator=(RNTupleImporter &&other)=delete
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RNTupleImporter(RNTupleImporter &&other)=delete
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
std::function< void(RFieldBase &)> FieldModifier_t
Used to make adjustments to the fields of the output model.
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void SetFieldModifier(FieldModifier_t modifier)
Add custom method to adjust column representations.
void Import()
Import works in two steps:
RNTupleWriteOptions GetWriteOptions() const
bool fIsQuiet
If true, nothing is printed to standard output; if false, schema information and progress are printed.
std::vector< RImportField > fImportFields
void SetIsQuiet(bool value)
Whether or not information and progress is printed to stdout.
void SetMaxEntries(std::uint64_t maxEntries)
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
Common user-tunable settings for storing ntuples.
The field for an untyped record.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
A TTree represents a columnar dataset.
Definition TTree.h:79
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Transform a NULL terminated C string branch into an std::string field.
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
RImportBranch(const RImportBranch &other)=delete
RImportBranch & operator=(RImportBranch &&other)=default
RImportBranch & operator=(const RImportBranch &other)=delete
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
RImportBranch(RImportBranch &&other)=default
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
std::unique_ptr< RFieldBase::RValue > fValue
Set if a value is generated, only for transformed fields.
RImportField(RImportField &&other)=default
RImportField & operator=(const RImportField &other)=delete
RImportField & operator=(RImportField &&other)=default
RImportField(const RImportField &other)=delete
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
RImportGuard & operator=(const RImportGuard &)=delete
RImportGuard & operator=(RImportGuard &&)=delete
std::string fFieldName
name of the untyped collection, e.g.
Int_t fMaxLength
Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
std::vector< std::unique_ptr< RFieldBase > > fLeafFields
The leafs of the array as we encounter them traversing the TTree schema.
std::vector< unsigned char > fFieldBuffer
The collection field memory representation. Bound to the entry.
RImportLeafCountCollection & operator=(const RImportLeafCountCollection &other)=delete
RImportLeafCountCollection(RImportLeafCountCollection &&other)=default
RRecordField * fRecordField
Points to the item field of the untyped collection field in the model.
RImportLeafCountCollection(const RImportLeafCountCollection &other)=delete
RImportLeafCountCollection & operator=(RImportLeafCountCollection &&other)=default
std::vector< size_t > fLeafBranchIndexes
Points to the corresponding leaf branches in fImportBranches.
std::unique_ptr< Int_t > fCountVal
The number of elements for the collection for a particular event.
Base class to perform data transformations from TTree branches to RNTuple fields if necessary.
virtual RResult< void > Transform(const RImportBranch &branch, RImportField &field)=0
RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx)