Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
17#include <ROOT/RField.hxx>
18#include <ROOT/RNTuple.hxx>
21#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPageStorage.hxx>
24#include <ROOT/RStringView.hxx>
25
26#include <TBranch.h>
27#include <TChain.h>
28#include <TClass.h>
29#include <TDataType.h>
30#include <TLeaf.h>
31#include <TLeafC.h>
32#include <TLeafElement.h>
33#include <TLeafObject.h>
34
35#include <cassert>
36#include <cstdint>
37#include <cstring>
38#include <iostream>
39#include <utility>
40
41namespace {
42
43class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
44private:
45 static constexpr std::uint64_t gUpdateFrequencyBytes = 50 * 1000 * 1000; // report every 50MB
46 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
47
48public:
49 virtual ~RDefaultProgressCallback() {}
50 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
51 {
52 // Report if more than 50MB (compressed) where written since the last status update
53 if (nbytesWritten < fNbytesNext)
54 return;
55 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
56 fNbytesNext += gUpdateFrequencyBytes;
57 }
58
59 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
60 {
61 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
62 }
63};
64
65} // anonymous namespace
66
69{
70 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
72}
73
76 RImportField &field)
77{
78 auto valueSize = field.fField->GetValueSize();
79 memcpy(field.fFieldBuffer, branch.fBranchBuffer.get() + (fNum * valueSize), valueSize);
80 fNum++;
82}
83
84std::unique_ptr<ROOT::Experimental::RNTupleImporter>
85ROOT::Experimental::RNTupleImporter::Create(std::string_view sourceFileName, std::string_view treeName,
86 std::string_view destFileName)
87{
88 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
89 importer->fNTupleName = treeName;
90 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
91 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
92 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
93 }
94
95 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
96 if (!importer->fSourceTree) {
97 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
98 }
99
100 // If we have IMT enabled, its best use is for parallel page compression
101 importer->fSourceTree->SetImplicitMT(false);
102 auto result = importer->InitDestination(destFileName);
103
104 if (!result)
106
107 return importer;
108}
109
110std::unique_ptr<ROOT::Experimental::RNTupleImporter>
111ROOT::Experimental::RNTupleImporter::Create(TTree *sourceTree, std::string_view destFileName)
112{
113 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
114
115 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
116 if (sourceTree->LoadTree(0) != 0)
117 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
118 importer->fNTupleName = sourceTree->GetTree()->GetName();
119 } else {
120 importer->fNTupleName = sourceTree->GetName();
121 }
122
123 importer->fSourceTree = sourceTree;
124
125 // If we have IMT enabled, its best use is for parallel page compression
126 importer->fSourceTree->SetImplicitMT(false);
127 auto result = importer->InitDestination(destFileName);
128
129 if (!result)
131
132 return importer;
133}
134
136{
137 fDestFileName = destFileName;
139 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
140 if (!fDestFile || fDestFile->IsZombie()) {
141 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
142 }
143
144 return RResult<void>::Success();
145}
146
148{
149 for (const auto &f : fImportFields) {
150 std::cout << "Importing '" << f.fField->GetName() << "' [" << f.fField->GetType() << ']' << std::endl;
151 }
152}
153
155{
156 fImportBranches.clear();
157 fImportFields.clear();
158 fLeafCountCollections.clear();
161 fEntry = nullptr;
162}
163
165{
166 ResetSchema();
167
168 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
169 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
170 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
172 assert(b);
173 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
174 assert(firstLeaf);
175
176 const bool isLeafList = b->GetNleaves() > 1;
177 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
178 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
179 if (isLeafList && isClass)
180 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
181 if (isLeafList && isCountLeaf)
182 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
183
184 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
185 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
186 // char arrays and C strings.
187 Int_t firstLeafCountval;
188 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
189 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
190
191 if (isCountLeaf) {
192 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
193 // leaf count comes before any array leaves that use it.
194 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
195 // collection, together the collection mode.
198 c.fMaxLength = firstLeaf->GetMaximum();
199 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
200 // Casting to void * makes it work for both Int_t and UInt_t
201 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
202 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
203 continue;
204 }
205
206 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
207 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
208 std::vector<std::unique_ptr<Detail::RFieldBase>> recordItems;
209 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
210 if (l->IsA() == TLeafObject::Class()) {
211 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
212 }
213
214 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
215 // There are files in the public where the title is broken (empty).
216 Int_t countval = l->GetLenStatic();
217 auto *countleaf = l->GetLeafCount();
218 const bool isLeafCountArray = (countleaf != nullptr);
219 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
220
221 // The base case for branches with fundamental, single numerical types.
222 // For other types of branches, different field names or types are necessary,
223 // which is determined below.
224 std::string fieldName = b->GetName();
225 std::string fieldType = l->GetTypeName();
226
227 if (isLeafList)
228 fieldName = l->GetName();
229
230 if (isCString)
231 fieldType = "std::string";
232
233 if (isClass)
234 fieldType = b->GetClassName();
235
236 if (isFixedSizeArray)
237 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
238
240 // Replace any occurrence of a dot ('.') with an underscore.
241 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
242 }
243
245 f.fIsClass = isClass;
246 auto fieldOrError = Detail::RFieldBase::Create(fieldName, fieldType);
247 if (!fieldOrError)
248 return R__FORWARD_ERROR(fieldOrError);
249 auto field = fieldOrError.Unwrap();
250 if (isCString) {
251 branchBufferSize = l->GetMaximum();
252 f.fValue = std::make_unique<Detail::RFieldBase::RValue>(field->GenerateValue());
253 f.fFieldBuffer = f.fValue->GetRawPtr();
254 fImportTransformations.emplace_back(
255 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
256 } else {
257 if (isClass) {
258 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
259 // calling SetBranchAddress()
260 branchBufferSize = sizeof(void *) * countval;
261 } else if (isLeafCountArray) {
262 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
263 } else {
264 branchBufferSize = l->GetOffset() + field->GetValueSize();
265 }
266 }
267 f.fField = field.get();
268
269 if (isLeafList) {
270 recordItems.emplace_back(std::move(field));
271 } else if (isLeafCountArray) {
272 f.fValue = std::make_unique<Detail::RFieldBase::RValue>(field->GenerateValue());
273 f.fFieldBuffer = f.fValue->GetRawPtr();
274 f.fIsInUntypedCollection = true;
275 const std::string countleafName = countleaf->GetName();
276 fLeafCountCollections[countleafName].fCollectionModel->AddField(std::move(field));
277 fLeafCountCollections[countleafName].fImportFieldIndexes.emplace_back(fImportFields.size());
278 fLeafCountCollections[countleafName].fTransformations.emplace_back(
279 std::make_unique<RLeafArrayTransformation>(fImportBranches.size(), fImportFields.size()));
280 fImportFields.emplace_back(std::move(f));
281 } else {
282 fModel->AddField(std::move(field));
283 fImportFields.emplace_back(std::move(f));
284 }
285 }
286 if (!recordItems.empty()) {
287 auto recordField = std::make_unique<RRecordField>(b->GetName(), std::move(recordItems));
289 f.fField = recordField.get();
290 fImportFields.emplace_back(std::move(f));
291 fModel->AddField(std::move(recordField));
292 }
293
294 RImportBranch ib;
295 ib.fBranchName = b->GetName();
296 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
297 if (isClass) {
298 auto klass = TClass::GetClass(b->GetClassName());
299 if (!klass) {
300 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
301 std::string(b->GetName()));
302 }
303 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
304 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
305 } else {
306 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
307 }
308
309 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
310 if (!fImportFields.back().fFieldBuffer) {
311 fImportFields.back().fFieldBuffer =
312 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
313 }
314
315 fImportBranches.emplace_back(std::move(ib));
316 }
317
318 int iLeafCountCollection = 0;
319 for (auto &p : fLeafCountCollections) {
320 // We want to capture this variable, which is not possible with a
321 // structured binding in C++17. Explicitly defining a variable works.
322 auto &countLeafName = p.first;
323 auto &c = p.second;
324 c.fCollectionModel->Freeze();
325 c.fCollectionEntry = c.fCollectionModel->CreateBareEntry();
326 for (auto idx : c.fImportFieldIndexes) {
327 const auto name = fImportFields[idx].fField->GetName();
328 const auto buffer = fImportFields[idx].fFieldBuffer;
329 c.fCollectionEntry->CaptureValueUnsafe(name, buffer);
330 }
331 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
332 c.fCollectionWriter = fModel->MakeCollection(c.fFieldName, std::move(c.fCollectionModel));
333 // Add projected fields for all leaf count arrays
334 for (auto idx : c.fImportFieldIndexes) {
335 const auto name = fImportFields[idx].fField->GetName();
336 auto projectedField =
337 Detail::RFieldBase::Create(name, "ROOT::RVec<" + fImportFields[idx].fField->GetType() + ">").Unwrap();
338 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
339 if (fieldName == name)
340 return c.fFieldName;
341 else
342 return c.fFieldName + "." + name;
343 });
344 }
345 // Add projected fields for count leaf
346 auto projectedField =
347 Detail::RFieldBase::Create(countLeafName, "ROOT::Experimental::RNTupleCardinality<std::uint32_t>").Unwrap();
348 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
349 iLeafCountCollection++;
350 }
351
352 fModel->Freeze();
353 fEntry = fModel->CreateBareEntry();
354 for (const auto &f : fImportFields) {
355 if (f.fIsInUntypedCollection)
356 continue;
357 fEntry->CaptureValueUnsafe(f.fField->GetName(), f.fFieldBuffer);
358 }
359 for (const auto &[_, c] : fLeafCountCollections) {
360 fEntry->CaptureValueUnsafe(c.fFieldName, c.fCollectionWriter->GetOffsetPtr());
361 }
362
363 if (!fIsQuiet)
364 ReportSchema();
365
366 return RResult<void>::Success();
367}
368
370{
371 if (fDestFile->FindKey(fNTupleName.c_str()) != nullptr)
372 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
373
375
376 auto sink = std::make_unique<Detail::RPageSinkFile>(fNTupleName, *fDestFile, fWriteOptions);
377 sink->GetMetrics().Enable();
378 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
379
380 auto ntplWriter = std::make_unique<RNTupleWriter>(std::move(fModel), std::move(sink));
381 // The guard needs to be destructed before the writer goes out of scope
382 RImportGuard importGuard(*this);
383
384 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
385
386 auto nEntries = fSourceTree->GetEntries();
387
388 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
389 nEntries = fMaxEntries;
390 }
391
392 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
394
395 for (const auto &[_, c] : fLeafCountCollections) {
396 for (Int_t l = 0; l < *c.fCountVal; ++l) {
397 for (auto &t : c.fTransformations) {
398 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
399 if (!result)
401 }
402 c.fCollectionWriter->Fill(c.fCollectionEntry.get());
403 }
404 for (auto &t : c.fTransformations)
405 t->ResetEntry();
406 }
407
408 for (auto &t : fImportTransformations) {
409 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
410 if (!result)
412 t->ResetEntry();
413 }
414
415 ntplWriter->Fill(*fEntry);
416
418 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
419 }
421 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
422}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:307
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:303
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
@ kOther_t
Definition TDataType.h:32
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:110
#define _(A, B)
Definition cfortran.h:108
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:351
virtual size_t GetValueSize() const =0
The number of bytes taken by a value of the appropriate type.
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
Used to report every ~50MB (compressed), and at the end about the status of the import.
virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::vector< RImportBranch > fImportBranches
static constexpr int kDefaultCompressionSettings
By default, compress RNTuple with zstd, level 5.
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void Import()
Import works in two steps:
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
static std::unique_ptr< RNTupleModel > CreateBare()
A bare model has no default entry.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:207
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2968
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4075
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
TClass * IsA() const override
Definition TLine.h:79
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
virtual const char * GetName() const
Returns name of object.
Definition TObject.cxx:439
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual Int_t GetEntry(Long64_t entry, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition TTree.cxx:5635
virtual Int_t SetBranchAddress(const char *bname, void *add, TBranch **ptr=nullptr)
Change branch address, dealing with clone trees properly.
Definition TTree.cxx:8377
virtual void SetImplicitMT(Bool_t enabled)
Definition TTree.h:621
virtual Long64_t GetEntries() const
Definition TTree.h:463
virtual TObjArray * GetListOfBranches()
Definition TTree.h:488
virtual TTree * GetTree() const
Definition TTree.h:517
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition TTree.cxx:6470
TClass * IsA() const override
Definition TTree.h:659
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
bool fIsClass
Field imported from a branch with stramer info (e.g., STL, user-defined class)
Detail::RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
std::unique_ptr< RNTupleModel > fCollectionModel
The model for the collection itself.
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
TLine l
Definition textangle.C:4