Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
17#include <ROOT/RField.hxx>
19#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPageSinkBuf.hxx>
23#include <ROOT/RPageStorage.hxx>
25#include <string_view>
26
27#include <TBranch.h>
28#include <TChain.h>
29#include <TClass.h>
30#include <TDataType.h>
31#include <TLeaf.h>
32#include <TLeafC.h>
33#include <TLeafElement.h>
34#include <TLeafObject.h>
35
36#include <cassert>
37#include <cstdint>
38#include <cstring>
39#include <iostream>
40#include <utility>
41
42namespace {
43
44class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
45private:
46 static constexpr std::uint64_t gUpdateFrequencyBytes = 50 * 1000 * 1000; // report every 50MB
47 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
48
49public:
50 ~RDefaultProgressCallback() override {}
51 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
52 {
53 // Report if more than 50MB (compressed) where written since the last status update
54 if (nbytesWritten < fNbytesNext)
55 return;
56 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
57 fNbytesNext += gUpdateFrequencyBytes;
58 if (nbytesWritten > fNbytesNext) {
59 // If we already passed the next threshold, increase by a sensible amount.
60 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
61 }
62 }
63
64 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
65 {
66 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries" << std::endl;
67 }
68};
69
70} // anonymous namespace
71
74{
75 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
77}
78
81 RImportField &field)
82{
83 auto valueSize = field.fField->GetValueSize();
84 memcpy(field.fFieldBuffer, branch.fBranchBuffer.get() + (fNum * valueSize), valueSize);
85 fNum++;
87}
88
89std::unique_ptr<ROOT::Experimental::RNTupleImporter>
90ROOT::Experimental::RNTupleImporter::Create(std::string_view sourceFileName, std::string_view treeName,
91 std::string_view destFileName)
92{
93 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
94 importer->fNTupleName = treeName;
95 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
96 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
97 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
98 }
99
100 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
101 if (!importer->fSourceTree) {
102 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
103 }
104
105 // If we have IMT enabled, its best use is for parallel page compression
106 importer->fSourceTree->SetImplicitMT(false);
107 auto result = importer->InitDestination(destFileName);
108
109 if (!result)
111
112 return importer;
113}
114
115std::unique_ptr<ROOT::Experimental::RNTupleImporter>
116ROOT::Experimental::RNTupleImporter::Create(TTree *sourceTree, std::string_view destFileName)
117{
118 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
119
120 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
121 if (sourceTree->LoadTree(0) != 0)
122 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
123 importer->fNTupleName = sourceTree->GetTree()->GetName();
124 } else {
125 importer->fNTupleName = sourceTree->GetName();
126 }
127
128 importer->fSourceTree = sourceTree;
129
130 // If we have IMT enabled, its best use is for parallel page compression
131 importer->fSourceTree->SetImplicitMT(false);
132 auto result = importer->InitDestination(destFileName);
133
134 if (!result)
136
137 return importer;
138}
139
141{
142 fDestFileName = destFileName;
143 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
144 if (!fDestFile || fDestFile->IsZombie()) {
145 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
146 }
147
148 return RResult<void>::Success();
149}
150
152{
153 for (const auto &f : fImportFields) {
154 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << ']' << std::endl;
155 }
156}
157
159{
160 fImportBranches.clear();
161 fImportFields.clear();
162 fLeafCountCollections.clear();
165 fEntry = nullptr;
166}
167
169{
170 ResetSchema();
171
172 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
173 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
174 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
176 assert(b);
177 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
178 assert(firstLeaf);
179
180 const bool isLeafList = b->GetNleaves() > 1;
181 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
182 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
183 if (isLeafList && isClass)
184 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
185 if (isLeafList && isCountLeaf)
186 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
187
188 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
189 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
190 // char arrays and C strings.
191 Int_t firstLeafCountval;
192 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
193 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
194
195 if (isCountLeaf) {
196 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
197 // leaf count comes before any array leaves that use it.
198 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
199 // collection, together the collection mode.
202 c.fMaxLength = firstLeaf->GetMaximum();
203 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
204 // Casting to void * makes it work for both Int_t and UInt_t
205 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
206 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
207 continue;
208 }
209
210 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
211 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
212 std::vector<std::unique_ptr<RFieldBase>> recordItems;
213 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
214 if (l->IsA() == TLeafObject::Class()) {
215 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
216 }
217
218 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
219 // There are files in the public where the title is broken (empty).
220 Int_t countval = l->GetLenStatic();
221 auto *countleaf = l->GetLeafCount();
222 const bool isLeafCountArray = (countleaf != nullptr);
223 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
224
225 // The base case for branches with fundamental, single numerical types.
226 // For other types of branches, different field names or types are necessary,
227 // which is determined below.
228 std::string fieldName = b->GetName();
229 std::string fieldType = l->GetTypeName();
230
231 if (isLeafList)
232 fieldName = l->GetName();
233
234 if (isCString)
235 fieldType = "std::string";
236
237 if (isClass)
238 fieldType = b->GetClassName();
239
240 if (isFixedSizeArray)
241 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
242
244 // Replace any occurrence of a dot ('.') with an underscore.
245 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
246 }
247
249 f.fIsClass = isClass;
250 auto fieldOrError = RFieldBase::Create(fieldName, fieldType);
251 if (!fieldOrError)
252 return R__FORWARD_ERROR(fieldOrError);
253 auto field = fieldOrError.Unwrap();
254 if (isCString) {
255 branchBufferSize = l->GetMaximum();
256 f.fValue = std::make_unique<RFieldBase::RValue>(field->CreateValue());
257 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
258 fImportTransformations.emplace_back(
259 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
260 } else {
261 if (isClass) {
262 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
263 // calling SetBranchAddress()
264 branchBufferSize = sizeof(void *) * countval;
265 } else if (isLeafCountArray) {
266 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
267 } else {
268 branchBufferSize = l->GetOffset() + field->GetValueSize();
269 }
270 }
271 f.fField = field.get();
272
273 if (isLeafList) {
274 recordItems.emplace_back(std::move(field));
275 } else if (isLeafCountArray) {
276 f.fValue = std::make_unique<RFieldBase::RValue>(field->CreateValue());
277 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
278 f.fIsInUntypedCollection = true;
279 const std::string countleafName = countleaf->GetName();
280 fLeafCountCollections[countleafName].fCollectionModel->AddField(std::move(field));
281 fLeafCountCollections[countleafName].fImportFieldIndexes.emplace_back(fImportFields.size());
282 fLeafCountCollections[countleafName].fTransformations.emplace_back(
283 std::make_unique<RLeafArrayTransformation>(fImportBranches.size(), fImportFields.size()));
284 fImportFields.emplace_back(std::move(f));
285 } else {
286 fModel->AddField(std::move(field));
287 fImportFields.emplace_back(std::move(f));
288 }
289 }
290 if (!recordItems.empty()) {
291 auto recordField = std::make_unique<RRecordField>(b->GetName(), std::move(recordItems));
293 f.fField = recordField.get();
294 fImportFields.emplace_back(std::move(f));
295 fModel->AddField(std::move(recordField));
296 }
297
298 RImportBranch ib;
299 ib.fBranchName = b->GetName();
300 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
301 if (isClass) {
302 auto klass = TClass::GetClass(b->GetClassName());
303 if (!klass) {
304 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
305 std::string(b->GetName()));
306 }
307 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
308 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
309 } else {
310 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
311 }
312
313 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
314 if (!fImportFields.back().fFieldBuffer) {
315 fImportFields.back().fFieldBuffer =
316 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
317 }
318
319 fImportBranches.emplace_back(std::move(ib));
320 }
321
322 int iLeafCountCollection = 0;
323 for (auto &p : fLeafCountCollections) {
324 // We want to capture this variable, which is not possible with a
325 // structured binding in C++17. Explicitly defining a variable works.
326 auto &countLeafName = p.first;
327 auto &c = p.second;
328 c.fCollectionModel->Freeze();
329 c.fCollectionEntry = c.fCollectionModel->CreateBareEntry();
330 for (auto idx : c.fImportFieldIndexes) {
331 const auto name = fImportFields[idx].fField->GetFieldName();
332 const auto buffer = fImportFields[idx].fFieldBuffer;
333 c.fCollectionEntry->BindRawPtr(name, buffer);
334 }
335 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
336 c.fCollectionWriter = fModel->MakeCollection(c.fFieldName, std::move(c.fCollectionModel));
337 // Add projected fields for all leaf count arrays
338 for (auto idx : c.fImportFieldIndexes) {
339 const auto name = fImportFields[idx].fField->GetFieldName();
340 auto projectedField =
341 RFieldBase::Create(name, "ROOT::VecOps::RVec<" + fImportFields[idx].fField->GetTypeName() + ">").Unwrap();
342 R__ASSERT(dynamic_cast<RRVecField *>(projectedField.get()));
343 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
344 if (fieldName == name)
345 return c.fFieldName;
346 else
347 return c.fFieldName + "." + name;
348 });
349 }
350 // Add projected fields for count leaf
351 auto projectedField =
352 RFieldBase::Create(countLeafName, "ROOT::Experimental::RNTupleCardinality<std::uint32_t>").Unwrap();
353 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
354 iLeafCountCollection++;
355 }
356
357 fModel->Freeze();
358 fEntry = fModel->CreateBareEntry();
359 for (const auto &f : fImportFields) {
360 if (f.fIsInUntypedCollection)
361 continue;
362 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
363 }
364 for (const auto &[_, c] : fLeafCountCollections) {
365 fEntry->BindRawPtr<void>(c.fFieldName, c.fCollectionWriter->GetOffsetPtr());
366 }
367
368 if (!fIsQuiet)
369 ReportSchema();
370
371 return RResult<void>::Success();
372}
373
375{
376 if (fDestFile->FindKey(fNTupleName.c_str()) != nullptr)
377 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
378
380
381 std::unique_ptr<Internal::RPageSink> sink =
382 std::make_unique<Internal::RPageSinkFile>(fNTupleName, *fDestFile, fWriteOptions);
383 sink->GetMetrics().Enable();
384 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
385
387 sink = std::make_unique<Internal::RPageSinkBuf>(std::move(sink));
388 }
389
390 auto ntplWriter = Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
391 // The guard needs to be destructed before the writer goes out of scope
392 RImportGuard importGuard(*this);
393
394 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
395
396 auto nEntries = fSourceTree->GetEntries();
397
398 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
399 nEntries = fMaxEntries;
400 }
401
402 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
404
405 for (const auto &[_, c] : fLeafCountCollections) {
406 for (Int_t l = 0; l < *c.fCountVal; ++l) {
407 for (auto &t : c.fTransformations) {
408 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
409 if (!result)
411 }
412 c.fCollectionWriter->Fill(*c.fCollectionEntry);
413 }
414 for (auto &t : c.fTransformations)
415 t->ResetEntry();
416 }
417
418 for (auto &t : fImportTransformations) {
419 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
420 if (!result)
422 t->ResetEntry();
423 }
424
425 ntplWriter->Fill(*fEntry);
426
428 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
429 }
431 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
432}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:294
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:110
#define _(A, B)
Definition cfortran.h:108
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias, bool fContinueOnError=false)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:626
virtual size_t GetValueSize() const =0
The number of bytes taken by a value of the appropriate type.
Used to report every ~50MB (compressed), and at the end about the status of the import.
virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::vector< RImportBranch > fImportBranches
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void Import()
Import works in two steps:
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
static std::unique_ptr< RNTupleModel > CreateBare()
A bare model has no default entry.
The type-erased field for a RVec<Type>
Definition RField.hxx:1176
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2968
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4089
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
TClass * IsA() const override
Definition TLine.h:79
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
virtual const char * GetName() const
Returns name of object.
Definition TObject.cxx:438
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual Int_t GetEntry(Long64_t entry, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition TTree.cxx:5638
virtual Int_t SetBranchAddress(const char *bname, void *add, TBranch **ptr=nullptr)
Change branch address, dealing with clone trees properly.
Definition TTree.cxx:8385
virtual Long64_t GetEntries() const
Definition TTree.h:463
virtual void SetImplicitMT(bool enabled)
Definition TTree.h:621
virtual TObjArray * GetListOfBranches()
Definition TTree.h:488
virtual TTree * GetTree() const
Definition TTree.h:517
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition TTree.cxx:6473
TClass * IsA() const override
Definition TTree.h:665
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
bool fIsClass
Field imported from a branch with stramer info (e.g., STL, user-defined class)
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
std::unique_ptr< RNTupleModel > fCollectionModel
The model for the collection itself.
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
TLine l
Definition textangle.C:4