Logo ROOT  
Reference Guide
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \author Jakob Blomer <jblomer@cern.ch>
3/// \date 2022-11-22
4/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
5/// is welcome!
6
7/*************************************************************************
8 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#include <ROOT/RError.hxx>
16#include <ROOT/RField.hxx>
18#include <ROOT/RNTupleTypes.hxx>
21#include <ROOT/RPageSinkBuf.hxx>
22#include <ROOT/RPageStorage.hxx>
24#include <string_view>
25
26#include <TBranch.h>
27#include <TChain.h>
28#include <TClass.h>
29#include <TDataType.h>
30#include <TLeaf.h>
31#include <TLeafC.h>
32#include <TLeafElement.h>
33#include <TLeafObject.h>
34
35#include <cassert>
36#include <cstdint>
37#include <cstring>
38#include <iostream>
39#include <utility>
40
41namespace {
42
43class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
44private:
45 static constexpr std::uint64_t gUpdateFrequencyBytes = 100 * 1000 * 1000; // report every 100 MB
46 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
47
48public:
49 ~RDefaultProgressCallback() override {}
50 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
51 {
52 // Report if more than 100 MB (compressed) where written since the last status update
53 if (nbytesWritten < fNbytesNext)
54 return;
55 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
56 fNbytesNext += gUpdateFrequencyBytes;
57 if (nbytesWritten > fNbytesNext) {
58 // If we already passed the next threshold, increase by a sensible amount.
59 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
60 }
61 }
62
63 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
64 {
65 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
66 }
67};
68
69} // anonymous namespace
70
73{
74 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
76}
77
78std::unique_ptr<ROOT::Experimental::RNTupleImporter>
79ROOT::Experimental::RNTupleImporter::Create(std::string_view sourceFileName, std::string_view treeName,
80 std::string_view destFileName)
81{
82 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
83 importer->fNTupleName = treeName;
84 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
85 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
86 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
87 }
88
89 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
90 if (!importer->fSourceTree) {
91 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
92 }
93
94 // If we have IMT enabled, its best use is for parallel page compression
95 importer->fSourceTree->SetImplicitMT(false);
96 auto result = importer->InitDestination(destFileName);
97
98 if (!result)
99 throw RException(R__FORWARD_ERROR(result));
100
101 return importer;
102}
103
104std::unique_ptr<ROOT::Experimental::RNTupleImporter>
105ROOT::Experimental::RNTupleImporter::Create(TTree *sourceTree, std::string_view destFileName)
106{
107 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
108
109 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
110 if (sourceTree->LoadTree(0) != 0)
111 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
112 importer->fNTupleName = sourceTree->GetTree()->GetName();
113 } else {
114 importer->fNTupleName = sourceTree->GetName();
115 }
116
117 importer->fSourceTree = sourceTree;
118
119 // If we have IMT enabled, its best use is for parallel page compression
120 importer->fSourceTree->SetImplicitMT(false);
121 auto result = importer->InitDestination(destFileName);
122
123 if (!result)
124 throw RException(R__FORWARD_ERROR(result));
125
126 return importer;
127}
128
130{
131 fDestFileName = destFileName;
132 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
133 if (!fDestFile || fDestFile->IsZombie()) {
134 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
135 }
136
137 return RResult<void>::Success();
138}
139
141{
142 for (const auto &f : fImportFields) {
143 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << "]\n";
144 }
145 for (const auto &f : ROOT::Internal::GetProjectedFieldsOfModel(*fModel).GetFieldZero().GetConstSubfields()) {
146 std::cout << "Importing (projected) '" << f->GetFieldName() << "' [" << f->GetTypeName() << "]\n";
147 }
148}
149
159
161{
162 ResetSchema();
163
164 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
165 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
166 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
167 for (auto b : TRangeDynCast<TBranch>(*fSourceTree->GetListOfBranches())) {
168 assert(b);
169 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
170 assert(firstLeaf);
171
172 const bool isLeafList = b->GetNleaves() > 1;
173 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
174 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
175 if (isLeafList && isClass)
176 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
177 if (isLeafList && isCountLeaf)
178 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
179
180 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
181 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
182 // char arrays and C strings.
183 Int_t firstLeafCountval;
184 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
185 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
186
187 if (isCountLeaf) {
188 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
189 // leaf count comes before any array leaves that use it.
190 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
191 // collection, together the collection mode.
193 c.fMaxLength = firstLeaf->GetMaximum();
194 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
195 // Casting to void * makes it work for both Int_t and UInt_t
196 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
197 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
198 continue;
199 }
200
201 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
202 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
203 std::vector<std::unique_ptr<ROOT::RFieldBase>> recordItems;
204 // For leaf count arrays, we expect to find a single leaf; we don't add a field right away but only
205 // later through a projection
206 bool isLeafCountArray = false;
207 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
208 if (l->IsA() == TLeafObject::Class()) {
209 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
210 }
211
212 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
213 // There are files in the public where the title is broken (empty).
214 Int_t countval = l->GetLenStatic();
215 auto *countleaf = l->GetLeafCount();
216 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
217 isLeafCountArray = (countleaf != nullptr);
218
219 // The base case for branches with fundamental, single numerical types.
220 // For other types of branches, different field names or types are necessary,
221 // which is determined below.
222 std::string fieldName = b->GetName();
223 std::string fieldType = l->GetTypeName();
224
225 if (isLeafList)
226 fieldName = l->GetName();
227
228 if (isCString)
229 fieldType = "std::string";
230
231 if (isClass)
232 fieldType = b->GetClassName();
233
234 if (isFixedSizeArray)
235 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
236
238 // Replace any occurrence of a dot ('.') with an underscore.
239 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
240 }
241
243 auto fieldOrError = ROOT::RFieldBase::Create(fieldName, fieldType);
244 if (!fieldOrError)
245 return R__FORWARD_ERROR(fieldOrError);
246 auto field = fieldOrError.Unwrap();
247 if (isCString) {
248 branchBufferSize = l->GetMaximum();
249 f.fValue = std::make_unique<ROOT::RFieldBase::RValue>(field->CreateValue());
250 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
251 fImportTransformations.emplace_back(
252 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
253 } else {
254 if (isClass) {
255 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
256 // calling SetBranchAddress()
257 branchBufferSize = sizeof(void *) * countval;
258 } else if (isLeafCountArray) {
259 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
260 } else {
261 branchBufferSize = l->GetOffset() + field->GetValueSize();
262 }
263 }
264 f.fField = field.get();
265
266 if (isLeafList) {
267 recordItems.emplace_back(std::move(field));
268 } else if (isLeafCountArray) {
269 const std::string countleafName = countleaf->GetName();
270 fLeafCountCollections[countleafName].fLeafFields.emplace_back(std::move(field));
271 fLeafCountCollections[countleafName].fLeafBranchIndexes.emplace_back(fImportBranches.size());
272 R__ASSERT(b->GetListOfLeaves()->GetEntries() == 1);
273 break;
274 } else {
275 fModel->AddField(std::move(field));
276 fImportFields.emplace_back(std::move(f));
277 }
278 }
279 if (!recordItems.empty()) {
280 auto recordField = std::make_unique<ROOT::RRecordField>(b->GetName(), std::move(recordItems));
282 f.fField = recordField.get();
283 fImportFields.emplace_back(std::move(f));
284 fModel->AddField(std::move(recordField));
285 }
286
287 RImportBranch ib;
288 ib.fBranchName = b->GetName();
289 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
290 if (isClass) {
291 auto klass = TClass::GetClass(b->GetClassName());
292 if (!klass) {
293 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
294 std::string(b->GetName()));
295 }
296 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
297 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
298 } else {
299 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
300 }
301
302 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
303 if (!isLeafCountArray && !fImportFields.back().fFieldBuffer) {
304 fImportFields.back().fFieldBuffer =
305 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
306 }
307
308 fImportBranches.emplace_back(std::move(ib));
309 }
310
311 int iLeafCountCollection = 0;
312 for (auto &p : fLeafCountCollections) {
313 // We want to capture this variable, which is not possible with a
314 // structured binding in C++17. Explicitly defining a variable works.
315 auto countLeafName = p.first;
316 auto &c = p.second;
317
318 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
319 auto recordField = std::make_unique<ROOT::RRecordField>("_0", std::move(c.fLeafFields));
320 auto collectionField = ROOT::RVectorField::CreateUntyped(c.fFieldName, std::move(recordField));
321 c.fRecordField = static_cast<RRecordField *>(collectionField->GetMutableSubfields()[0]);
322 fModel->AddField(std::move(collectionField));
323
324 // Add projected fields for all leaf count arrays
325 for (const auto leaf : c.fRecordField->GetConstSubfields()) {
326 const auto name = leaf->GetFieldName();
327 auto projectedField =
328 ROOT::RFieldBase::Create(name, "ROOT::VecOps::RVec<" + leaf->GetTypeName() + ">").Unwrap();
329 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
330 if (fieldName == name)
331 return c.fFieldName;
332 else
333 return c.fFieldName + "._0." + name;
334 });
335 }
336
338 // Replace any occurrenceof a dot ('.') in the count leaf name with an underscore.
339 std::replace(countLeafName.begin(), countLeafName.end(), '.', '_');
340 }
341
342 // Add projected fields for count leaf
343 auto projectedField = ROOT::RFieldBase::Create(countLeafName, "ROOT::RNTupleCardinality<std::uint32_t>").Unwrap();
344 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
345
346 iLeafCountCollection++;
347 }
348
349 if (fFieldModifier) {
350 for (auto &field : fModel->GetMutableFieldZero()) {
351 fFieldModifier(field);
352 }
353 }
354
355 fModel->Freeze();
356
357 fEntry = fModel->CreateBareEntry();
358 for (const auto &f : fImportFields) {
359 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
360 }
361 for (auto &[_, c] : fLeafCountCollections) {
362 fEntry->BindRawPtr<void>(c.fFieldName, &c.fFieldBuffer);
363 }
364
365 if (!fIsQuiet)
366 ReportSchema();
367
368 return RResult<void>::Success();
369}
370
372{
373 TDirectory *targetDir = fDestFile.get();
374 // Extract the ntuple name and directory name
375 auto lastSlash = fNTupleName.find_last_of('/');
376 std::string ntupleName = fNTupleName;
377 if (lastSlash != std::string::npos) {
378 // Create the directory in the output file if it does not exist
379 auto dirName = fNTupleName.substr(0, lastSlash);
380 ntupleName = fNTupleName.substr(lastSlash + 1);
381 targetDir = fDestFile->mkdir(dirName.c_str(), "", true);
382 if (!targetDir) {
383 throw RException(R__FAIL("Failed to create directory " + dirName + " in file " + fDestFileName));
384 }
385 }
386
387 if (targetDir->FindKey(ntupleName.c_str()) != nullptr) {
388 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
389 }
390
392
393 std::unique_ptr<ROOT::Internal::RPageSink> sink =
394 std::make_unique<ROOT::Internal::RPageSinkFile>(ntupleName, *targetDir, fWriteOptions);
395 sink->GetMetrics().Enable();
396 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
397
398 if (fWriteOptions.GetUseBufferedWrite()) {
399 sink = std::make_unique<ROOT::Internal::RPageSinkBuf>(std::move(sink));
400 }
401
402 auto ntplWriter = ROOT::Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
403 // The guard needs to be destructed before the writer goes out of scope
404 RImportGuard importGuard(*this);
405
406 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
407
408 auto nEntries = fSourceTree->GetEntries();
409
410 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
411 nEntries = fMaxEntries;
412 }
413
414 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
415 fSourceTree->GetEntry(i);
416
417 for (auto &[_, c] : fLeafCountCollections) {
418 const auto sizeOfRecord = c.fRecordField->GetValueSize();
419 c.fFieldBuffer.resize(sizeOfRecord * (*c.fCountVal));
420
421 const auto nLeafs = c.fRecordField->GetConstSubfields().size();
422 for (std::size_t l = 0; l < nLeafs; ++l) {
423 const auto offset = c.fRecordField->GetOffsets()[l];
424 const auto sizeOfLeaf = c.fRecordField->GetConstSubfields()[l]->GetValueSize();
425 const auto idxImportBranch = c.fLeafBranchIndexes[l];
426 for (Int_t j = 0; j < *c.fCountVal; ++j) {
427 memcpy(c.fFieldBuffer.data() + j * sizeOfRecord + offset,
428 fImportBranches[idxImportBranch].fBranchBuffer.get() + (j * sizeOfLeaf), sizeOfLeaf);
429 }
430 }
431 }
432
433 for (auto &t : fImportTransformations) {
434 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
435 if (!result)
436 throw RException(R__FORWARD_ERROR(result));
437 }
438
439 ntplWriter->Fill(*fEntry);
440
442 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
443 }
445 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
446}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking).
Definition RError.hxx:303
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
int Int_t
Signed integer 4 bytes (int).
Definition RtypesCore.h:59
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
char name[80]
Definition TGX11.cxx:148
#define _(A, B)
Definition cfortran.h:108
Used to report every ~100 MB (compressed), and at the end about the status of the import.
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::unique_ptr< ROOT::REntry > fEntry
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::unique_ptr< ROOT::RNTupleModel > fModel
std::vector< RImportBranch > fImportBranches
ROOT::RResult< void > InitDestination(std::string_view destFileName)
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
void Import()
Import works in two steps:
ROOT::RNTupleWriteOptions fWriteOptions
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
ROOT::RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
The field for an untyped record.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:197
static std::unique_ptr< RVectorField > CreateUntyped(std::string_view fieldName, std::unique_ptr< RFieldBase > itemField)
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2994
Describe directory structure in memory.
Definition TDirectory.h:45
virtual TKey * FindKey(const char *) const
Definition TDirectory.h:198
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3787
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
const char * GetName() const override
Returns name of object.
Definition TNamed.h:49
A TTree represents a columnar dataset.
Definition TTree.h:89
virtual void SetImplicitMT(bool enabled)
Definition TTree.h:713
virtual TTree * GetTree() const
Definition TTree.h:604
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition TTree.cxx:6584
TClass * IsA() const override
Definition TTree.h:757
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< ROOT::RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RProjectedFields & GetProjectedFieldsOfModel(RNTupleModel &model)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName.
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
TLine l
Definition textangle.C:4