Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \author Jakob Blomer <jblomer@cern.ch>
3/// \date 2022-11-22
4/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
5/// is welcome!
6
7/*************************************************************************
8 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
9 * All rights reserved. *
10 * *
11 * For the licensing terms see $ROOTSYS/LICENSE. *
12 * For the list of contributors see $ROOTSYS/README/CREDITS. *
13 *************************************************************************/
14
15#include <ROOT/RError.hxx>
16#include <ROOT/RField.hxx>
18#include <ROOT/RNTupleTypes.hxx>
21#include <ROOT/RPageSinkBuf.hxx>
22#include <ROOT/RPageStorage.hxx>
24#include <string_view>
25
26#include <TBranch.h>
27#include <TChain.h>
28#include <TClass.h>
29#include <TDataType.h>
30#include <TLeaf.h>
31#include <TLeafC.h>
32#include <TLeafElement.h>
33#include <TLeafObject.h>
34
35#include <cassert>
36#include <cstdint>
37#include <cstring>
38#include <iostream>
39#include <utility>
40
41namespace {
42
43class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
44private:
45 static constexpr std::uint64_t gUpdateFrequencyBytes = 100 * 1000 * 1000; // report every 100 MB
46 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
47
48public:
49 ~RDefaultProgressCallback() override {}
50 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
51 {
52 // Report if more than 100 MB (compressed) where written since the last status update
53 if (nbytesWritten < fNbytesNext)
54 return;
55 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
56 fNbytesNext += gUpdateFrequencyBytes;
57 if (nbytesWritten > fNbytesNext) {
58 // If we already passed the next threshold, increase by a sensible amount.
59 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
60 }
61 }
62
63 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
64 {
65 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
66 }
67};
68
69} // anonymous namespace
70
73{
74 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
76}
77
78std::unique_ptr<ROOT::Experimental::RNTupleImporter>
80 std::string_view destFileName)
81{
82 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
83 importer->fNTupleName = treeName;
84 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
85 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
86 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
87 }
88
89 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
90 if (!importer->fSourceTree) {
91 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
92 }
93
94 // If we have IMT enabled, its best use is for parallel page compression
95 importer->fSourceTree->SetImplicitMT(false);
96 auto result = importer->InitDestination(destFileName);
97
98 if (!result)
100
101 return importer;
102}
103
104std::unique_ptr<ROOT::Experimental::RNTupleImporter>
106{
107 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
108
109 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
110 if (sourceTree->LoadTree(0) != 0)
111 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
112 importer->fNTupleName = sourceTree->GetTree()->GetName();
113 } else {
114 importer->fNTupleName = sourceTree->GetName();
115 }
116
117 importer->fSourceTree = sourceTree;
118
119 // If we have IMT enabled, its best use is for parallel page compression
120 importer->fSourceTree->SetImplicitMT(false);
121 auto result = importer->InitDestination(destFileName);
122
123 if (!result)
125
126 return importer;
127}
128
130{
132 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
133 if (!fDestFile || fDestFile->IsZombie()) {
134 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
135 }
136
137 return RResult<void>::Success();
138}
139
141{
142 for (const auto &f : fImportFields) {
143 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << "]\n";
144 }
146 std::cout << "Importing (projected) '" << f->GetFieldName() << "' [" << f->GetTypeName() << "]\n";
147 }
148}
149
159
161{
162 ResetSchema();
163
164 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
165 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
166 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
168 assert(b);
169 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
171
172 const bool isLeafList = b->GetNleaves() > 1;
173 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
174 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
175 if (isLeafList && isClass)
176 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
177 if (isLeafList && isCountLeaf)
178 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
179
180 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
181 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
182 // char arrays and C strings.
184 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
185 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
186
187 if (isCountLeaf) {
188 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
189 // leaf count comes before any array leaves that use it.
190 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
191 // collection, together the collection mode.
193 c.fMaxLength = firstLeaf->GetMaximum();
194 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
195 // Casting to void * makes it work for both Int_t and UInt_t
196 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
197 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
198 continue;
199 }
200
201 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
202 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
203 std::vector<std::unique_ptr<ROOT::RFieldBase>> recordItems;
204 // For leaf count arrays, we expect to find a single leaf; we don't add a field right away but only
205 // later through a projection
206 bool isLeafCountArray = false;
207 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
208 if (l->IsA() == TLeafObject::Class()) {
209 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
210 }
211
212 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
213 // There are files in the public where the title is broken (empty).
214 Int_t countval = l->GetLenStatic();
215 auto *countleaf = l->GetLeafCount();
216 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
217 isLeafCountArray = (countleaf != nullptr);
218
219 // The base case for branches with fundamental, single numerical types.
220 // For other types of branches, different field names or types are necessary,
221 // which is determined below.
222 std::string fieldName = b->GetName();
223 std::string fieldType = l->GetTypeName();
224
225 if (isLeafList)
226 fieldName = l->GetName();
227
228 if (isCString)
229 fieldType = "std::string";
230
231 if (isClass)
232 fieldType = b->GetClassName();
233
235 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
236
238 // Replace any occurrence of a dot ('.') with an underscore.
239 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
240 }
241
244 if (!fieldOrError)
246 auto field = fieldOrError.Unwrap();
247 if (isCString) {
248 branchBufferSize = l->GetMaximum();
249 f.fValue = std::make_unique<ROOT::RFieldBase::RValue>(field->CreateValue());
250 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
251 fImportTransformations.emplace_back(
252 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
253 } else {
254 if (isClass) {
255 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
256 // calling SetBranchAddress()
257 branchBufferSize = sizeof(void *) * countval;
258 } else if (isLeafCountArray) {
259 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
260 } else {
261 branchBufferSize = l->GetOffset() + field->GetValueSize();
262 }
263 }
264 f.fField = field.get();
265
266 if (isLeafList) {
267 recordItems.emplace_back(std::move(field));
268 } else if (isLeafCountArray) {
269 const std::string countleafName = countleaf->GetName();
270 fLeafCountCollections[countleafName].fLeafFields.emplace_back(std::move(field));
271 fLeafCountCollections[countleafName].fLeafBranchIndexes.emplace_back(fImportBranches.size());
272 R__ASSERT(b->GetListOfLeaves()->GetEntries() == 1);
273 break;
274 } else {
275 fModel->AddField(std::move(field));
276 fImportFields.emplace_back(std::move(f));
277 }
278 }
279 if (!recordItems.empty()) {
280 auto recordField = std::make_unique<ROOT::RRecordField>(b->GetName(), std::move(recordItems));
282 f.fField = recordField.get();
283 fImportFields.emplace_back(std::move(f));
284 fModel->AddField(std::move(recordField));
285 }
286
288 ib.fBranchName = b->GetName();
289 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
290 if (isClass) {
291 auto klass = TClass::GetClass(b->GetClassName());
292 if (!klass) {
293 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
294 std::string(b->GetName()));
295 }
296 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
297 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
298 } else {
299 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
300 }
301
302 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
303 if (!isLeafCountArray && !fImportFields.back().fFieldBuffer) {
304 fImportFields.back().fFieldBuffer =
305 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
306 }
307
308 fImportBranches.emplace_back(std::move(ib));
309 }
310
311 int iLeafCountCollection = 0;
312 for (auto &p : fLeafCountCollections) {
313 // We want to capture this variable, which is not possible with a
314 // structured binding in C++17. Explicitly defining a variable works.
315 auto countLeafName = p.first;
316 auto &c = p.second;
317
318 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
319 auto recordField = std::make_unique<ROOT::RRecordField>("_0", std::move(c.fLeafFields));
320 auto collectionField = ROOT::RVectorField::CreateUntyped(c.fFieldName, std::move(recordField));
321 c.fRecordField = static_cast<RRecordField *>(collectionField->GetMutableSubfields()[0]);
322 fModel->AddField(std::move(collectionField));
323
324 // Add projected fields for all leaf count arrays
325 for (const auto leaf : c.fRecordField->GetConstSubfields()) {
326 const auto name = leaf->GetFieldName();
327 auto projectedField =
328 ROOT::RFieldBase::Create(name, "ROOT::VecOps::RVec<" + leaf->GetTypeName() + ">").Unwrap();
329 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
330 if (fieldName == name)
331 return c.fFieldName;
332 else
333 return c.fFieldName + "._0." + name;
334 });
335 }
336
338 // Replace any occurrenceof a dot ('.') in the count leaf name with an underscore.
339 std::replace(countLeafName.begin(), countLeafName.end(), '.', '_');
340 }
341
342 // Add projected fields for count leaf
343 auto projectedField = ROOT::RFieldBase::Create(countLeafName, "ROOT::RNTupleCardinality<std::uint32_t>").Unwrap();
344 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
345
347 }
348
349 if (fFieldModifier) {
350 for (auto &field : fModel->GetMutableFieldZero()) {
352 }
353 }
354
355 fModel->Freeze();
356
357 fEntry = fModel->CreateBareEntry();
358 for (const auto &f : fImportFields) {
359 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
360 }
361 for (auto &[_, c] : fLeafCountCollections) {
362 fEntry->BindRawPtr<void>(c.fFieldName, &c.fFieldBuffer);
363 }
364
365 if (!fIsQuiet)
366 ReportSchema();
367
368 return RResult<void>::Success();
369}
370
372{
374 // Extract the ntuple name and directory name
375 auto lastSlash = fNTupleName.find_last_of('/');
376 std::string ntupleName = fNTupleName;
377 if (lastSlash != std::string::npos) {
378 // Create the directory in the output file if it does not exist
379 auto dirName = fNTupleName.substr(0, lastSlash);
380 ntupleName = fNTupleName.substr(lastSlash + 1);
381 targetDir = fDestFile->mkdir(dirName.c_str(), "", true);
382 if (!targetDir) {
383 throw RException(R__FAIL("Failed to create directory " + dirName + " in file " + fDestFileName));
384 }
385 }
386
387 if (targetDir->FindKey(ntupleName.c_str()) != nullptr) {
388 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
389 }
390
392
393 std::unique_ptr<ROOT::Internal::RPageSink> sink =
394 std::make_unique<ROOT::Internal::RPageSinkFile>(ntupleName, *targetDir, fWriteOptions);
395 sink->GetMetrics().Enable();
396 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
397
399 sink = std::make_unique<ROOT::Internal::RPageSinkBuf>(std::move(sink));
400 }
401
402 auto ntplWriter = ROOT::Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
403 // The guard needs to be destructed before the writer goes out of scope
405
406 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
407
409
410 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
412 }
413
414 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
416
417 for (auto &[_, c] : fLeafCountCollections) {
418 const auto sizeOfRecord = c.fRecordField->GetValueSize();
419 c.fFieldBuffer.resize(sizeOfRecord * (*c.fCountVal));
420
421 const auto nLeafs = c.fRecordField->GetConstSubfields().size();
422 for (std::size_t l = 0; l < nLeafs; ++l) {
423 const auto offset = c.fRecordField->GetOffsets()[l];
424 const auto sizeOfLeaf = c.fRecordField->GetConstSubfields()[l]->GetValueSize();
425 const auto idxImportBranch = c.fLeafBranchIndexes[l];
426 for (Int_t j = 0; j < *c.fCountVal; ++j) {
427 memcpy(c.fFieldBuffer.data() + j * sizeOfRecord + offset,
428 fImportBranches[idxImportBranch].fBranchBuffer.get() + (j * sizeOfLeaf), sizeOfLeaf);
429 }
430 }
431 }
432
433 for (auto &t : fImportTransformations) {
434 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
435 if (!result)
437 }
438
439 ntplWriter->Fill(*fEntry);
440
442 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
443 }
445 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
446}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:304
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:300
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:148
#define _(A, B)
Definition cfortran.h:108
Used to report every ~100 MB (compressed), and at the end about the status of the import.
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::unique_ptr< ROOT::REntry > fEntry
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::unique_ptr< ROOT::RNTupleModel > fModel
std::vector< RImportBranch > fImportBranches
ROOT::RResult< void > InitDestination(std::string_view destFileName)
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
void Import()
Import works in two steps:
ROOT::RNTupleWriteOptions fWriteOptions
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
ROOT::RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::RFieldZero & GetFieldZero()
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
std::vector< const RFieldBase * > GetConstSubfields() const
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
const_iterator begin() const
const_iterator end() const
The field for an untyped record.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:198
static std::unique_ptr< RVectorField > CreateUntyped(std::string_view fieldName, std::unique_ptr< RFieldBase > itemField)
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:2994
Describe directory structure in memory.
Definition TDirectory.h:45
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3787
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
TClass * IsA() const override
Definition TLine.h:79
virtual const char * GetName() const
Returns name of object.
Definition TObject.cxx:462
A TTree represents a columnar dataset.
Definition TTree.h:89
virtual Int_t SetBranchAddress(const char *bname, void *add, TBranch **ptr, TClass *realClass, EDataType datatype, bool isptr, bool suppressMissingBranchError)
Definition TTree.cxx:8675
virtual Int_t GetEntry(Long64_t entry, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition TTree.cxx:5718
virtual Long64_t GetEntries() const
Definition TTree.h:510
virtual TObjArray * GetListOfBranches()
Definition TTree.h:575
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< ROOT::RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RProjectedFields & GetProjectedFieldsOfModel(RNTupleModel &model)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
ROOT::RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
Int_t fMaxLength
Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
TLine l
Definition textangle.C:4