Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleImporter.cxx
Go to the documentation of this file.
1/// \file RNTupleImporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Jakob Blomer <jblomer@cern.ch>
4/// \date 2022-11-22
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
17#include <ROOT/RField.hxx>
19#include <ROOT/RNTupleUtil.hxx>
22#include <ROOT/RPageSinkBuf.hxx>
23#include <ROOT/RPageStorage.hxx>
25#include <string_view>
26
27#include <TBranch.h>
28#include <TChain.h>
29#include <TClass.h>
30#include <TDataType.h>
31#include <TLeaf.h>
32#include <TLeafC.h>
33#include <TLeafElement.h>
34#include <TLeafObject.h>
35
36#include <cassert>
37#include <cstdint>
38#include <cstring>
39#include <iostream>
40#include <utility>
41
42namespace {
43
44class RDefaultProgressCallback : public ROOT::Experimental::RNTupleImporter::RProgressCallback {
45private:
46 static constexpr std::uint64_t gUpdateFrequencyBytes = 100 * 1000 * 1000; // report every 100 MB
47 std::uint64_t fNbytesNext = gUpdateFrequencyBytes;
48
49public:
50 ~RDefaultProgressCallback() override {}
51 void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
52 {
53 // Report if more than 100 MB (compressed) where written since the last status update
54 if (nbytesWritten < fNbytesNext)
55 return;
56 std::cout << "Wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
57 fNbytesNext += gUpdateFrequencyBytes;
58 if (nbytesWritten > fNbytesNext) {
59 // If we already passed the next threshold, increase by a sensible amount.
60 fNbytesNext = nbytesWritten + gUpdateFrequencyBytes;
61 }
62 }
63
64 void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) final
65 {
66 std::cout << "Done, wrote " << nbytesWritten / 1000 / 1000 << "MB, " << neventsWritten << " entries\n";
67 }
68};
69
70} // anonymous namespace
71
74{
75 *reinterpret_cast<std::string *>(field.fFieldBuffer) = reinterpret_cast<const char *>(branch.fBranchBuffer.get());
77}
78
79std::unique_ptr<ROOT::Experimental::RNTupleImporter>
80ROOT::Experimental::RNTupleImporter::Create(std::string_view sourceFileName, std::string_view treeName,
81 std::string_view destFileName)
82{
83 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
84 importer->fNTupleName = treeName;
85 importer->fSourceFile = std::unique_ptr<TFile>(TFile::Open(std::string(sourceFileName).c_str()));
86 if (!importer->fSourceFile || importer->fSourceFile->IsZombie()) {
87 throw RException(R__FAIL("cannot open source file " + std::string(sourceFileName)));
88 }
89
90 importer->fSourceTree = importer->fSourceFile->Get<TTree>(std::string(treeName).c_str());
91 if (!importer->fSourceTree) {
92 throw RException(R__FAIL("cannot read TTree " + std::string(treeName) + " from " + std::string(sourceFileName)));
93 }
94
95 // If we have IMT enabled, its best use is for parallel page compression
96 importer->fSourceTree->SetImplicitMT(false);
97 auto result = importer->InitDestination(destFileName);
98
99 if (!result)
101
102 return importer;
103}
104
105std::unique_ptr<ROOT::Experimental::RNTupleImporter>
106ROOT::Experimental::RNTupleImporter::Create(TTree *sourceTree, std::string_view destFileName)
107{
108 auto importer = std::unique_ptr<RNTupleImporter>(new RNTupleImporter());
109
110 if (sourceTree->IsA() == TChain::Class() && std::strcmp(sourceTree->GetName(), "") == 0) {
111 if (sourceTree->LoadTree(0) != 0)
112 throw RException(R__FAIL("failure retrieving first tree from provided chain"));
113 importer->fNTupleName = sourceTree->GetTree()->GetName();
114 } else {
115 importer->fNTupleName = sourceTree->GetName();
116 }
117
118 importer->fSourceTree = sourceTree;
119
120 // If we have IMT enabled, its best use is for parallel page compression
121 importer->fSourceTree->SetImplicitMT(false);
122 auto result = importer->InitDestination(destFileName);
123
124 if (!result)
126
127 return importer;
128}
129
131{
132 fDestFileName = destFileName;
133 fDestFile = std::unique_ptr<TFile>(TFile::Open(fDestFileName.c_str(), "UPDATE"));
134 if (!fDestFile || fDestFile->IsZombie()) {
135 return R__FAIL("cannot open dest file " + std::string(fDestFileName));
136 }
137
138 return RResult<void>::Success();
139}
140
142{
143 for (const auto &f : fImportFields) {
144 std::cout << "Importing '" << f.fField->GetFieldName() << "' [" << f.fField->GetTypeName() << "]\n";
145 }
146 for (const auto &f : Internal::GetProjectedFieldsOfModel(*fModel).GetFieldZero().GetSubFields()) {
147 std::cout << "Importing (projected) '" << f->GetFieldName() << "' [" << f->GetTypeName() << "]\n";
148 }
149}
150
152{
153 fImportBranches.clear();
154 fImportFields.clear();
155 fLeafCountCollections.clear();
158 fEntry = nullptr;
159}
160
162{
163 ResetSchema();
164
165 // Browse through all branches and their leaves, create corresponding fields and prepare the memory buffers for
166 // reading and writing. Usually, reading and writing share the same memory buffer, i.e. the object is read from TTree
167 // and written as-is to the RNTuple. There are exceptions, e.g. for leaf count arrays and C strings.
169 assert(b);
170 const auto firstLeaf = static_cast<TLeaf *>(b->GetListOfLeaves()->First());
171 assert(firstLeaf);
172
173 const bool isLeafList = b->GetNleaves() > 1;
174 const bool isCountLeaf = firstLeaf->IsRange(); // A leaf storing the number of elements of a leaf count array
175 const bool isClass = (firstLeaf->IsA() == TLeafElement::Class()); // STL or user-defined class
176 if (isLeafList && isClass)
177 return R__FAIL("unsupported: classes in leaf list, branch " + std::string(b->GetName()));
178 if (isLeafList && isCountLeaf)
179 return R__FAIL("unsupported: count leaf arrays in leaf list, branch " + std::string(b->GetName()));
180
181 // Only plain leafs with type identifies 'C' are C strings. Otherwise, they are char arrays.
182 // We use GetLeafCounter instead of GetLeafCount and GetLenStatic because the latter don't distinguish between
183 // char arrays and C strings.
184 Int_t firstLeafCountval;
185 const bool isCString = !isLeafList && (firstLeaf->IsA() == TLeafC::Class()) &&
186 (!firstLeaf->GetLeafCounter(firstLeafCountval)) && (firstLeafCountval == 1);
187
188 if (isCountLeaf) {
189 // This is a count leaf. We expect that this is not part of a leaf list. We also expect that the
190 // leaf count comes before any array leaves that use it.
191 // Count leaf branches do not end up as (physical) fields but they trigger the creation of an untyped
192 // collection, together the collection mode.
194 c.fMaxLength = firstLeaf->GetMaximum();
195 c.fCountVal = std::make_unique<Int_t>(); // count leafs are integers
196 // Casting to void * makes it work for both Int_t and UInt_t
197 fSourceTree->SetBranchAddress(b->GetName(), static_cast<void *>(c.fCountVal.get()));
198 fLeafCountCollections.emplace(firstLeaf->GetName(), std::move(c));
199 continue;
200 }
201
202 std::size_t branchBufferSize = 0; // Size of the memory location into which TTree reads the events' branch data
203 // For leaf lists, every leaf translates into a sub field of an untyped RNTuple record
204 std::vector<std::unique_ptr<RFieldBase>> recordItems;
205 // For leaf count arrays, we expect to find a single leaf; we don't add a field right away but only
206 // later through a projection
207 bool isLeafCountArray = false;
208 for (auto l : TRangeDynCast<TLeaf>(b->GetListOfLeaves())) {
209 if (l->IsA() == TLeafObject::Class()) {
210 return R__FAIL("unsupported: TObject branches, branch: " + std::string(b->GetName()));
211 }
212
213 // We don't use GetLeafCounter() because it relies on the correct format of the leaf title.
214 // There are files in the public where the title is broken (empty).
215 Int_t countval = l->GetLenStatic();
216 auto *countleaf = l->GetLeafCount();
217 const bool isFixedSizeArray = !isCString && (countleaf == nullptr) && (countval > 1);
218 isLeafCountArray = (countleaf != nullptr);
219
220 // The base case for branches with fundamental, single numerical types.
221 // For other types of branches, different field names or types are necessary,
222 // which is determined below.
223 std::string fieldName = b->GetName();
224 std::string fieldType = l->GetTypeName();
225
226 if (isLeafList)
227 fieldName = l->GetName();
228
229 if (isCString)
230 fieldType = "std::string";
231
232 if (isClass)
233 fieldType = b->GetClassName();
234
235 if (isFixedSizeArray)
236 fieldType = "std::array<" + fieldType + "," + std::to_string(countval) + ">";
237
239 // Replace any occurrence of a dot ('.') with an underscore.
240 std::replace(fieldName.begin(), fieldName.end(), '.', '_');
241 }
242
244 auto fieldOrError = RFieldBase::Create(fieldName, fieldType);
245 if (!fieldOrError)
246 return R__FORWARD_ERROR(fieldOrError);
247 auto field = fieldOrError.Unwrap();
248 if (isCString) {
249 branchBufferSize = l->GetMaximum();
250 f.fValue = std::make_unique<RFieldBase::RValue>(field->CreateValue());
251 f.fFieldBuffer = f.fValue->GetPtr<void>().get();
252 fImportTransformations.emplace_back(
253 std::make_unique<RCStringTransformation>(fImportBranches.size(), fImportFields.size()));
254 } else {
255 if (isClass) {
256 // For classes, the branch buffer contains a pointer to object, which gets instantiated by TTree upon
257 // calling SetBranchAddress()
258 branchBufferSize = sizeof(void *) * countval;
259 } else if (isLeafCountArray) {
260 branchBufferSize = fLeafCountCollections[countleaf->GetName()].fMaxLength * field->GetValueSize();
261 } else {
262 branchBufferSize = l->GetOffset() + field->GetValueSize();
263 }
264 }
265 f.fField = field.get();
266
267 if (isLeafList) {
268 recordItems.emplace_back(std::move(field));
269 } else if (isLeafCountArray) {
270 const std::string countleafName = countleaf->GetName();
271 fLeafCountCollections[countleafName].fLeafFields.emplace_back(std::move(field));
272 fLeafCountCollections[countleafName].fLeafBranchIndexes.emplace_back(fImportBranches.size());
273 R__ASSERT(b->GetListOfLeaves()->GetEntries() == 1);
274 break;
275 } else {
276 fModel->AddField(std::move(field));
277 fImportFields.emplace_back(std::move(f));
278 }
279 }
280 if (!recordItems.empty()) {
281 auto recordField = std::make_unique<RRecordField>(b->GetName(), std::move(recordItems));
283 f.fField = recordField.get();
284 fImportFields.emplace_back(std::move(f));
285 fModel->AddField(std::move(recordField));
286 }
287
288 RImportBranch ib;
289 ib.fBranchName = b->GetName();
290 ib.fBranchBuffer = std::make_unique<unsigned char[]>(branchBufferSize);
291 if (isClass) {
292 auto klass = TClass::GetClass(b->GetClassName());
293 if (!klass) {
294 return R__FAIL("unable to load class " + std::string(b->GetClassName()) + " for branch " +
295 std::string(b->GetName()));
296 }
297 auto ptrBuf = reinterpret_cast<void **>(ib.fBranchBuffer.get());
298 fSourceTree->SetBranchAddress(b->GetName(), ptrBuf, klass, EDataType::kOther_t, true /* isptr*/);
299 } else {
300 fSourceTree->SetBranchAddress(b->GetName(), reinterpret_cast<void *>(ib.fBranchBuffer.get()));
301 }
302
303 // If the TTree branch type and the RNTuple field type match, use the branch read buffer as RNTuple write buffer
304 if (!isLeafCountArray && !fImportFields.back().fFieldBuffer) {
305 fImportFields.back().fFieldBuffer =
306 isClass ? *reinterpret_cast<void **>(ib.fBranchBuffer.get()) : ib.fBranchBuffer.get();
307 }
308
309 fImportBranches.emplace_back(std::move(ib));
310 }
311
312 int iLeafCountCollection = 0;
313 for (auto &p : fLeafCountCollections) {
314 // We want to capture this variable, which is not possible with a
315 // structured binding in C++17. Explicitly defining a variable works.
316 auto countLeafName = p.first;
317 auto &c = p.second;
318
319 c.fFieldName = "_collection" + std::to_string(iLeafCountCollection);
320 auto recordField = std::make_unique<RRecordField>("_0", std::move(c.fLeafFields));
321 c.fRecordField = recordField.get();
322 auto collectionField = RVectorField::CreateUntyped(c.fFieldName, std::move(recordField));
323 fModel->AddField(std::move(collectionField));
324
325 // Add projected fields for all leaf count arrays
326 for (const auto leaf : c.fRecordField->GetSubFields()) {
327 const auto name = leaf->GetFieldName();
328 auto projectedField = RFieldBase::Create(name, "ROOT::VecOps::RVec<" + leaf->GetTypeName() + ">").Unwrap();
329 fModel->AddProjectedField(std::move(projectedField), [&name, &c](const std::string &fieldName) {
330 if (fieldName == name)
331 return c.fFieldName;
332 else
333 return c.fFieldName + "._0." + name;
334 });
335 }
336
338 // Replace any occurrenceof a dot ('.') in the count leaf name with an underscore.
339 std::replace(countLeafName.begin(), countLeafName.end(), '.', '_');
340 }
341
342 // Add projected fields for count leaf
343 auto projectedField = RFieldBase::Create(countLeafName, "ROOT::RNTupleCardinality<std::uint32_t>").Unwrap();
344 fModel->AddProjectedField(std::move(projectedField), [&c](const std::string &) { return c.fFieldName; });
345
346 iLeafCountCollection++;
347 }
348
349 if (fFieldModifier) {
350 for (auto &field : fModel->GetMutableFieldZero()) {
351 fFieldModifier(field);
352 }
353 }
354
355 fModel->Freeze();
356
357 fEntry = fModel->CreateBareEntry();
358 for (const auto &f : fImportFields) {
359 fEntry->BindRawPtr(f.fField->GetFieldName(), f.fFieldBuffer);
360 }
361 for (auto &[_, c] : fLeafCountCollections) {
362 fEntry->BindRawPtr<void>(c.fFieldName, &c.fFieldBuffer);
363 }
364
365 if (!fIsQuiet)
366 ReportSchema();
367
368 return RResult<void>::Success();
369}
370
372{
373 if (fDestFile->FindKey(fNTupleName.c_str()) != nullptr)
374 throw RException(R__FAIL("Key '" + fNTupleName + "' already exists in file " + fDestFileName));
375
377
378 std::unique_ptr<Internal::RPageSink> sink =
379 std::make_unique<Internal::RPageSinkFile>(fNTupleName, *fDestFile, fWriteOptions);
380 sink->GetMetrics().Enable();
381 auto ctrZippedBytes = sink->GetMetrics().GetCounter("RPageSinkFile.szWritePayload");
382
384 sink = std::make_unique<Internal::RPageSinkBuf>(std::move(sink));
385 }
386
387 auto ntplWriter = Internal::CreateRNTupleWriter(std::move(fModel), std::move(sink));
388 // The guard needs to be destructed before the writer goes out of scope
389 RImportGuard importGuard(*this);
390
391 fProgressCallback = fIsQuiet ? nullptr : std::make_unique<RDefaultProgressCallback>();
392
393 auto nEntries = fSourceTree->GetEntries();
394
395 if (fMaxEntries >= 0 && fMaxEntries < nEntries) {
396 nEntries = fMaxEntries;
397 }
398
399 for (decltype(nEntries) i = 0; i < nEntries; ++i) {
401
402 for (auto &[_, c] : fLeafCountCollections) {
403 const auto sizeOfRecord = c.fRecordField->GetValueSize();
404 c.fFieldBuffer.resize(sizeOfRecord * (*c.fCountVal));
405
406 const auto nLeafs = c.fRecordField->GetSubFields().size();
407 for (std::size_t l = 0; l < nLeafs; ++l) {
408 const auto offset = c.fRecordField->GetOffsets()[l];
409 const auto sizeOfLeaf = c.fRecordField->GetSubFields()[l]->GetValueSize();
410 const auto idxImportBranch = c.fLeafBranchIndexes[l];
411 for (Int_t j = 0; j < *c.fCountVal; ++j) {
412 memcpy(c.fFieldBuffer.data() + j * sizeOfRecord + offset,
413 fImportBranches[idxImportBranch].fBranchBuffer.get() + (j * sizeOfLeaf), sizeOfLeaf);
414 }
415 }
416 }
417
418 for (auto &t : fImportTransformations) {
419 auto result = t->Transform(fImportBranches[t->fImportBranchIdx], fImportFields[t->fImportFieldIdx]);
420 if (!result)
422 }
423
424 ntplWriter->Fill(*fEntry);
425
427 fProgressCallback->Call(ctrZippedBytes->GetValueAsInt(), i);
428 }
430 fProgressCallback->Finish(ctrZippedBytes->GetValueAsInt(), nEntries);
431}
#define R__FORWARD_ERROR(res)
Short-hand to return an RResult<T> in an error state (i.e. after checking)
Definition RError.hxx:294
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:290
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
@ kOther_t
Definition TDataType.h:32
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
Definition TError.h:125
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
char name[80]
Definition TGX11.cxx:110
#define _(A, B)
Definition cfortran.h:108
Base class for all ROOT issued exceptions.
Definition RError.hxx:78
std::vector< RFieldBase * > GetSubFields()
Definition RField.cxx:995
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &canonicalType, const std::string &typeAlias, bool continueOnError=false)
Factory method to resurrect a field from the stored on-disk type information.
Definition RField.cxx:612
Used to report every ~100 MB (compressed), and at the end about the status of the import.
virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten)=0
bool fConvertDotsInBranchNames
Whether or not dot characters in branch names should be converted to underscores.
std::int64_t fMaxEntries
The maximum number of entries to import. When this value is -1 (default), import all entries.
std::map< std::string, RImportLeafCountCollection > fLeafCountCollections
Maps the count leaf to the information about the corresponding untyped collection.
std::vector< RImportBranch > fImportBranches
static std::unique_ptr< RNTupleImporter > Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName)
Opens the input file for reading and the output file for writing (update).
std::unique_ptr< RProgressCallback > fProgressCallback
RResult< void > PrepareSchema()
Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory ...
ROOT::Experimental::RResult< void > InitDestination(std::string_view destFileName)
void Import()
Import works in two steps:
bool fIsQuiet
No standard output, conversely if set to false, schema information and progress is printed.
std::vector< RImportField > fImportFields
std::unique_ptr< RNTupleModel > fModel
std::vector< std::unique_ptr< RImportTransformation > > fImportTransformations
The list of transformations to be performed for every entry.
static std::unique_ptr< RNTupleModel > CreateBare()
A bare model has no default entry.
The class is used as a return type for operations that can fail; wraps a value of type T or an RError...
Definition RError.hxx:194
static std::unique_ptr< RVectorField > CreateUntyped(std::string_view fieldName, std::unique_ptr< RFieldBase > itemField)
Definition RField.cxx:2641
static TClass * Class()
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Definition TClass.cxx:3037
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4089
static TClass * Class()
static TClass * Class()
static TClass * Class()
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
TClass * IsA() const override
Definition TLine.h:79
const char * GetName() const override
Returns name of object.
Definition TNamed.h:47
virtual const char * GetName() const
Returns name of object.
Definition TObject.cxx:444
A TTree represents a columnar dataset.
Definition TTree.h:79
virtual Int_t GetEntry(Long64_t entry, Int_t getall=0)
Read all branches of entry and return total number of bytes read.
Definition TTree.cxx:5638
virtual Int_t SetBranchAddress(const char *bname, void *add, TBranch **ptr=nullptr)
Change branch address, dealing with clone trees properly.
Definition TTree.cxx:8385
virtual Long64_t GetEntries() const
Definition TTree.h:463
virtual void SetImplicitMT(bool enabled)
Definition TTree.h:661
virtual TObjArray * GetListOfBranches()
Definition TTree.h:528
virtual TTree * GetTree() const
Definition TTree.h:557
virtual Long64_t LoadTree(Long64_t entry)
Set current entry.
Definition TTree.cxx:6473
TClass * IsA() const override
Definition TTree.h:705
RProjectedFields & GetProjectedFieldsOfModel(RNTupleModel &model)
std::unique_ptr< RNTupleWriter > CreateRNTupleWriter(std::unique_ptr< RNTupleModel > model, std::unique_ptr< Internal::RPageSink > sink)
RResult< void > Transform(const RImportBranch &branch, RImportField &field) final
std::string fBranchName
Top-level branch name from the input TTree.
std::unique_ptr< unsigned char[]> fBranchBuffer
The destination of SetBranchAddress() for fBranchName
void * fFieldBuffer
Usually points to the corresponding RImportBranch::fBranchBuffer but not always.
RFieldBase * fField
The field is kept during schema preparation and transferred to the fModel before the writing starts.
When the schema is set up and the import started, it needs to be reset before the next Import() call ...
Int_t fMaxLength
Stores count leaf GetMaximum() to create large enough buffers for the array leafs.
TLine l
Definition textangle.C:4