Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RDFSnapshotHelpers.cxx
Go to the documentation of this file.
1/**
2 \file RDFSnapshotHelpers.cxx
3 \ingroup dataframe
4 \author Enrico Guiraud, CERN
5 \author Danilo Piparo, CERN
6 \date 2016-12
7 \author Vincenzo Eduardo Padulano
8 \author Stephan Hageboeck
9 \date 2025-06
10*/
11
12/*************************************************************************
13 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
14 * All rights reserved. *
15 * *
16 * For the licensing terms see $ROOTSYS/LICENSE. *
17 * For the list of contributors see $ROOTSYS/README/CREDITS. *
18 *************************************************************************/
19
21
22#include <ROOT/REntry.hxx>
23#include <ROOT/RFieldToken.hxx>
24#include <ROOT/RNTuple.hxx>
25#include <ROOT/RNTupleDS.hxx>
28#include <ROOT/RTTreeDS.hxx>
30
31#include <TBranchObject.h>
32#include <TClassEdit.h>
33#include <TDictionary.h>
34#include <TDataType.h>
35#include <TFile.h>
36#include <TLeaf.h>
37#include <TTreeReader.h>
38
39namespace {
40
42{
43 if (inputTree) {
44 if (auto *getBranchRes = inputTree->GetBranch(branchName.c_str()))
45 return getBranchRes;
46
47 // try harder
48 if (auto *findBranchRes = inputTree->FindBranch(branchName.c_str()))
49 return findBranchRes;
50 }
51 return nullptr;
52}
53
55 const std::string &outputBranchName, int basketSize, void *address)
56{
57 if (!inputBranch)
58 return;
59 const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName());
60 if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec)
61 return;
62 // must construct the leaflist for the output branch and create the branch in the output tree
63 const auto *leaf = static_cast<TLeaf *>(inputBranch->GetListOfLeaves()->UncheckedAt(0));
64 if (!leaf)
65 return;
66 const auto bname = leaf->GetName();
67 auto *sizeLeaf = leaf->GetLeafCount();
68 const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic());
69
70 // We proceed only if branch is a fixed-or-variable-sized array
71 if (sizeLeaf || leaf->GetLenStatic() > 1) {
73 // The output array branch `bname` has dynamic size stored in leaf `sizeLeafName`, but that leaf has not been
74 // added to the output tree yet. However, the size leaf has to be available for the creation of the array
75 // branch to be successful. So we create the size leaf here.
77 // A user-provided basket size takes precedence; otherwise reuse the basket size of the existing size-leaf branch.
78 const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize();
79 // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for `sizeLeafName`
80 auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast<void *>(nullptr),
81 (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize);
83 }
84
85 const auto btype = leaf->GetTypeName();
87 if (rootbtype == ' ') {
88 Warning("Snapshot",
89 "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The "
90 "leaf is of type '%s'. This column will not be written out.",
91 bname, btype);
92 return;
93 }
94
95 const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype;
96 // A user-provided basket size takes precedence; otherwise reuse the basket size of the input branch
97 const auto bufSize = (basketSize > 0) ? basketSize : inputBranch->GetBasketSize();
98 void *addressForBranch = [address]() -> void * {
99 if (address) {
100 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we need
101 // its buffer, so we cast it and extract the address of the buffer
102 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(address);
103 return rawRVec->data();
104 }
105 return nullptr;
106 }();
107 auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize);
108 outputBranch->SetTitle(inputBranch->GetTitle());
110 }
111}
112
113void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&outputBranchAddress, bool isCArray,
114 void *valueAddress)
115{
116 const static TClassRef TBOClRef("TBranchObject");
117 if (inputBranch && inputBranch->IsA() == TBOClRef) {
118 outputBranch.SetAddress(reinterpret_cast<void **>(inputBranch->GetAddress()));
119 } else if (outputBranch.IsA() != TBranch::Class()) {
122 } else {
123 void *correctAddress = [valueAddress, isCArray]() -> void * {
124 if (isCArray) {
125 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
126 // need its buffer, so we cast it and extract the address of the buffer
127 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
128 return rawRVec->data();
129 }
130 return valueAddress;
131 }();
132 outputBranch.SetAddress(correctAddress);
134 }
135}
136
139 int bufSize)
140{
141 // Logic taken from
142 // TTree::BranchImpRef(
143 // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel)
145 if (rootTypeChar == ' ') {
146 Warning("Snapshot",
147 "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This "
148 "column will not be written out.",
149 outputBranchName.c_str());
150 return;
151 }
152 std::string leafList{outputBranchName + '/' + rootTypeChar};
153 auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize);
155}
156
157/// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the
158/// TFile can be opened in the mode specified in `opts`, deleting any existing TTrees in case
159/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
161 const std::string &fileName)
162{
163 TString fileMode = opts.fMode;
164 fileMode.ToLower();
165 if (fileMode != "update")
166 return;
167
168 // output file opened in "update" mode: must check whether output TTree is already present in file
169 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "update")};
170 if (!outFile || outFile->IsZombie())
171 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
172
173 TObject *outTree = outFile->Get(treeName.c_str());
174 if (outTree == nullptr)
175 return;
176
177 // object called treeName is already present in the file
178 if (opts.fOverwriteIfExists) {
179 if (outTree->InheritsFrom("TTree")) {
180 static_cast<TTree *>(outTree)->Delete("all");
181 } else {
182 outFile->Delete(treeName.c_str());
183 }
184 } else {
185 const std::string msg = "Snapshot: tree \"" + treeName + "\" already present in file \"" + fileName +
186 "\". If you want to delete the original tree and write another, please set "
187 "RSnapshotOptions::fOverwriteIfExists to true.";
188 throw std::invalid_argument(msg);
189 }
190}
191
192/// Ensure that the RNTuple with the resulting snapshot can be written to the target TFile. This means checking that the
193/// TFile can be opened in the mode specified in `opts`, deleting any existing RNTuples in case
194/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
196 const std::string &fileName)
197{
198 TString fileMode = opts.fMode;
199 fileMode.ToLower();
200 if (fileMode != "update")
201 return;
202
203 // output file opened in "update" mode: must check whether output RNTuple is already present in file
204 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "update")};
205 if (!outFile || outFile->IsZombie())
206 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
207
208 auto *outNTuple = outFile->Get<ROOT::RNTuple>(ntupleName.c_str());
209
210 if (outNTuple) {
211 if (opts.fOverwriteIfExists) {
212 outFile->Delete((ntupleName + ";*").c_str());
213 return;
214 } else {
215 const std::string msg = "Snapshot: RNTuple \"" + ntupleName + "\" already present in file \"" + fileName +
216 "\". If you want to delete the original ntuple and write another, please set "
217 "the 'fOverwriteIfExists' option to true in RSnapshotOptions.";
218 throw std::invalid_argument(msg);
219 }
220 }
221
222 // Also check if there is any object other than an RNTuple with the provided ntupleName.
223 TObject *outObj = outFile->Get(ntupleName.c_str());
224
225 if (!outObj)
226 return;
227
228 // An object called ntupleName is already present in the file.
229 if (opts.fOverwriteIfExists) {
230 if (auto tree = dynamic_cast<TTree *>(outObj)) {
231 tree->Delete("all");
232 } else {
233 outFile->Delete((ntupleName + ";*").c_str());
234 }
235 } else {
236 const std::string msg = "Snapshot: object \"" + ntupleName + "\" already present in file \"" + fileName +
237 "\". If you want to delete the original object and write a new RNTuple, please set "
238 "the 'fOverwriteIfExists' option to true in RSnapshotOptions.";
239 throw std::invalid_argument(msg);
240 }
241}
242
244 int basketSize, const std::string &inputBranchName, const std::string &outputBranchName,
245 const std::type_info &valueTypeID, void *valueAddress, TBranch *&actionHelperBranchPtr,
247{
248
250
251 // Respect the original bufsize and splitlevel arguments
252 // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid
253 // writing garbage when unsplit objects cannot be written as split objects (e.g. in case of a polymorphic
254 // TObject branch, see https://bit.ly/2EjLMId ).
255 // A user-provided basket size value takes precedence.
256 const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000);
257 const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99;
258
260 // The output branch was already created, we just need to (re)set its address
263 return;
264 }
265
267 if (dynamic_cast<TDataType *>(dictionary)) {
268 // Branch of fundamental type
270 return;
271 }
272
273 if (!isDefine) {
274 // Cases where we need a leaflist (e.g. C-style arrays)
275 // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is
276 // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is
277 // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree
278 // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec).
280 }
282 // A branch was created in the previous function call
284 if (valueAddress) {
285 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
286 // need its buffer, so we cast it and extract the address of the buffer
287 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
289 }
290 return;
291 }
292
293 if (auto *classPtr = dynamic_cast<TClass *>(dictionary)) {
295 // Case of unsplit object with polymorphic type
296 if (inputBranch && dynamic_cast<TBranchObject *>(inputBranch) && valueAddress)
298 inputBranch->GetAddress(), bufSize, splitLevel);
299 // General case, with valid address
300 else if (valueAddress)
304 // No value was passed, we're just creating a hollow branch to populate the dataset schema
305 else
306 outputBranch = outputTree.Branch(outputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize);
308 return;
309 }
310
311 // We are not aware of other cases
312 throw std::logic_error(
313 "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
314}
315} // namespace
316
318{
319 auto it = std::find(fNames.begin(), fNames.end(), name);
320 if (it == fNames.end())
321 return nullptr;
322 return fBranches[std::distance(fNames.begin(), it)];
323}
324
326{
327 if (auto it = std::find(fNames.begin(), fNames.end(), name); it != fNames.end())
328 return fIsCArray[std::distance(fNames.begin(), it)];
329 return false;
330}
331
332void ROOT::Internal::RDF::RBranchSet::Insert(const std::string &name, TBranch *address, bool isCArray)
333{
334 if (address == nullptr) {
335 throw std::logic_error("Trying to insert a null branch address.");
336 }
337 if (std::find(fBranches.begin(), fBranches.end(), address) != fBranches.end()) {
338 throw std::logic_error("Trying to insert a branch address that's already present.");
339 }
340 if (std::find(fNames.begin(), fNames.end(), name) != fNames.end()) {
341 throw std::logic_error("Trying to insert a branch name that's already present.");
342 }
343 fNames.emplace_back(name);
344 fBranches.emplace_back(address);
345 fIsCArray.push_back(isCArray);
346}
347
349{
350 fBranches.clear();
351 fNames.clear();
352 fIsCArray.clear();
353}
354
356{
357 std::vector<TBranch *> branchesWithNullAddress;
358 std::copy_if(fBranches.begin(), fBranches.end(), std::back_inserter(branchesWithNullAddress),
359 [](TBranch *b) { return b->GetAddress() == nullptr; });
360
361 if (branchesWithNullAddress.empty())
362 return;
363
364 // otherwise build error message and throw
365 std::vector<std::string> missingBranchNames;
367 std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); });
368 std::string msg = "RDataFrame::Snapshot:";
369 if (missingBranchNames.size() == 1) {
370 msg += " branch " + missingBranchNames[0] +
371 " is needed as it provides the size for one or more branches containing dynamically sized arrays, but "
372 "it is";
373 } else {
374 msg += " branches ";
375 for (const auto &bName : missingBranchNames)
376 msg += bName + ", ";
377 msg.resize(msg.size() - 2); // remove last ", "
378 msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are";
379 }
380 msg += " not part of the set of branches that are being written out.";
381 throw std::runtime_error(msg);
382}
383
385 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
386 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
388 const std::vector<const std::type_info *> &colTypeIDs)
389 : fFileName(filename),
390 fDirName(dirname),
391 fTreeName(treename),
392 fOptions(options),
393 fInputBranchNames(vbnames),
394 fOutputBranchNames(ReplaceDotWithUnderscore(bnames)),
395 fBranches(vbnames.size(), nullptr),
396 fBranchAddresses(vbnames.size(), nullptr),
397 fIsDefine(std::move(isDefine)),
398 fOutputLoopManager(loopManager),
399 fInputLoopManager(inputLM),
400 fInputColumnTypeIDs(colTypeIDs)
401{
403}
404
405// Define special member methods here where the definition of all the data member types is available
409 ROOT::Internal::RDF::UntypedSnapshotTTreeHelper &&) noexcept = default;
410
412{
413 if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) {
414 const auto fileOpenMode = [&]() {
415 TString checkupdate = fOptions.fMode;
416 checkupdate.ToLower();
417 return checkupdate == "update" ? "updated" : "created";
418 }();
419 Warning("Snapshot",
420 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
421 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
422 "its result in a variable and for example calling the GetValue() method on it.",
423 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
424 }
425}
426
428{
429 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
430 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
431 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
432 fInputTree = treeDS->GetTree();
433 fBranchAddressesNeedReset = true;
434}
435
436void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Exec(unsigned int, const std::vector<void *> &values)
437{
438 if (!fBranchAddressesNeedReset) {
439 UpdateCArraysPtrs(values);
440 } else {
441 SetBranches(values);
442 fBranchAddressesNeedReset = false;
443 }
444
445 fOutputTree->Fill();
446}
447
449{
450 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
451 // associated with those branches are re-allocated. As a result, the value of the pointer can change, leaving
452 // the branch of the output tree associated with an invalid pointer.
453 // With this code, we set the value of the pointer in the output branch anew when needed.
454 assert(values.size() == fBranches.size());
455 auto nValues = values.size();
456 for (decltype(nValues) i{}; i < nValues; i++) {
457 if (fBranches[i] && fOutputBranches.IsCArray(fOutputBranchNames[i])) {
458 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
459 // need its buffer, so we cast it and extract the address of the buffer
460 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
461 if (auto *data = rawRVec->data(); fBranchAddresses[i] != data) {
462 // reset the branch address
463 fBranches[i]->SetAddress(data);
464 fBranchAddresses[i] = data;
465 }
466 }
467 }
468}
469
471{
472 // create branches in output tree
473 auto nValues = values.size();
474 for (decltype(nValues) i{}; i < nValues; i++) {
475 SetBranchesHelper(fInputTree, *fOutputTree, fOutputBranches, fOptions.fBasketSize, fInputBranchNames[i],
476 fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], fBranches[i], fBranchAddresses[i],
477 fIsDefine[i]);
478 }
479 fOutputBranches.AssertNoNullBranchAddresses();
480}
481
483{
484 void *dummyValueAddress{};
486 void *dummyTBranchAddress{};
488 auto nBranches = fInputBranchNames.size();
489 for (decltype(nBranches) i{}; i < nBranches; i++) {
490 SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i],
491 fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr,
492 dummyTBranchAddress, fIsDefine[i]);
493 }
494}
495
497{
498 fOutputFile.reset(
499 TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"",
500 ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
501 if (!fOutputFile)
502 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
503
504 TDirectory *outputDir = fOutputFile.get();
505 if (!fDirName.empty()) {
506 TString checkupdate = fOptions.fMode;
507 checkupdate.ToLower();
508 if (checkupdate == "update")
509 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
510 else
511 outputDir = fOutputFile->mkdir(fDirName.c_str());
512 }
513
514 fOutputTree = std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/outputDir);
515
516 if (fOptions.fAutoFlush)
517 fOutputTree->SetAutoFlush(fOptions.fAutoFlush);
518}
519
521{
522 assert(fOutputTree != nullptr);
523 assert(fOutputFile != nullptr);
524
525 // There were no entries to fill the TTree with (either the input TTree was empty or no event passed after
526 // filtering). We have already created an empty TTree, now also create the branches to preserve the schema
527 if (fOutputTree->GetEntries() == 0) {
528 SetEmptyBranches(fInputTree, *fOutputTree);
529 }
530 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
531 fOutputTree->AutoSave("flushbaskets");
532 // must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
533 fOutputTree.reset();
534 fOutputFile->Close();
535
536 // Now connect the data source to the loop manager so it can be used for further processing
537 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
538 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
539}
540
541/**
542 * \brief Create a new UntypedSnapshotTTreeHelper with a different output file name
543 *
544 * \param newName A type-erased string with the output file name
545 * \return UntypedSnapshotTTreeHelper
546 *
547 * This MakeNew implementation is tied to the cloning feature of actions
548 * of the computation graph. In particular, cloning a Snapshot node usually
549 * also involves changing the name of the output file, otherwise the cloned
550 * Snapshot would overwrite the same file.
551 */
554{
555 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
557 fDirName,
558 fTreeName,
559 fInputBranchNames,
560 fOutputBranchNames,
561 fOptions,
562 std::vector<bool>(fIsDefine),
563 fOutputLoopManager,
564 fInputLoopManager,
565 fInputColumnTypeIDs};
566}
567
569 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename,
570 const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
572 const std::vector<const std::type_info *> &colTypeIDs)
573 : fNSlots(nSlots),
574 fOutputFiles(fNSlots),
575 fOutputTrees(fNSlots),
576 fBranchAddressesNeedReset(fNSlots, 1),
577 fInputTrees(fNSlots),
578 fBranches(fNSlots, std::vector<TBranch *>(vbnames.size(), nullptr)),
579 fBranchAddresses(fNSlots, std::vector<void *>(vbnames.size(), nullptr)),
580 fOutputBranches(fNSlots),
581 fFileName(filename),
582 fDirName(dirname),
583 fTreeName(treename),
584 fOptions(options),
585 fOutputBranchNames(ReplaceDotWithUnderscore(bnames)),
586 fOutputLoopManager(loopManager),
587 fInputLoopManager(inputLM),
588 fInputBranchNames(vbnames),
589 fInputColumnTypeIDs(colTypeIDs),
590 fIsDefine(std::move(isDefine))
591{
593}
594
595// Define special member methods here where the definition of all the data member types is available
600
602{
603 if (!fTreeName.empty() /*not moved from*/ && fOptions.fLazy && !fOutputFiles.empty() &&
604 std::all_of(fOutputFiles.begin(), fOutputFiles.end(), [](const auto &f) { return !f; }) /* never run */) {
605 const auto fileOpenMode = [&]() {
606 TString checkupdate = fOptions.fMode;
607 checkupdate.ToLower();
608 return checkupdate == "update" ? "updated" : "created";
609 }();
610 Warning("Snapshot",
611 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
612 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
613 "its result in a variable and for example calling the GetValue() method on it.",
614 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
615 }
616}
617
619{
620 ::TDirectory::TContext c; // do not let tasks change the thread-local gDirectory
621 if (!fOutputFiles[slot]) {
622 // first time this thread executes something, let's create a TBufferMerger output directory
623 fOutputFiles[slot] = fMerger->GetFile();
624 }
625 TDirectory *treeDirectory = fOutputFiles[slot].get();
626 if (!fDirName.empty()) {
627 // call returnExistingDirectory=true since MT can end up making this call multiple times
628 treeDirectory = fOutputFiles[slot]->mkdir(fDirName.c_str(), "", true);
629 }
630 // re-create output tree as we need to create its branches again, with new input variables
631 // TODO we could instead create the output tree and its branches, change addresses of input variables in each task
632 fOutputTrees[slot] =
633 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
634 fOutputTrees[slot]->SetBit(TTree::kEntriesReshuffled);
635 // TODO can be removed when RDF supports interleaved TBB task execution properly, see ROOT-10269
636 fOutputTrees[slot]->SetImplicitMT(false);
637 if (fOptions.fAutoFlush)
638 fOutputTrees[slot]->SetAutoFlush(fOptions.fAutoFlush);
639 if (r) {
640 // We could be getting a task-local TTreeReader from the TTreeProcessorMT.
641 fInputTrees[slot] = r->GetTree();
642 } else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource())) {
643 fInputTrees[slot] = treeDS->GetTree();
644 }
645 fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
646}
647
649{
650 if (fOutputTrees[slot]->GetEntries() > 0)
651 fOutputFiles[slot]->Write();
652 // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones)
653 fOutputTrees[slot].reset(nullptr);
654 fOutputBranches[slot].Clear();
655}
656
657void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector<void *> &values)
658{
659 if (fBranchAddressesNeedReset[slot] == 0) {
660 UpdateCArraysPtrs(slot, values);
661 } else {
662 SetBranches(slot, values);
663 fBranchAddressesNeedReset[slot] = 0;
664 }
665 fOutputTrees[slot]->Fill();
666 auto entries = fOutputTrees[slot]->GetEntries();
667 auto autoFlush = fOutputTrees[slot]->GetAutoFlush();
668 if ((autoFlush > 0) && (entries % autoFlush == 0))
669 fOutputFiles[slot]->Write();
670}
671
673 const std::vector<void *> &values)
674{
675 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
676 // associated with those branches are re-allocated. As a result, the value of the pointer can change, leaving
677 // the branch of the output tree associated with an invalid pointer.
678 // With this code, we set the value of the pointer in the output branch anew when needed.
679 assert(values.size() == fBranches[slot].size());
680 auto nValues = values.size();
681 for (decltype(nValues) i{}; i < nValues; i++) {
682 if (fBranches[slot][i] && fOutputBranches[slot].IsCArray(fOutputBranchNames[i])) {
683 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
684 // need its buffer, so we cast it and extract the address of the buffer
685 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
686 if (auto *data = rawRVec->data(); fBranchAddresses[slot][i] != data) {
687 // reset the branch address
688 fBranches[slot][i]->SetAddress(data);
689 fBranchAddresses[slot][i] = data;
690 }
691 }
692 }
693}
694
696 const std::vector<void *> &values)
697{
698 // create branches in output tree
699 auto nValues = values.size();
700 for (decltype(nValues) i{}; i < nValues; i++) {
701 SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], fOutputBranches[slot], fOptions.fBasketSize,
702 fInputBranchNames[i], fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i],
703 fBranches[slot][i], fBranchAddresses[slot][i], fIsDefine[i]);
704 }
705 fOutputBranches[slot].AssertNoNullBranchAddresses();
706}
707
709{
710 void *dummyValueAddress{};
712 void *dummyTBranchAddress{};
714 auto nBranches = fInputBranchNames.size();
715 for (decltype(nBranches) i{}; i < nBranches; i++) {
716 SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i],
717 fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr,
718 dummyTBranchAddress, fIsDefine[i]);
719 }
720}
721
723{
724 const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
725 auto outFile =
726 std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)};
727 if (!outFile)
728 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
729 fOutputFile = outFile.get();
730 fMerger = std::make_unique<ROOT::TBufferMerger>(std::move(outFile));
731}
732
734{
735
736 for (auto &file : fOutputFiles) {
737 if (file) {
738 file->Write();
739 file->Close();
740 }
741 }
742
743 // If there were no entries to fill the TTree with (either the input TTree was empty or no event passed after
744 // filtering), create an empty TTree in the output file and create the branches to preserve the schema
745 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
746 assert(fOutputFile && "Missing output file in Snapshot finalization.");
747 if (!fOutputFile->Get(fullTreeName.c_str())) {
748
749 // First find in which directory we need to write the output TTree
750 TDirectory *treeDirectory = fOutputFile;
751 if (!fDirName.empty()) {
752 treeDirectory = fOutputFile->mkdir(fDirName.c_str(), "", true);
753 }
755
756 // Create the output TTree and create the user-requested branches
757 auto outTree =
758 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
759 TTree *inputTree{};
760 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
761 inputTree = treeDS->GetTree();
762 SetEmptyBranches(inputTree, *outTree);
763
764 fOutputFile->Write();
765 }
766
767 // flush all buffers to disk by destroying the TBufferMerger
768 fOutputFiles.clear();
769 fMerger.reset();
770
771 // Now connect the data source to the loop manager so it can be used for further processing
772 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
773}
774
775/**
776 * \brief Create a new UntypedSnapshotTTreeHelperMT with a different output file name
777 *
778 * \param newName A type-erased string with the output file name
779 * \return UntypedSnapshotTTreeHelperMT
780 *
781 * This MakeNew implementation is tied to the cloning feature of actions
782 * of the computation graph. In particular, cloning a Snapshot node usually
783 * also involves changing the name of the output file, otherwise the cloned
784 * Snapshot would overwrite the same file.
785 */
788{
789 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
791 finalName,
792 fDirName,
793 fTreeName,
794 fInputBranchNames,
795 fOutputBranchNames,
796 fOptions,
797 std::vector<bool>(fIsDefine),
798 fOutputLoopManager,
799 fInputLoopManager,
800 fInputColumnTypeIDs};
801}
802
804 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename,
805 const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options,
807 const std::vector<const std::type_info *> &colTypeIDs)
808 : fFileName(filename),
809 fDirName(dirname),
810 fNTupleName(ntuplename),
811 fOptions(options),
812 fInputLoopManager(inputLM),
813 fOutputLoopManager(outputLM),
814 fInputFieldNames(vfnames),
815 fOutputFieldNames(ReplaceDotWithUnderscore(fnames)),
816 fNSlots(nSlots),
817 fFillContexts(nSlots),
818 fEntries(nSlots),
819 fInputColumnTypeIDs(colTypeIDs)
820{
822}
823
824// Define special member methods here where the definition of all the data member types is available
829
831{
832 if (!fNTupleName.empty() /* not moved from */ && !fOutputFile /* did not run */ && fOptions.fLazy)
833 Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
834}
835
837{
838 auto model = ROOT::RNTupleModel::CreateBare();
839 auto nFields = fOutputFieldNames.size();
840 fFieldTokens.resize(nFields);
841 for (decltype(nFields) i = 0; i < nFields; i++) {
842 // Need to retrieve the type of every field to create as a string
843 // If the input type for a field does not have RTTI, internally we store it as the tag UseNativeDataType. When
844 // that is detected, we need to ask the data source which is the type name based on the on-disk information.
845 const auto typeName = *fInputColumnTypeIDs[i] == typeid(ROOT::Internal::RDF::UseNativeDataType)
846 ? ROOT::Internal::RDF::GetTypeNameWithOpts(*fInputLoopManager->GetDataSource(),
847 fInputFieldNames[i], fOptions.fVector2RVec)
848 : ROOT::Internal::RDF::TypeID2TypeName(*fInputColumnTypeIDs[i]);
849 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap());
850 fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]);
851 }
852 model->Freeze();
853
855 writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
856
857 fOutputFile.reset(TFile::Open(fFileName.c_str(), fOptions.fMode.c_str()));
858 if (!fOutputFile)
859 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
860
861 TDirectory *outputDir = fOutputFile.get();
862 if (!fDirName.empty()) {
863 TString checkupdate = fOptions.fMode;
864 checkupdate.ToLower();
865 if (checkupdate == "update")
866 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
867 else
868 outputDir = fOutputFile->mkdir(fDirName.c_str());
869 }
870
871 // The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for
872 // calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask).
873 fWriter = ROOT::Experimental::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
874}
875
877{
878 if (!fFillContexts[slot]) {
879 fFillContexts[slot] = fWriter->CreateFillContext();
880 fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry();
881 }
882}
883
884void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector<void *> &values)
885{
886 auto &fillContext = fFillContexts[slot];
887 auto &outputEntry = fEntries[slot];
888 assert(values.size() == fFieldTokens.size());
889 for (decltype(values.size()) i = 0; i < values.size(); i++) {
890 outputEntry->BindRawPtr(fFieldTokens[i], values[i]);
891 }
892 fillContext->Fill(*outputEntry);
893}
894
896{
897 // In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression.
898 // NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush.
899 fFillContexts[slot]->FlushCluster();
900}
901
903{
904 // First clear and destroy all entries, which were created from the RNTupleFillContexts.
905 fEntries.clear();
906 fFillContexts.clear();
907 // Then destroy the RNTupleParallelWriter and write the metadata.
908 fWriter.reset();
909 // We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
910 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::RDF::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
911}
912
913/**
914 * Create a new UntypedSnapshotRNTupleHelper with a different output file name.
915 *
916 * \param[in] newName A type-erased string with the output file name
917 * \return UntypedSnapshotRNTupleHelper
918 *
919 * This MakeNew implementation is tied to the cloning feature of actions
920 * of the computation graph. In particular, cloning a Snapshot node usually
921 * also involves changing the name of the output file, otherwise the cloned
922 * Snapshot would overwrite the same file.
923 */
926{
927 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
929 fNSlots, finalName, fDirName, fNTupleName, fInputFieldNames,
930 fOutputFieldNames, fOptions, fInputLoopManager, fOutputLoopManager, fInputColumnTypeIDs};
931}
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char filename
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
char name[80]
Definition TGX11.cxx:110
static TBranch * SearchForBranch(TTree *tree, const char *name)
Definition TTreePyz.cxx:50
The head node of a RDF computation graph.
static std::unique_ptr< RNTupleParallelWriter > Append(std::unique_ptr< ROOT::RNTupleModel > model, std::string_view ntupleName, TDirectory &fileOrDirectory, const ROOT::RNTupleWriteOptions &options=ROOT::RNTupleWriteOptions())
Append an ntuple to the existing file, which must not be accessed while data is filled into any creat...
void Insert(const std::string &name, TBranch *address, bool isCArray=false)
std::vector< TBranch * > fBranches
std::vector< std::string > fNames
bool IsCArray(const std::string &name) const
TBranch * Get(const std::string &name) const
UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector< const std::type_info * > &colTypeIDs)
void Exec(unsigned int slot, const std::vector< void * > &values)
UntypedSnapshotRNTupleHelper MakeNew(void *newName)
Create a new UntypedSnapshotRNTupleHelper with a different output file name.
void InitTask(TTreeReader *, unsigned int slot)
UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(unsigned int slot, const std::vector< void * > &values)
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelperMT with a different output file name.
void InitTask(TTreeReader *r, unsigned int slot)
void Exec(unsigned int slot, const std::vector< void * > &values)
void SetBranches(unsigned int slot, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelper with a different output file name.
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
void SetBranches(const std::vector< void * > &values)
void Exec(unsigned int, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(const std::vector< void * > &values)
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
Common user-tunable settings for storing RNTuples.
Representation of an RNTuple data set in a ROOT file.
Definition RNTuple.hxx:67
const_iterator begin() const
const_iterator end() const
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1526
A Branch for the case of an object.
A TTree is a list of TBranches.
Definition TBranch.h:93
static TClass * Class()
TClassRef is used to implement a permanent reference to a TClass object.
Definition TClassRef.h:29
TClass instances represent classes, structs and namespaces in the ROOT type system.
Definition TClass.h:84
Basic data type descriptor (datatype information is obtained from CINT).
Definition TDataType.h:44
Int_t GetType() const
Definition TDataType.h:68
static TDictionary * GetDictionary(const char *name)
Retrieve the type (class, fundamental type, typedef etc) named "name".
TDirectory::TContext keeps track of and restores the current directory.
Definition TDirectory.h:89
Describe directory structure in memory.
Definition TDirectory.h:45
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3764
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
Mother of all ROOT objects.
Definition TObject.h:41
Basic string class.
Definition TString.h:138
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
A TTree represents a columnar dataset.
Definition TTree.h:89
@ kEntriesReshuffled
If set, signals that this TTree is the output of the processing of another TTree, and the entries are...
Definition TTree.h:297
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.' with '_' in each string passed as a parameter.
Definition RDFUtils.cxx:397
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g. "Float_t") to ROOT type code (e.g. 'F').
Definition RDFUtils.cxx:342
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info. An empty string is returned in case of failure...
Definition RDFUtils.cxx:178
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:627
char TypeID2ROOTTypeName(const std::type_info &tid)
Definition RDFUtils.cxx:206
TBranch * CallBranchImp(TTree &tree, const char *branchname, TClass *ptrClass, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10085
TBranch * CallBranchImpRef(TTree &tree, const char *branchname, TClass *ptrClass, EDataType datatype, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10079
std::vector< std::string > ColumnNames_t
@ kROOTRVec
Definition ESTLType.h:46
@ kSTLvector
Definition ESTLType.h:30
int CompressionSettings(RCompressionSetting::EAlgorithm::EValues algorithm, int compressionLevel)
ROOT::ESTLType STLKind(std::string_view type)
Converts STL container name to number.
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:344
A collection of options to steer the creation of the dataset on file.