Logo ROOT  
Reference Guide
Loading...
Searching...
No Matches
RDFSnapshotHelpers.cxx
Go to the documentation of this file.
1/**
2 \file RDFSnapshotHelpers.cxx
3 \author Enrico Guiraud, CERN
4 \author Danilo Piparo, CERN
5 \date 2016-12
6 \author Vincenzo Eduardo Padulano
7 \author Stephan Hageboeck
8 \date 2025-06
9*/
10
11/*************************************************************************
12 * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers. *
13 * All rights reserved. *
14 * *
15 * For the licensing terms see $ROOTSYS/LICENSE. *
16 * For the list of contributors see $ROOTSYS/README/CREDITS. *
17 *************************************************************************/
18
20
21#include <ROOT/REntry.hxx>
22#include <ROOT/RFieldToken.hxx>
23#include <ROOT/RNTuple.hxx>
24#include <ROOT/RNTupleDS.hxx>
27#include <ROOT/RTTreeDS.hxx>
29
30#include <TBranchObject.h>
31#include <TClassEdit.h>
32#include <TDictionary.h>
33#include <TDataType.h>
34#include <TFile.h>
35#include <TLeaf.h>
36#include <TTreeReader.h>
37
38#include <algorithm>
39#include <type_traits>
40#include <utility>
41
43// Keep RBranchData nothrow-movable: a std::vector<RBranchData> that has to reallocate moves its elements (instead
// of copying them) only when the element's move operations are noexcept, which makes growing the vector cheaper.
44static_assert(std::is_nothrow_move_assignable_v<RBranchData>);
45static_assert(std::is_nothrow_move_constructible_v<RBranchData>);
46
47namespace {
48
49void AssertNoNullBranchAddresses(const std::vector<RBranchData> &branches)
50{
51 std::vector<TBranch *> branchesWithNullAddress;
52 for (const auto &branchData : branches) {
53 if (branchData.fOutputBranch->GetAddress() == nullptr)
54 branchesWithNullAddress.push_back(branchData.fOutputBranch);
55 }
56
57 if (branchesWithNullAddress.empty())
58 return;
59
60 // otherwise build error message and throw
61 std::vector<std::string> missingBranchNames;
62 std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(),
63 std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); });
64 std::string msg = "RDataFrame::Snapshot:";
65 if (missingBranchNames.size() == 1) {
66 msg += " branch " + missingBranchNames[0] +
67 " is needed as it provides the size for one or more branches containing dynamically sized arrays, but "
68 "it is";
69 } else {
70 msg += " branches ";
71 for (const auto &bName : missingBranchNames)
72 msg += bName + ", ";
73 msg.resize(msg.size() - 2); // remove last ", "
74 msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are";
75 }
76 msg += " not part of the set of branches that are being written out.";
77 throw std::runtime_error(msg);
78}
79
80TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName)
81{
82 if (inputTree) {
83 if (auto *getBranchRes = inputTree->GetBranch(branchName.c_str()))
84 return getBranchRes;
85
86 // try harder
87 if (auto *findBranchRes = inputTree->FindBranch(branchName.c_str()))
88 return findBranchRes;
89 }
90 return nullptr;
91}
92
93std::vector<RBranchData>::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector<RBranchData> &outputBranches,
94 std::vector<RBranchData>::iterator thisBranch,
95 TBranch *inputBranch, int basketSize, void *address)
96{
97 if (!inputBranch)
98 return thisBranch;
99 const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName());
100 if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec)
101 return thisBranch;
102 // must construct the leaflist for the output branch and create the branch in the output tree
103 const auto *leaf = static_cast<TLeaf *>(inputBranch->GetListOfLeaves()->UncheckedAt(0));
104 if (!leaf)
105 return thisBranch;
106 const auto bname = leaf->GetName();
107 auto *sizeLeaf = leaf->GetLeafCount();
108 const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic());
109
110 // We proceed only if branch is a fixed-or-variable-sized array
111 if (sizeLeaf || leaf->GetLenStatic() > 1) {
112 if (sizeLeaf) {
113 // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's
114 // in the output tree.
115 auto sizeLeafIt =
116 std::find_if(outputBranches.begin(), outputBranches.end(),
117 [&sizeLeafName](const RBranchData &bd) { return bd.fOutputBranchName == sizeLeafName; });
118 if (sizeLeafIt == outputBranches.end()) {
119 // The size leaf is not part of the output branches yet, so emplace an empty slot for it.
120 // This means that iterators need to be updated in case the container reallocates.
121 const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch);
122 outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr);
123 thisBranch = outputBranches.begin() + indexBeforeEmplace;
124 sizeLeafIt = outputBranches.end() - 1;
125 }
126 if (!sizeLeafIt->fOutputBranch) {
127 // The size leaf was emplaced, but not initialised yet
128 const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName());
129 // Use original basket size for existing branches otherwise use custom basket size.
130 const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize();
131 // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for
132 // `sizeLeafName`
133 auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast<void *>(nullptr),
134 (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize);
135 sizeLeafIt->fOutputBranch = outputBranch;
136 }
137 }
138
139 const auto btype = leaf->GetTypeName();
140 const auto rootbtype = ROOT::Internal::RDF::TypeName2ROOTTypeName(btype);
141 if (rootbtype == ' ') {
142 Warning("Snapshot",
143 "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The "
144 "leaf is of type '%s'. This column will not be written out.",
145 bname, btype);
146 return thisBranch;
147 }
148
149 const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype;
150 // Use original basket size for existing branches and new basket size for new branches
151 const auto bufSize = (basketSize > 0) ? basketSize : inputBranch->GetBasketSize();
152 void *addressForBranch = [address]() -> void * {
153 if (address) {
154 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we need
155 // its buffer, so we cast it and extract the address of the buffer
156 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(address);
157 return rawRVec->data();
158 }
159 return nullptr;
160 }();
161 thisBranch->fOutputBranch =
162 outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize);
163 thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle());
164 thisBranch->fIsCArray = true;
165 }
166
167 return thisBranch;
168}
169
170void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress)
171{
172 const static TClassRef TBOClRef("TBranchObject");
173 if (inputBranch && inputBranch->IsA() == TBOClRef) {
174 branchData.fOutputBranch->SetAddress(reinterpret_cast<void **>(inputBranch->GetAddress()));
175 } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) {
176 // This is a relatively rare case of a fixed-size array getting redefined
177 branchData.fBranchAddressForCArrays = valueAddress;
178 branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays);
179 } else {
180 void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * {
181 if (isCArray) {
182 // Address here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
183 // need its buffer, so we cast it and extract the address of the buffer
184 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
185 return rawRVec->data();
186 }
187 return valueAddress;
188 }();
189 branchData.fOutputBranch->SetAddress(correctAddress);
190 branchData.fBranchAddressForCArrays = valueAddress;
191 }
192}
193
/// Create the output branch for a column of fundamental type using a leaflist of the form "<branch name>/<type char>".
///
/// If the type character is ' ', a warning is printed and the function returns without creating a branch, i.e.
/// bd.fOutputBranch is left as it was.
///
/// NOTE(review): the statement defining `rootTypeChar` is not visible in this listing (the numbering jumps from 198
/// to 200); presumably it derives the ROOT type character from the type of the column — confirm against the
/// original source.
///
/// \param[in] outputTree Tree in which the branch is created.
/// \param[in,out] bd Bookkeeping entry; provides the branch name and receives the created branch.
/// \param[in] valueAddress Address passed to TTree::Branch as the location of the value.
/// \param[in] bufSize Buffer size passed to TTree::Branch.
194void CreateFundamentalTypeBranch(TTree &outputTree, RBranchData &bd, void *valueAddress, int bufSize)
195{
196 // Logic taken from
197 // TTree::BranchImpRef(
198 // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel)
200 if (rootTypeChar == ' ') {
201 Warning("Snapshot",
202 "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This "
203 "column will not be written out.",
204 bd.fOutputBranchName.c_str());
205 return;
206 }
207 std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar};
208 bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize);
209}
210
211/// Ensure that an object with the input name can be written to the target file. This means checking that the
212/// TFile can be opened in the mode specified in `opts`, deleting any pre-existing objects with the same name in case
213/// `opts.fOverwriteIfExists = true`, or throwing an error otherwise.
214void EnsureValidSnapshotOutput(const ROOT::RDF::RSnapshotOptions &opts, const std::string &objName,
215 const std::string &fileName)
216{
217 TString fileMode = opts.fMode;
218 fileMode.ToLower();
219 if (fileMode != "update")
220 return;
221
222 // output file opened in "update" mode: must check whether target object name is already present in file
223 std::unique_ptr<TFile> outFile{TFile::Open(fileName.c_str(), "update")};
224 if (!outFile || outFile->IsZombie())
225 throw std::invalid_argument("Snapshot: cannot open file \"" + fileName + "\" in update mode");
226
227 // Object is not present in the file, we are good
228 if (!outFile->GetKey(objName.c_str()))
229 return;
230
231 // object called objName is already present in the file
232 if (opts.fOverwriteIfExists) {
233 if (auto existingTree = outFile->Get<TTree>(objName.c_str()); existingTree) {
234 // Special case for TTree. TTree::Delete invalidates the 'this' pointer, so we don't wrap it in a
235 // std::unique_ptr.
236 existingTree->Delete("all");
237 } else {
238 // Ensure deletion of object and all its cycles.
239 outFile->Delete((objName + ";*").c_str());
240 }
241 } else {
242 const std::string msg = "Snapshot: object \"" + objName + "\" already present in file \"" + fileName +
243 "\". If you want to delete the original object and write another, please set the "
244 "'fOverwriteIfExists' option to true in RSnapshotOptions.";
245 throw std::invalid_argument(msg);
246 }
247}
248
/// Create the output branch for entry `currentIndex` of `allBranchData`, or re-bind it if it already exists.
///
/// The steps are tried in this order:
///  1. If the output branch exists and a value address is given, only the branch address is (re)set.
///  2. If the dictionary of the column type is a TDataType, CreateFundamentalTypeBranch() is called.
///  3. If the column does not come from a Define, CreateCStyleArrayBranch() is given the chance to create the
///     branch; if a branch exists afterwards and a value address is given, the address of the RVec buffer is
///     cached in fBranchAddressForCArrays.
///  4. If the dictionary is a TClass, an object branch is created; without a value address a hollow branch is
///     created only to populate the schema.
///  5. Anything else is reported as a logic error.
///
/// \param[in] inputTree Input tree used to look up the branch behind the column; passed on to SearchForBranch().
/// \param[in] outputTree Tree in which branches are created.
/// \param[in,out] allBranchData Bookkeeping of all output branches.
/// \param[in] currentIndex Index into `allBranchData` of the entry to process.
/// \param[in] basketSize Basket size to use if > 0; otherwise the input branch's basket size, or 32000 without
///            an input branch.
/// \param[in] valueAddress Address of the value to write, or nullptr to only create the branch.
/// \throws std::logic_error if none of the cases above applies.
///
/// NOTE(review): the entry is addressed by index, presumably because CreateCStyleArrayBranch() may append to
/// `allBranchData` and thereby invalidate iterators — it returns a refreshed iterator for that reason.
/// NOTE(review): fInputTypeID is dereferenced unconditionally below, while CreateCStyleArrayBranch() emplaces
/// entries with a null typeID — confirm that such entries can never reach that statement.
/// NOTE(review): the listing jumps from line 303 to 305; the beginning of the statement whose argument list starts
/// on 305 (presumably an assignment to fOutputBranch) is not visible here.
249void SetBranchesHelper(TTree *inputTree, TTree &outputTree,
250 std::vector<ROOT::Internal::RDF::RBranchData> &allBranchData, std::size_t currentIndex,
251 int basketSize, void *valueAddress)
252{
253 auto branchData = allBranchData.begin() + currentIndex;
254 auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName);
255
256 if (branchData->fOutputBranch && valueAddress) {
257 // The output branch was already created, we just need to (re)set its address
258 SetBranchAddress(inputBranch, *branchData, valueAddress);
259 return;
260 }
261
262 // Respect the original bufsize and splitlevel arguments
263 // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid
264 // writing garbage when unsplit objects cannot be written as split objects (e.g. in case of a polymorphic
265 // TObject branch, see https://bit.ly/2EjLMId ).
266 // A user-provided basket size value takes precedence.
267 const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000);
268 const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99;
269
270 auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID);
271 if (dynamic_cast<TDataType *>(dictionary)) {
272 // Branch of fundamental type
273 CreateFundamentalTypeBranch(outputTree, *branchData, valueAddress, bufSize);
274 return;
275 }
276
277 if (!branchData->fIsDefine) {
278 // Cases where we need a leaflist (e.g. C-style arrays)
279 // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is
280 // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is
281 // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree
282 // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec).
283 branchData = CreateCStyleArrayBranch(outputTree, allBranchData, branchData, inputBranch, bufSize, valueAddress);
284 }
285 if (branchData->fOutputBranch) {
286 // A branch was created in the previous function call
287 if (valueAddress) {
288 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
289 // need its buffer, so we cast it and extract the address of the buffer
290 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(valueAddress);
291 branchData->fBranchAddressForCArrays = rawRVec->data();
292 }
293 return;
294 }
295
296 if (auto *classPtr = dynamic_cast<TClass *>(dictionary)) {
297 // Case of unsplit object with polymorphic type
298 if (inputBranch && dynamic_cast<TBranchObject *>(inputBranch) && valueAddress)
299 branchData->fOutputBranch =
300 ROOT::Internal::TreeUtils::CallBranchImp(outputTree, branchData->fOutputBranchName.c_str(), classPtr,
301 inputBranch->GetAddress(), bufSize, splitLevel);
302 // General case, with valid address
303 else if (valueAddress)
305 outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID),
306 valueAddress, bufSize, splitLevel);
307 // No value was passed, we're just creating a hollow branch to populate the dataset schema
308 else
309 branchData->fOutputBranch =
310 outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize);
311 return;
312 }
313
314 // We are not aware of other cases
315 throw std::logic_error(
316 "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
317}
318
/// Compute the compression settings to open the output file with, depending on the requested output format.
///
/// When the user left the compression algorithm undefined, the format-specific default is returned: ZLIB level 1
/// for TTree (also used for the default output format) and ZSTD level 5 for RNTuple.
///
/// NOTE(review): three statements are not visible in this listing (the numbering jumps 320 -> 322, 328 -> 330 and
/// 334 -> 336): presumably the alias that introduces `CompAlgo`, and the returns used when the user did choose an
/// algorithm — confirm against the original source.
///
/// \param[in] options Snapshot options; `fOutputFormat` and `fCompressionAlgorithm` are read in the visible code.
/// \throws std::invalid_argument if the output format is neither TTree, default, nor RNTuple.
319auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options)
320{
322 using OutputFormat = ROOT::RDF::ESnapshotOutputFormat;
323
324 if (options.fOutputFormat == OutputFormat::kTTree || options.fOutputFormat == OutputFormat::kDefault) {
325 // The default compression settings for TTree is 101
326 if (options.fCompressionAlgorithm == CompAlgo::kUndefined) {
327 return ROOT::CompressionSettings(CompAlgo::kZLIB, 1);
328 }
330 } else if (options.fOutputFormat == OutputFormat::kRNTuple) {
331 // The default compression settings for RNTuple is 505
332 if (options.fCompressionAlgorithm == CompAlgo::kUndefined) {
333 return ROOT::CompressionSettings(CompAlgo::kZSTD, 5);
334 }
336 } else {
337 throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format");
338 }
339}
340} // namespace
341
342ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine,
343 const std::type_info *typeID)
344 : fInputBranchName{std::move(inputBranchName)},
345 fOutputBranchName{std::move(outputBranchName)},
346 fInputTypeID{typeID},
347 fIsDefine{isDefine}
348{
349 auto *dictionary = TDictionary::GetDictionary(*fInputTypeID);
350 if (auto datatype = dynamic_cast<TDataType *>(dictionary); datatype) {
351 fTypeData = FundamentalType(datatype->Size());
352 } else if (auto tclass = dynamic_cast<TClass *>(dictionary); tclass) {
353 fTypeData = EmptyDynamicType{tclass};
354 }
355}
356
357/// @brief Return a pointer to an empty instance of the type represented by this branch.
358/// For fundamental types, this is simply an 8-byte region of zeroes. For classes, it is an instance created with
359/// TClass::New.
360/// @param pointerToPointer Return a pointer to a pointer, so it can be used directly in TTree::SetBranchAddress().
362{
363 if (auto fundamental = std::get_if<FundamentalType>(&fTypeData); fundamental) {
364 assert(!pointerToPointer); // Not used for fundamental types
365 return fundamental->fBytes.data();
366 }
367
368 auto &dynamic = std::get<EmptyDynamicType>(fTypeData);
369 if (!dynamic.fEmptyInstance) {
370 auto *dictionary = TDictionary::GetDictionary(*fInputTypeID);
371 assert(dynamic_cast<TDataType *>(dictionary) ==
372 nullptr); // TDataType should be handled by writing into the local buffer
373
374 auto tclass = dynamic_cast<TClass *>(dictionary);
375 assert(tclass);
376 dynamic.fEmptyInstance = std::shared_ptr<void>{tclass->New(), tclass->GetDestructor()};
377 }
378
379 if (pointerToPointer) {
380 // Make TTree happy (needs a pointer to pointer):
381 dynamic.fRawPtrToEmptyInstance = dynamic.fEmptyInstance.get();
382 return &dynamic.fRawPtrToEmptyInstance;
383 } else {
384 return dynamic.fEmptyInstance.get();
385 }
386}
387
388/// Point the branch address to an empty instance of the type represented by this branch
389/// or write null bytes into the space used by the fundamental type.
390/// This is used in case of variations, when certain defines/actions don't execute. We
391/// nevertheless need to write something, so we point the branch to an empty instance.
393{
394 if (!fOutputBranch)
395 return;
396
397 if (auto fundamental = std::get_if<FundamentalType>(&fTypeData); fundamental) {
398 fundamental->fBytes.fill(std::byte{0});
399 } else {
400 // TTree expects pointer to pointer, to figure out who allocates the object:
401 fOutputBranch->SetAddress(EmptyInstance(/*pointerToPointer=*/true));
402 }
403}
404
406 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
407 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
409 const std::vector<const std::type_info *> &colTypeIDs)
410 : fFileName(filename),
411 fDirName(dirname),
412 fTreeName(treename),
413 fOptions(options),
414 fOutputLoopManager(loopManager),
415 fInputLoopManager(inputLM)
416{
417 EnsureValidSnapshotOutput(fOptions, fTreeName, fFileName);
418
419 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
420 fBranchData.reserve(vbnames.size());
421 for (unsigned int i = 0; i < vbnames.size(); ++i) {
422 fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]);
423 }
424}
425
426// Define special member methods here where the definition of all the data member types is available
430 ROOT::Internal::RDF::UntypedSnapshotTTreeHelper &&) noexcept = default;
431
433{
434 if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) {
435 const auto fileOpenMode = [&]() {
436 TString checkupdate = fOptions.fMode;
437 checkupdate.ToLower();
438 return checkupdate == "update" ? "updated" : "created";
439 }();
440 Warning("Snapshot",
441 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
442 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
443 "its result in a variable and for example calling the GetValue() method on it.",
444 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
445 }
446}
447
449{
450 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
451 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
452 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
453 fInputTree = treeDS->GetTree();
455}
456
/// Process one entry: make sure the output branches point at the current values, then fill the output tree.
///
/// Depending on a condition, either only the addresses of C-style array branches are refreshed
/// (UpdateCArraysPtrs) or all branches are created/re-bound (SetBranches).
///
/// NOTE(review): the `if` condition and one statement of the else-branch are not visible in this listing (the
/// numbering jumps 458 -> 460 and 462 -> 464); presumably a "branch addresses need reset" flag is tested and then
/// cleared — confirm against the original source.
///
/// \param[in] values Addresses of the column values for this entry, one per output column. The first (unnamed)
///            parameter is not used.
457void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Exec(unsigned int, const std::vector<void *> &values)
458{
460 UpdateCArraysPtrs(values);
461 } else {
462 SetBranches(values);
464 }
465
466 fOutputTree->Fill();
467}
468
470{
471 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
472 // associated to those is re-allocated. As a result the value of the pointer can change therewith
473 // leaving associated to the branch of the output tree an invalid pointer.
474 // With this code, we set the value of the pointer in the output branch anew when needed.
475 assert(values.size() == fBranchData.size());
476 auto nValues = values.size();
477 for (decltype(nValues) i{}; i < nValues; i++) {
478 if (fBranchData[i].fIsCArray) {
479 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
480 // need its buffer, so we cast it and extract the address of the buffer
481 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
482 if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) {
483 fBranchData[i].fOutputBranch->SetAddress(data);
484 fBranchData[i].fBranchAddressForCArrays = data;
485 }
486 }
487 }
488}
489
491{
492 // create branches in output tree
493 assert(fBranchData.size() == values.size());
494 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
495 SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]);
496 }
497 AssertNoNullBranchAddresses(fBranchData);
498}
499
501{
502 void *dummyValueAddress{};
503 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
504 SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress);
505 }
506}
507
509{
510 fOutputFile.reset(
511 TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
512 if (!fOutputFile)
513 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
514
515 TDirectory *outputDir = fOutputFile.get();
516 if (!fDirName.empty()) {
517 TString checkupdate = fOptions.fMode;
518 checkupdate.ToLower();
519 if (checkupdate == "update")
520 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
521 else
522 outputDir = fOutputFile->mkdir(fDirName.c_str());
523 }
524
525 fOutputTree = std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/outputDir);
526
527 if (fOptions.fAutoFlush)
528 fOutputTree->SetAutoFlush(fOptions.fAutoFlush);
529}
530
532{
533 assert(fOutputTree != nullptr);
534 assert(fOutputFile != nullptr);
535
536 // There were no entries to fill the TTree with (either the input TTree was empty or no event passed after
537 // filtering). We have already created an empty TTree, now also create the branches to preserve the schema
538 if (fOutputTree->GetEntries() == 0) {
540 }
541 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
542 fOutputTree->AutoSave("flushbaskets");
543 // must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
544 fOutputTree.reset();
545 fOutputFile->Close();
546
547 // Now connect the data source to the loop manager so it can be used for further processing
548 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
549 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
550}
551
552/**
553 * \brief Create a new UntypedSnapshotTTreeHelper with a different output file name
554 *
555 * \param newName A type-erased string with the output file name
556 * \return UntypedSnapshotTTreeHelper
557 *
558 * This MakeNew implementation is tied to the cloning feature of actions
559 * of the computation graph. In particular, cloning a Snapshot node usually
560 * also involves changing the name of the output file, otherwise the cloned
561 * Snapshot would overwrite the same file.
562 */
565{
566 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
567 std::vector<std::string> inputBranchNames;
568 std::vector<std::string> outputBranchNames;
569 std::vector<bool> isDefine;
570 std::vector<const std::type_info *> inputColumnTypeIDs;
571 for (const auto &bd : fBranchData) {
572 if (bd.fInputBranchName.empty())
573 break;
574 inputBranchNames.push_back(bd.fInputBranchName);
575 outputBranchNames.push_back(bd.fOutputBranchName);
576 isDefine.push_back(bd.fIsDefine);
577 inputColumnTypeIDs.push_back(bd.fInputTypeID);
578 }
579
581 fDirName,
582 fTreeName,
583 std::move(inputBranchNames),
584 std::move(outputBranchNames),
585 fOptions,
586 std::move(isDefine),
589 inputColumnTypeIDs};
590}
591
593 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename,
594 const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
595 std::vector<bool> &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM,
596 const std::vector<const std::type_info *> &colTypeIDs)
597 : fNSlots(nSlots),
602 fFileName(filename),
603 fDirName(dirname),
604 fTreeName(treename),
605 fOptions(options),
606 fOutputLoopManager(loopManager),
607 fInputLoopManager(inputLM)
608{
609 EnsureValidSnapshotOutput(fOptions, fTreeName, fFileName);
610
611 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
612 fBranchData.reserve(fNSlots);
613 for (unsigned int slot = 0; slot < fNSlots; ++slot) {
614 fBranchData.emplace_back();
615 auto &thisSlot = fBranchData.back();
616 thisSlot.reserve(vbnames.size());
617 for (unsigned int i = 0; i < vbnames.size(); ++i) {
618 thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]);
619 }
620 }
621}
622
623// Define special member methods here where the definition of all the data member types is available
627 ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT &&) noexcept = default;
628
630{
631 if (!fTreeName.empty() /*not moved from*/ && fOptions.fLazy && !fOutputFiles.empty() &&
632 std::all_of(fOutputFiles.begin(), fOutputFiles.end(), [](const auto &f) { return !f; }) /* never run */) {
633 const auto fileOpenMode = [&]() {
634 TString checkupdate = fOptions.fMode;
635 checkupdate.ToLower();
636 return checkupdate == "update" ? "updated" : "created";
637 }();
638 Warning("Snapshot",
639 "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. "
640 "In case it was desired instead, remember to trigger the Snapshot operation, by storing "
641 "its result in a variable and for example calling the GetValue() method on it.",
642 fTreeName.c_str(), fFileName.c_str(), fileOpenMode);
643 }
644}
645
647{
648 ::TDirectory::TContext c; // do not let tasks change the thread-local gDirectory
649 if (!fOutputFiles[slot]) {
650 // first time this thread executes something, let's create a TBufferMerger output directory
651 fOutputFiles[slot] = fMerger->GetFile();
652 }
653 TDirectory *treeDirectory = fOutputFiles[slot].get();
654 if (!fDirName.empty()) {
655 // call returnExistingDirectory=true since MT can end up making this call multiple times
656 treeDirectory = fOutputFiles[slot]->mkdir(fDirName.c_str(), "", true);
657 }
658 // re-create output tree as we need to create its branches again, with new input variables
659 // TODO we could instead create the output tree and its branches, change addresses of input variables in each task
660 fOutputTrees[slot] =
661 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
663 // TODO can be removed when RDF supports interleaved TBB task execution properly, see ROOT-10269
664 fOutputTrees[slot]->SetImplicitMT(false);
665 if (fOptions.fAutoFlush)
666 fOutputTrees[slot]->SetAutoFlush(fOptions.fAutoFlush);
667 if (r) {
668 // We could be getting a task-local TTreeReader from the TTreeProcessorMT.
669 fInputTrees[slot] = r->GetTree();
670 } else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource())) {
671 fInputTrees[slot] = treeDS->GetTree();
672 }
673 fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
674}
675
677{
678 if (fOutputTrees[slot]->GetEntries() > 0)
679 fOutputFiles[slot]->Write();
680 for (auto &branchData : fBranchData[slot])
681 branchData.ClearBranchPointers(); // The branch pointers will go stale below
682 // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones)
683 fOutputTrees[slot].reset(nullptr);
684}
685
/// Process one entry in the given slot: make sure the slot's output branches point at the current values, fill
/// the slot's output tree and, when an auto-flush interval is set, write the slot's output file every time the
/// number of entries is a multiple of it.
///
/// If the slot's "branch addresses need reset" flag is 0, only the addresses of C-style array branches are
/// refreshed (UpdateCArraysPtrs); otherwise all branches are created/re-bound (SetBranches).
///
/// NOTE(review): one statement of the else-branch is not visible in this listing (the numbering jumps from 691 to
/// 693); presumably it clears the reset flag of the slot — confirm against the original source.
///
/// \param[in] slot Index of the processing slot.
/// \param[in] values Addresses of the column values for this entry, one per output column.
686void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector<void *> &values)
687{
688 if (fBranchAddressesNeedReset[slot] == 0) {
689 UpdateCArraysPtrs(slot, values);
690 } else {
691 SetBranches(slot, values);
693 }
694 fOutputTrees[slot]->Fill();
695 auto entries = fOutputTrees[slot]->GetEntries();
696 auto autoFlush = fOutputTrees[slot]->GetAutoFlush();
697 if ((autoFlush > 0) && (entries % autoFlush == 0))
698 fOutputFiles[slot]->Write();
699}
700
702 const std::vector<void *> &values)
703{
704 // This code deals with branches which hold C arrays of variable size. It can happen that the buffers
705 // associated to those is re-allocated. As a result the value of the pointer can change therewith
706 // leaving associated to the branch of the output tree an invalid pointer.
707 // With this code, we set the value of the pointer in the output branch anew when needed.
708 assert(values.size() == fBranchData[slot].size());
709 auto nValues = values.size();
710 for (decltype(nValues) i{}; i < nValues; i++) {
711 auto &branchData = fBranchData[slot][i];
712 if (branchData.fIsCArray) {
713 // valueAddress here points to a ROOT::RVec<std::byte> coming from RTreeUntypedArrayColumnReader. We know we
714 // need its buffer, so we cast it and extract the address of the buffer
715 auto *rawRVec = reinterpret_cast<ROOT::RVec<std::byte> *>(values[i]);
716 if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) {
717 // reset the branch address
718 branchData.fOutputBranch->SetAddress(data);
719 branchData.fBranchAddressForCArrays = data;
720 }
721 }
722 }
723}
724
726 const std::vector<void *> &values)
727{
728 // create branches in output tree
729 auto &branchData = fBranchData[slot];
730 assert(branchData.size() == values.size());
731 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
732 SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]);
733 }
734
735 AssertNoNullBranchAddresses(branchData);
736}
737
739{
740 void *dummyValueAddress{};
741 auto &branchData = fBranchData.front();
742 for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions
743 SetBranchesHelper(inputTree, outputTree, branchData, i, fOptions.fBasketSize, dummyValueAddress);
744 }
745}
746
748{
749 auto outFile =
750 std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(),
751 GetSnapshotCompressionSettings(fOptions))};
752 if (!outFile)
753 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
754 fOutputFile = outFile.get();
755 fMerger = std::make_unique<ROOT::TBufferMerger>(std::move(outFile));
756}
757
759{
760
761 for (auto &file : fOutputFiles) {
762 if (file) {
763 file->Write();
764 file->Close();
765 }
766 }
767
768 // If there were no entries to fill the TTree with (either the input TTree was empty or no event passed after
769 // filtering), create an empty TTree in the output file and create the branches to preserve the schema
770 auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
771 assert(fOutputFile && "Missing output file in Snapshot finalization.");
772 // Use GetKey to avoid having to deal with memory management of the object in the file
773 if (!fOutputFile->GetKey(fullTreeName.c_str())) {
774
775 // First find in which directory we need to write the output TTree
776 TDirectory *treeDirectory = fOutputFile;
777 if (!fDirName.empty()) {
778 treeDirectory = fOutputFile->mkdir(fDirName.c_str(), "", true);
779 }
780 ::TDirectory::TContext c{treeDirectory};
781
782 // Create the output TTree and create the user-requested branches
783 auto outTree =
784 std::make_unique<TTree>(fTreeName.c_str(), fTreeName.c_str(), fOptions.fSplitLevel, /*dir=*/treeDirectory);
785 TTree *inputTree{};
786 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
787 inputTree = treeDS->GetTree();
788 SetEmptyBranches(inputTree, *outTree);
789
790 fOutputFile->Write();
791 }
792
793 // flush all buffers to disk by destroying the TBufferMerger
794 fOutputFiles.clear();
795 fMerger.reset();
796
797 // Now connect the data source to the loop manager so it can be used for further processing
798 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
799}
800
801/**
802 * \brief Create a new UntypedSnapshotTTreeHelperMT with a different output file name
803 *
804 * \param newName A type-erased string with the output file name
805 * \return UntypedSnapshotTTreeHelperMT
806 *
807 * This MakeNew implementation is tied to the cloning feature of actions
808 * of the computation graph. In particular, cloning a Snapshot node usually
809 * also involves changing the name of the output file, otherwise the cloned
810 * Snapshot would overwrite the same file.
811 */
814{
815 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
816 std::vector<std::string> inputBranchNames;
817 std::vector<std::string> outputBranchNames;
818 std::vector<bool> isDefine;
819 std::vector<const std::type_info *> inputColumnTypeIDs;
820 for (const auto &bd : fBranchData.front()) {
821 if (bd.fInputBranchName.empty())
822 break;
823 inputBranchNames.push_back(bd.fInputBranchName);
824 outputBranchNames.push_back(bd.fOutputBranchName);
825 isDefine.push_back(bd.fIsDefine);
826 inputColumnTypeIDs.push_back(bd.fInputTypeID);
827 }
828
830 finalName,
831 fDirName,
832 fTreeName,
833 std::move(inputBranchNames),
834 std::move(outputBranchNames),
835 fOptions,
836 std::move(isDefine),
839 std::move(inputColumnTypeIDs)};
840}
841
843 unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename,
844 const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options,
846 const std::vector<const std::type_info *> &colTypeIDs)
847 : fFileName(filename),
848 fDirName(dirname),
849 fNTupleName(ntuplename),
850 fOptions(options),
851 fInputLoopManager(inputLM),
852 fOutputLoopManager(outputLM),
853 fInputFieldNames(vfnames),
855 fNSlots(nSlots),
856 fFillContexts(nSlots),
857 fEntries(nSlots),
858 fInputColumnTypeIDs(colTypeIDs)
859{
860 EnsureValidSnapshotOutput(fOptions, fNTupleName, fFileName);
861}
862
863// Define special member methods here where the definition of all the data member types is available
867 ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper &&) noexcept = default;
868
870{
871 if (!fNTupleName.empty() /* not moved from */ && !fOutputFile /* did not run */ && fOptions.fLazy)
872 Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
873}
874
876{
877 auto model = ROOT::RNTupleModel::CreateBare();
878 auto nFields = fOutputFieldNames.size();
879 fFieldTokens.resize(nFields);
880 for (decltype(nFields) i = 0; i < nFields; i++) {
881 // Need to retrieve the type of every field to create as a string
882 // If the input type for a field does not have RTTI, internally we store it as the tag UseNativeDataType. When
883 // that is detected, we need to ask the data source which is the type name based on the on-disk information.
884 const auto typeName = *fInputColumnTypeIDs[i] == typeid(ROOT::Internal::RDF::UseNativeDataType)
886 fInputFieldNames[i], fOptions.fVector2RVec)
888
889 // Cardinality fields are read-only, so instead we snapshot them as their inner type.
890 if (typeName.substr(0, 25) == "ROOT::RNTupleCardinality<") {
891 // Get "T" from "ROOT::RNTupleCardinality<T>".
892 std::string cardinalityType = typeName.substr(25, typeName.size() - 26);
893 Warning("Snapshot",
894 "Column \"%s\" is a read-only \"%s\" column. It will be snapshot as its inner type \"%s\" instead.",
895 fInputFieldNames[i].c_str(), typeName.c_str(), cardinalityType.c_str());
896 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], cardinalityType).Unwrap());
897 } else {
898 model->AddField(ROOT::RFieldBase::Create(fOutputFieldNames[i], typeName).Unwrap());
899 }
900 fFieldTokens[i] = model->GetToken(fOutputFieldNames[i]);
901 }
902 model->Freeze();
903
904 ROOT::RNTupleWriteOptions writeOptions;
905 writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions));
906 writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize);
907 writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize);
908 writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize);
909 writeOptions.SetMaxUnzippedClusterSize(fOptions.fMaxUnzippedClusterSize);
910 writeOptions.SetEnablePageChecksums(fOptions.fEnablePageChecksums);
911 writeOptions.SetEnableSamePageMerging(fOptions.fEnableSamePageMerging);
912
913 fOutputFile.reset(TFile::Open(fFileName.c_str(), fOptions.fMode.c_str()));
914 if (!fOutputFile)
915 throw std::runtime_error("Snapshot: could not create output file " + fFileName);
916
917 TDirectory *outputDir = fOutputFile.get();
918 if (!fDirName.empty()) {
919 TString checkupdate = fOptions.fMode;
920 checkupdate.ToLower();
921 if (checkupdate == "update")
922 outputDir = fOutputFile->mkdir(fDirName.c_str(), "", true); // do not overwrite existing directory
923 else
924 outputDir = fOutputFile->mkdir(fDirName.c_str());
925 }
926
927 // The RNTupleParallelWriter has exclusive access to the underlying TFile, no further synchronization is needed for
928 // calls to Fill() (in Exec) and FlushCluster() (in FinalizeTask).
929 fWriter = ROOT::RNTupleParallelWriter::Append(std::move(model), fNTupleName, *outputDir, writeOptions);
930}
931
933{
934 if (!fFillContexts[slot]) {
935 fFillContexts[slot] = fWriter->CreateFillContext();
936 fEntries[slot] = fFillContexts[slot]->GetModel().CreateBareEntry();
937 }
938}
939
940void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Exec(unsigned int slot, const std::vector<void *> &values)
941{
942 auto &fillContext = fFillContexts[slot];
943 auto &outputEntry = fEntries[slot];
944 assert(values.size() == fFieldTokens.size());
945 for (decltype(values.size()) i = 0; i < values.size(); i++) {
946 outputEntry->BindRawPtr(fFieldTokens[i], values[i]);
947 }
948 fillContext->Fill(*outputEntry);
949}
950
952{
953 // In principle we would not need to flush a cluster here, but we want to benefit from parallelism for compression.
954 // NB: RNTupleFillContext::FlushCluster() is a nop if there is no new entry since the last flush.
955 fFillContexts[slot]->FlushCluster();
956}
957
959{
960 // First clear and destroy all entries, which were created from the RNTupleFillContexts.
961 fEntries.clear();
962 fFillContexts.clear();
963 // Then destroy the RNTupleParallelWriter and write the metadata.
964 fWriter.reset();
965 // We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
966 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::RDF::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
967}
968
969/**
970 * Create a new UntypedSnapshotRNTupleHelper with a different output file name.
971 *
972 * \param[in] newName A type-erased string with the output file name
973 * \return UntypedSnapshotRNTupleHelper
974 *
975 * This MakeNew implementation is tied to the cloning feature of actions
976 * of the computation graph. In particular, cloning a Snapshot node usually
977 * also involves changing the name of the output file, otherwise the cloned
978 * Snapshot would overwrite the same file.
979 */
982{
983 const std::string finalName = *reinterpret_cast<const std::string *>(newName);
987}
988
989/*
990 * ------------------------------------
991 * Snapshot with systematic variations
992 * ------------------------------------
993 */
994namespace ROOT::Internal::RDF {
995/// An object to store an output file and a tree in one common place to share them between instances
996/// of Snapshot with systematic uncertainties.
998 std::unique_ptr<TFile> fFile;
999 std::unique_ptr<TTree> fTree;
1000 std::string fDirectoryName;
1002
// One 64-bit mask recording which systematic variations were computed for the current event.
// The branch buffer is heap-allocated because its address is handed to a TBranch: it has to stay
// valid even when the vector holding the Bitmask objects reallocates.
struct Bitmask {
   std::string branchName;                                               // name of the TBranch storing this mask
   std::bitset<64> bitset{};                                             // working copy of the bits for the current event
   std::unique_ptr<uint64_t> branchBuffer = std::make_unique<uint64_t>(); // zero-initialised value the branch reads from
};
1010 std::vector<Bitmask> fBitMasks;
1011
1012 std::unordered_map<std::string, unsigned int> fBranchToVariationMapping;
1013 // The corresponding ROOT dictionary is declared in core/clingutils/src
1014 std::unordered_map<std::string, std::pair<std::string, unsigned int>> fBranchToBitmaskMapping;
1015 unsigned int fNBits = 0;
1016
1017 SnapshotOutputWriter(TFile *file) : fFile{file} { assert(fFile); }
1019 {
1020 if (!fBranchToBitmaskMapping.empty()) {
1021 fFile->WriteObject(&fBranchToBitmaskMapping,
1022 (std::string{"R_rdf_column_to_bitmask_mapping_"} + fTree->GetName()).c_str());
1023 }
1024 if (fTree) {
1025 // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory
1026 fTree->AutoSave("flushbaskets");
1027
1028 // Now connect the data source to the loop manager so it can be used for further processing
1029 std::string tree = fTree->GetName();
1030 if (!fDirectoryName.empty())
1031 tree = fDirectoryName + '/' + tree;
1032 std::string file = fFile->GetName();
1033
1034 fTree.reset();
1035 fFile.reset();
1036
1038 fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(tree, file));
1039 }
1040 }
1041 SnapshotOutputWriter(SnapshotOutputWriter const &) = delete; // Anyway deleted because of the unique_ptrs
1044 delete; // Can be done, but need to make move-from object safe to destruct
1045 SnapshotOutputWriter &operator=(SnapshotOutputWriter &&) noexcept = delete;
1046
1047 /// Register a branch and corresponding systematic uncertainty.
1048 /// This will create an entry in the mapping from branch names to bitmasks, so the corresponding
1049 /// column can be masked if it doesn't contain valid entries. This mapping is written next to the
1050 /// tree into the output file.
1051 void RegisterBranch(std::string const &branchName, unsigned int variationIndex)
1052 {
1053 if (auto it = fBranchToVariationMapping.find(branchName); it != fBranchToVariationMapping.end()) {
1054 if (variationIndex != it->second) {
1055 throw std::logic_error("Branch " + branchName +
1056 " is being registered with different variation index than the expected one: " +
1057 std::to_string(variationIndex));
1058 }
1059 return;
1060 }
1061
1062 // Neither branch nor systematic are known, so a new entry needs to be created
1063 fNBits = std::max(fNBits, variationIndex);
1064 const auto vectorIndex = variationIndex / 64u;
1065 const auto bitIndex = variationIndex % 64u;
1066
1067 // Create bitmask branches as long as necessary to capture the bit
1068 while (vectorIndex >= fBitMasks.size()) {
1069 std::string bitmaskBranchName =
1070 std::string{"R_rdf_mask_"} + fTree->GetName() + '_' + std::to_string(fBitMasks.size());
1071 fBitMasks.push_back(Bitmask{bitmaskBranchName});
1072 fTree->Branch(bitmaskBranchName.c_str(), fBitMasks.back().branchBuffer.get());
1073 }
1074
1075 fBranchToVariationMapping[branchName] = variationIndex;
1076 fBranchToBitmaskMapping[branchName] = std::make_pair(fBitMasks[vectorIndex].branchName, bitIndex);
1077 }
1078
1079 /// Clear all bits, as if none of the variations passed its filter.
1081 {
1082 for (auto &mask : fBitMasks)
1083 mask.bitset.reset();
1084 }
1085
1086 /// Set a bit signalling that the variation at `index` passed its filter.
1087 void SetMaskBit(unsigned int index)
1088 {
1089 const auto vectorIndex = index / 64;
1090 const auto bitIndex = index % 64;
1091 fBitMasks[vectorIndex].bitset.set(bitIndex, true);
1092 }
1093
1094 /// Test if any of the mask bits are set.
1095 bool MaskEmpty() const
1096 {
1097 return std::none_of(fBitMasks.begin(), fBitMasks.end(), [](Bitmask const &mask) { return mask.bitset.any(); });
1098 }
1099
1100 /// Write the current event and the bitmask to the output dataset.
1101 void Write() const
1102 {
1103 if (!fTree)
1104 throw std::runtime_error("The TTree associated to the Snapshot action doesn't exist, any more.");
1105
1106 for (auto const &mask : fBitMasks) {
1107 *mask.branchBuffer = mask.bitset.to_ullong();
1108 }
1109
1110 fTree->Fill();
1111 }
1112};
1113
1114} // namespace ROOT::Internal::RDF
1115
1117 std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames,
1118 const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector<bool> &&isDefine,
1120 const std::vector<const std::type_info *> &colTypeIDs)
1121 : fOptions(options), fInputLoopManager{inputLoopMgr}, fOutputLoopManager{outputLoopMgr}
1122{
1123 EnsureValidSnapshotOutput(fOptions, std::string(treename), std::string(filename));
1124
1125 TDirectory::TContext fileCtxt;
1126 fOutputHandle = std::make_shared<SnapshotOutputWriter>(
1127 TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
1128 if (!fOutputHandle->fFile)
1129 throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename});
1130
1131 TDirectory *outputDir = fOutputHandle->fFile.get();
1132 if (!dirname.empty()) {
1133 fOutputHandle->fDirectoryName = dirname;
1134 TString checkupdate = fOptions.fMode;
1135 checkupdate.ToLower();
1136 if (checkupdate == "update")
1137 outputDir =
1138 fOutputHandle->fFile->mkdir(std::string{dirname}.c_str(), "", true); // do not overwrite existing directory
1139 else
1140 outputDir = fOutputHandle->fFile->mkdir(std::string{dirname}.c_str());
1141 }
1142
1143 fOutputHandle->fTree = std::make_unique<TTree>(std::string{treename}.c_str(), std::string{treename}.c_str(),
1144 fOptions.fSplitLevel, /*dir=*/outputDir);
1145 fOutputHandle->fOutputLoopManager = fOutputLoopManager;
1146 if (fOptions.fAutoFlush)
1147 fOutputHandle->fTree->SetAutoFlush(fOptions.fAutoFlush);
1148
1149 auto outputBranchNames = ReplaceDotWithUnderscore(bnames);
1150
1151 fBranchData.reserve(vbnames.size());
1152 for (unsigned int i = 0; i < vbnames.size(); ++i) {
1153 fOutputHandle->RegisterBranch(outputBranchNames[i], 0);
1154 fBranchData.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]);
1155 }
1156}
1157
1158/// Register a new column as a variation of the column at `originalColumnIndex`, and clone its properties.
1159/// If a nominal column is registered here, it is written without changes, but it means that it will be masked
1160/// in case its selection cuts don't pass.
1161/// \param slot Task ID for MT runs.
1162/// \param columnIndex Index where the data of this column will be passed into the helper.
1163/// \param originalColumnIndex If the column being registered is a variation of a "nominal" column, this designates the
1164/// original.
1165/// Properties such as name and output type are cloned from the original.
1166/// \param variationName The variation that this column belongs to. If "nominal" is used, this column is considered as
1167/// the original.
1169 unsigned int columnIndex,
1170 unsigned int originalColumnIndex,
1171 unsigned int variationIndex,
1172 std::string const &variationName)
1173{
1174 if (columnIndex == originalColumnIndex) {
1175 // This is a nominal column, but it participates in variations.
1176 // It always needs to be written, but we still need to create a mask bit to mark when nominal is invalid.
1177 assert(variationIndex == 0);
1178 fBranchData[columnIndex].fVariationIndex = 0;
1179 fOutputHandle->RegisterBranch(fBranchData[columnIndex].fOutputBranchName, variationIndex);
1180 } else if (columnIndex >= fBranchData.size()) {
1181 // First task, need to create branches
1182 fBranchData.resize(columnIndex + 1);
1183 auto &bd = fBranchData[columnIndex];
1184 bd = fBranchData[originalColumnIndex];
1185 std::string newOutputName = bd.fOutputBranchName + "__" + variationName;
1186 std::replace(newOutputName.begin(), newOutputName.end(), ':', '_');
1187 bd.fOutputBranchName = std::move(newOutputName);
1188 bd.fVariationIndex = variationIndex;
1189
1190 fOutputHandle->RegisterBranch(bd.fOutputBranchName, variationIndex);
1191 } else {
1192 assert(static_cast<unsigned int>(fBranchData[columnIndex].fVariationIndex) == variationIndex);
1193 }
1194}
1195
1196/// Bind all output branches to RDF columns for the given slots.
1198{
1199 // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing
1200 // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks.
1201 if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputLoopManager->GetDataSource()))
1202 fInputTree = treeDS->GetTree();
1203
1204 // Create all output branches; and bind them to empty values
1205 for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions
1206 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize,
1207 fBranchData[i].EmptyInstance(/*pointerToPointer=*/false));
1208 }
1209
1210 AssertNoNullBranchAddresses(fBranchData);
1211}
1212
1213/// Connect all output fields to the values pointed to by `values`, fill the output dataset,
1214/// call the Fill of the output tree, and clear the mask bits that show whether a variation was reached.
1215void ROOT::Internal::RDF::SnapshotHelperWithVariations::Exec(unsigned int /*slot*/, const std::vector<void *> &values,
1216 std::vector<bool> const &filterPassed)
1217{
1218 // Rebind branch pointers to RDF values
1219 assert(fBranchData.size() == values.size());
1220 for (std::size_t i = 0; i < values.size(); i++) {
1221 const auto variationIndex = fBranchData[i].fVariationIndex;
1222 if (variationIndex < 0) {
1223 // Branch without variations, it always needs to be written
1224 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]);
1225 } else {
1226 // Nominal will always be written, systematics only if needed
1227 if (variationIndex == 0 || filterPassed[variationIndex]) {
1228 const bool fundamentalType = fBranchData[i].WriteValueIfFundamental(values[i]);
1229 if (!fundamentalType) {
1230 SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]);
1231 }
1232 }
1233
1234 if (filterPassed[variationIndex]) {
1235 fOutputHandle->SetMaskBit(variationIndex);
1236 }
1237 }
1238 }
1239
1240 assert(!fOutputHandle->MaskEmpty()); // Exec should not have been called if nothing passes
1241
1242 fOutputHandle->Write();
1243 fOutputHandle->ClearMaskBits();
1244 for (auto &branchData : fBranchData) {
1245 branchData.ClearBranchContents();
1246 }
1247}
1248
ROOT::R::TRInterface & r
Definition Object.C:4
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
if(name) objname
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
static TBranch * SearchForBranch(TTree *tree, const char *name)
Definition TTreePyz.cxx:61
The head node of a RDF computation graph.
std::shared_ptr< SnapshotOutputWriter > fOutputHandle
SnapshotHelperWithVariations(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&, ROOT::Detail::RDF::RLoopManager *outputLoopMgr, ROOT::Detail::RDF::RLoopManager *inputLoopMgr, const std::vector< const std::type_info * > &colTypeIDs)
void InitTask(TTreeReader *, unsigned int slot)
Bind all output branches to RDF columns for the given slots.
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
void Exec(unsigned int, const std::vector< void * > &values, std::vector< bool > const &filterPassed)
Connect all output fields to the values pointed to by values, fill the output dataset,...
void RegisterVariedColumn(unsigned int slot, unsigned int columnIndex, unsigned int originalColumnIndex, unsigned int varationIndex, std::string const &variationName)
Register a new column as a variation of the column at originalColumnIndex, and clone its properties.
std::vector< std::shared_ptr< ROOT::RNTupleFillContext > > fFillContexts
std::unique_ptr< ROOT::RNTupleParallelWriter > fWriter
UntypedSnapshotRNTupleHelper(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view ntuplename, const ColumnNames_t &vfnames, const ColumnNames_t &fnames, const RSnapshotOptions &options, ROOT::Detail::RDF::RLoopManager *inputLM, ROOT::Detail::RDF::RLoopManager *outputLM, const std::vector< const std::type_info * > &colTypeIDs)
std::vector< std::unique_ptr< ROOT::REntry > > fEntries
std::vector< const std::type_info * > fInputColumnTypeIDs
void Exec(unsigned int slot, const std::vector< void * > &values)
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
UntypedSnapshotRNTupleHelper MakeNew(void *newName)
Create a new UntypedSnapshotRNTupleHelper with a different output file name.
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
void InitTask(TTreeReader *, unsigned int slot)
UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(unsigned int slot, const std::vector< void * > &values)
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
std::vector< std::shared_ptr< ROOT::TBufferMergerFile > > fOutputFiles
std::vector< std::vector< RBranchData > > fBranchData
UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelperMT with a different output file name.
void InitTask(TTreeReader *r, unsigned int slot)
void Exec(unsigned int slot, const std::vector< void * > &values)
std::vector< std::unique_ptr< TTree > > fOutputTrees
std::unique_ptr< ROOT::TBufferMerger > fMerger
void SetBranches(unsigned int slot, const std::vector< void * > &values)
ROOT::Detail::RDF::RLoopManager * fOutputLoopManager
ROOT::Detail::RDF::RLoopManager * fInputLoopManager
UntypedSnapshotTTreeHelper MakeNew(void *newName, std::string_view="nominal")
Create a new UntypedSnapshotTTreeHelper with a different output file name.
void SetEmptyBranches(TTree *inputTree, TTree &outputTree)
void SetBranches(const std::vector< void * > &values)
void Exec(unsigned int, const std::vector< void * > &values)
UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector< bool > &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::Detail::RDF::RLoopManager *inputLM, const std::vector< const std::type_info * > &colTypeIDs)
void UpdateCArraysPtrs(const std::vector< void * > &values)
pointer data() noexcept
Return a pointer to the vector's buffer, even if empty().
Definition RVec.hxx:282
static RResult< std::unique_ptr< RFieldBase > > Create(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId)
Factory method to resurrect a field from the stored on-disk type information.
static std::unique_ptr< RNTupleModel > CreateBare()
Creates a "bare model", i.e. an RNTupleModel with no default entry.
static std::unique_ptr< RNTupleParallelWriter > Append(std::unique_ptr< ROOT::RNTupleModel > model, std::string_view ntupleName, TDirectory &fileOrDirectory, const ROOT::RNTupleWriteOptions &options=ROOT::RNTupleWriteOptions())
Append an RNTuple to the existing file.
Common user-tunable settings for storing RNTuples.
void SetEnablePageChecksums(bool val)
Note that turning off page checksums will also turn off the same page merging optimization (see tunin...
void SetMaxUnzippedClusterSize(std::size_t val)
void SetMaxUnzippedPageSize(std::size_t val)
void SetInitialUnzippedPageSize(std::size_t val)
void SetApproxZippedClusterSize(std::size_t val)
void SetCompression(std::uint32_t val)
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1525
A Branch for the case of an object.
A TTree is a list of TBranches.
Definition TBranch.h:93
virtual const char * GetClassName() const
Return the name of the user class whose content is stored in this branch, if any.
Definition TBranch.cxx:1323
virtual char * GetAddress() const
Definition TBranch.h:221
static TClass * Class()
Int_t GetSplitLevel() const
Definition TBranch.h:259
TClass * IsA() const override
Definition TBranch.h:304
virtual void SetAddress(void *add)
Set address of this branch.
Definition TBranch.cxx:2694
TObjArray * GetListOfLeaves()
Definition TBranch.h:256
TClassRef is used to implement a permanent reference to a TClass object.
Definition TClassRef.h:29
TClass instances represent classes, structs and namespaces in the ROOT type system.
Definition TClass.h:84
Basic data type descriptor (datatype information is obtained from CINT).
Definition TDataType.h:44
Int_t GetType() const
Definition TDataType.h:71
static TDictionary * GetDictionary(const char *name)
Retrieve the type (class, fundamental type, typedef etc) named "name".
TDirectory::TContext keeps track and restore the current directory.
Definition TDirectory.h:89
Describe directory structure in memory.
Definition TDirectory.h:45
A file, usually with extension .root, that stores data and code in the form of serialized objects in ...
Definition TFile.h:130
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3787
A TLeaf describes individual elements of a TBranch See TBranch structure in TTree.
Definition TLeaf.h:57
const char * GetTitle() const override
Returns title of object.
Definition TNamed.h:50
Basic string class.
Definition TString.h:138
void ToLower()
Change string to lower-case.
Definition TString.cxx:1189
A simple, robust and fast interface to read values from ROOT columnar datasets such as TTree,...
Definition TTreeReader.h:46
A TTree represents a columnar dataset.
Definition TTree.h:89
virtual TBranch * FindBranch(const char *name)
Return the branch that correspond to the path 'branchname', which can include the name of the tree or...
Definition TTree.cxx:4890
virtual TBranch * GetBranch(const char *name)
Return pointer to the branch with the given name in this tree or its friends.
Definition TTree.cxx:5430
TBranch * Branch(const char *name, T *obj, Int_t bufsize=32000, Int_t splitlevel=99)
Add a new branch, and infer the data type from the type of obj being passed.
Definition TTree.h:397
virtual TTree * GetTree() const
Definition TTree.h:604
@ kEntriesReshuffled
If set, signals that this TTree is the output of the processing of another TTree, and the entries are...
Definition TTree.h:305
std::vector< std::string > ReplaceDotWithUnderscore(const std::vector< std::string > &columnNames)
Replace occurrences of '.
Definition RDFUtils.cxx:415
char TypeName2ROOTTypeName(const std::string &b)
Convert type name (e.g.
Definition RDFUtils.cxx:360
std::string TypeID2TypeName(const std::type_info &id)
Returns the name of a type starting from its type_info An empty string is returned in case of failure...
Definition RDFUtils.cxx:191
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec)
Definition RDFUtils.cxx:645
char TypeID2ROOTTypeName(const std::type_info &tid)
Definition RDFUtils.cxx:219
TBranch * CallBranchImp(TTree &tree, const char *branchname, TClass *ptrClass, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10133
TBranch * CallBranchImpRef(TTree &tree, const char *branchname, TClass *ptrClass, EDataType datatype, void *addobj, Int_t bufsize=32000, Int_t splitlevel=99)
Definition TTree.cxx:10127
std::vector< std::string > ColumnNames_t
@ kROOTRVec
Definition ESTLType.h:46
@ kSTLvector
Definition ESTLType.h:30
int CompressionSettings(RCompressionSetting::EAlgorithm::EValues algorithm, int compressionLevel)
ROOT::ESTLType STLKind(std::string_view type)
Converts STL container name to number.
ROOT::ESTLType IsSTLCont(std::string_view type)
type : type name: vector<list<classA,allocator>,allocator> result: 0 : not stl container code of cont...
Stores empty instances of classes, so a dummy object can be written when a systematic variation doesn...
Stores variations of a fundamental type.
Stores properties of each output branch in a Snapshot.
void * EmptyInstance(bool pointerToPointer)
Return a pointer to an empty instance of the type represented by this branch.
void ClearBranchContents()
Point the branch address to an empty instance of the type represented by this branch or write null by...
std::variant< FundamentalType, EmptyDynamicType > fTypeData
const std::type_info * fInputTypeID
void Write() const
Write the current event and the bitmask to the output dataset.
void ClearMaskBits()
Clear all bits, as if none of the variations passed its filter.
SnapshotOutputWriter(SnapshotOutputWriter const &)=delete
std::unordered_map< std::string, std::pair< std::string, unsigned int > > fBranchToBitmaskMapping
void RegisterBranch(std::string const &branchName, unsigned int variationIndex)
Register a branch and corresponding systematic uncertainty.
void SetMaskBit(unsigned int index)
Set a bit signalling that the variation at index passed its filter.
bool MaskEmpty() const
Test if any of the mask bits are set.
SnapshotOutputWriter & operator=(SnapshotOutputWriter const &)=delete
std::unordered_map< std::string, unsigned int > fBranchToVariationMapping
SnapshotOutputWriter(SnapshotOutputWriter &&) noexcept=delete
Tag to let data sources use the native data type when creating a column reader.
Definition Utils.hxx:347
EValues
Note: this is only temporarily a struct and will become a enum class hence the name convention used.
Definition Compression.h:88
A collection of options to steer the creation of the dataset on disk through Snapshot().
ESnapshotOutputFormat fOutputFormat
Which data format to write to.
std::string fMode
Mode of creation of output file.
ECAlgo fCompressionAlgorithm
Compression algorithm of output file.
int fCompressionLevel
Compression level of output file.
bool fOverwriteIfExists
If fMode is "UPDATE", overwrite object in output file if it already exists.