Logo ROOT  
Reference Guide
 
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
Loading...
Searching...
No Matches
RNTupleExporter.cxx
Go to the documentation of this file.
1/// \file RNTupleExporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Giacomo Parolini <giacomo.parolini@cern.ch>
4/// \date 2024-12-10
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RPageStorage.hxx>
19#include <ROOT/RClusterPool.hxx>
20#include <ROOT/RLogger.hxx>
21#include <fstream>
22#include <sstream>
23
25
26namespace {
27
// NOTE(review): the signature line of this function is not present in this listing. The
// `RNTupleExporterLog()` call inside AddColumnsFromField presumably refers to it -- confirm.
29{
// Function-local static: the "ROOT.RNTupleExporter" channel is constructed on first use and the
// same object is returned by every call.
30 static RLogChannel sLog("ROOT.RNTupleExporter");
31 return sLog;
32}
33
/// Per-column record collected before exporting: it holds a pointer to the column descriptor and the
/// qualified name that is later embedded in the file name of each exported page.
// NOTE(review): the listing lines that declare the pointer members and the constructor's third
// parameter (`fieldDesc`, used in the initializer list below) are absent here; the page's symbol
// index lists `fColDesc` and `fFieldDesc` -- confirm against the real header/source.
34struct RColumnExportInfo {
// Name of the form "<qualified field name>-<column index>", built in the constructor.
37 std::string fQualifiedName;
38
// Stores the address of `colDesc` (no copy is made) and builds fQualifiedName from the field's
// qualified name and the column's index.
39 RColumnExportInfo(const ROOT::RNTupleDescriptor &desc, const ROOT::RColumnDescriptor &colDesc,
41 : fColDesc(&colDesc),
43 // NOTE: we don't need to take the column representation index into account because exactly 1 representation
44 // is active per page, so there is no risk of name collisions.
45 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) + '-' + std::to_string(colDesc.GetIndex()))
46 {
47 }
48};
49
/// Tally returned by AddColumnsFromField.
struct RAddColumnsResult {
   /// Number of columns visited, counting both the columns that were kept and the ones that were
   /// rejected by the column type filter.
   int fNColsTotal = 0;

   /// Adds the tally of `other` to this one and returns a reference to this object, so that the
   /// results of recursive calls can be accumulated.
   RAddColumnsResult &operator+=(const RAddColumnsResult &other)
   {
      fNColsTotal += other.fNColsTotal;
      return *this;
   }
};
59
/// Tells whether `item` is rejected by `filter`.
/// \param filter Filter whose `fSet` is searched for `item`.
/// \param item The value to test.
/// \return The value of `isFiltered` (see the note in the body).
60template <typename T>
61bool ItemIsFilteredOut(const RNTupleExporter::RFilter<T> &filter, const T &item)
62{
// True when `item` is a member of the filter's set.
63 bool filterHasType = filter.fSet.find(item) != filter.fSet.end();
// NOTE(review): the statement that defines `isFiltered` is absent from this listing. Presumably it
// combines `filterHasType` with the filter's mode (the page's symbol index mentions a `kBlacklist`
// mode meaning "don't export items contained in the filter's set") -- confirm against the real source.
65 return isFiltered;
66}
67
/// Recursively walks the subfields of `fieldDesc` and appends one RColumnExportInfo to `vec` for
/// every column whose type is not rejected by `options.fColumnTypeFilter`.
/// Projected subfields are skipped together with everything below them.
/// \param vec Output vector; matching columns are appended to it.
/// \param desc Descriptor used to enumerate subfields and columns.
/// \param options Export options; only `fColumnTypeFilter` is read here.
/// \return The number of columns visited, including the ones that were filtered out.
// NOTE(review): the listing line declaring the `fieldDesc` parameter (used throughout the body)
// is absent here.
68RAddColumnsResult AddColumnsFromField(std::vector<RColumnExportInfo> &vec, const ROOT::RNTupleDescriptor &desc,
70 const RNTupleExporter::RPagesOptions &options)
71{
72 R__LOG_DEBUG(1, RNTupleExporterLog()) << "processing field \"" << desc.GetQualifiedFieldName(fieldDesc.GetId())
73 << "\"";
74
75 RAddColumnsResult res{};
76
77 for (const auto &subfieldDesc : desc.GetFieldIterable(fieldDesc)) {
// `continue` also bypasses the recursive call at the end of the loop body.
78 if (subfieldDesc.IsProjectedField())
79 continue;
80
81 for (const auto &colDesc : desc.GetColumnIterable(subfieldDesc)) {
82 // Filter columns by type
83 bool typeIsFiltered = ItemIsFilteredOut(options.fColumnTypeFilter, colDesc.GetType());
84 if (!typeIsFiltered)
85 vec.emplace_back(desc, colDesc, subfieldDesc);
// The `if` above has no braces, so this counter is bumped for every column, kept or not.
86 res.fNColsTotal += 1;
87 }
// Descend into the subfield and add its tally to ours.
88 res += AddColumnsFromField(vec, desc, subfieldDesc, options);
89 }
90
91 return res;
92}
93
/// Sums the number of page infos that the given columns have in the cluster descriptor(s) of `desc`.
/// \param desc Descriptor providing the cluster descriptors.
/// \param columns Columns whose pages are counted.
/// \return The accumulated page count.
// NOTE(review): two listing lines are absent here (one after the FindClusterId call and one before
// the extra closing brace). Together with that unmatched brace they presumably form a loop over all
// clusters -- confirm against the real source.
94int CountPages(const ROOT::RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
95{
96 int nPages = 0;
97 auto clusterId = desc.FindClusterId(0, 0);
99 const auto &clusterDesc = desc.GetClusterDescriptor(clusterId);
100 for (const auto &colInfo : columns) {
// Page range of this column within the current cluster, looked up by physical column id.
101 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
// size() is unsigned while nPages is an int; the sum is converted back to int on assignment.
102 nPages += pages.GetPageInfos().size();
103 }
105 }
106 return nPages;
107}
108
109} // namespace
110
/// Writes the on-disk pages of the selected columns to individual files (one file per page) under
/// `options.fOutputPath` and returns the list of file names that were written.
/// Throws ROOT::RException when an output file cannot be opened.
// NOTE(review): several lines of this function are absent from this listing: the signature line
// (the page's symbol index shows `ExportPages(ROOT::Internal::RPageSource &source, const
// RPagesOptions &options={})`), the declarations of `clusterPool`, `columnSet` and `key`, the
// header and advance step of the cluster loop, the update of `pagesExported` / `prevIntPercent`,
// and the statement that emits the final summary. Comments below only describe what is visible.
111RNTupleExporter::RPagesResult
113{
114 RPagesResult res = {};
115
116 // make sure the source is attached
117 source.Attach();
118
119 auto desc = source.GetSharedDescriptorGuard();
121
122 // Collect column info
123 std::vector<RColumnExportInfo> columnInfos;
124 const RAddColumnsResult addColRes = AddColumnsFromField(columnInfos, desc.GetRef(), desc->GetFieldZero(), options);
125
126 // Collect ColumnSet for the cluster pool query
128 columnSet.reserve(columnInfos.size());
129 for (const auto &colInfo : columnInfos) {
130 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
131 }
132
// Total number of pages to export; used to pre-size the result vector and to decide whether
// there is anything to iterate over.
133 const auto nPages = CountPages(desc.GetRef(), columnInfos);
134
135 const bool showProgress = (options.fFlags & RPagesOptions::kShowProgressBar) != 0;
136 res.fExportedFileNames.reserve(nPages);
137
138 // Iterate over the clusters in order and dump pages
// With no pages at all the start id is the invalid id, so no cluster is looked up.
139 auto clusterId = nPages > 0 ? desc->FindClusterId(0, 0) : ROOT::kInvalidDescriptorId;
140 int pagesExported = 0;
141 int prevIntPercent = 0;
143 const auto &clusterDesc = desc->GetClusterDescriptor(clusterId);
// Ask the cluster pool for the cluster restricted to the columns collected above.
144 const RCluster *cluster = clusterPool.GetCluster(clusterId, columnSet);
145 for (const auto &colInfo : columnInfos) {
146 auto columnId = colInfo.fColDesc->GetPhysicalId();
147 const auto &pages = clusterDesc.GetPageRange(columnId);
148 const auto &colRange = clusterDesc.GetColumnRange(columnId);
149 std::uint64_t pageIdx = 0;
150
152 << "exporting column \"" << colInfo.fQualifiedName << "\" (" << pages.GetPageInfos().size() << " pages)";
153
154 // We should never try to export a suppressed column range
155 assert(!colRange.IsSuppressed() || pages.GetPageInfos().empty());
156
157 for (const auto &pageInfo : pages.GetPageInfos()) {
159 const ROnDiskPage *onDiskPage = cluster->GetOnDiskPage(key);
160
161 // dump the page
162 const void *pageBuf = onDiskPage->GetAddress();
// The checksum is appended to the dump only if the caller asked for it AND the page has one.
163 const bool incChecksum = (options.fFlags & RPagesOptions::kIncludeChecksums) != 0 && pageInfo.HasChecksum();
// bool * 8: either 0 or 8 extra bytes are written after the page payload.
164 const std::size_t maybeChecksumSize = incChecksum * 8;
165 const std::uint64_t pageBufSize = pageInfo.GetLocator().GetNBytesOnStorage() + maybeChecksumSize;
// `ate` positions the stream at the end of the initial string, so the `<<` below appends to
// the output path instead of overwriting it.
166 std::ostringstream ss{options.fOutputPath, std::ios_base::ate};
// The compression settings are dereferenced below, hence the check that they are set.
167 assert(colRange.GetCompressionSettings());
// File name: <out>/cluster_<id>_<qualified column name>_page_<idx>_elems_<n>_comp_<settings>.page
168 ss << "/cluster_" << clusterDesc.GetId() << "_" << colInfo.fQualifiedName << "_page_" << pageIdx
169 << "_elems_" << pageInfo.GetNElements() << "_comp_" << *colRange.GetCompressionSettings() << ".page";
170 const auto outFileName = ss.str();
171 std::ofstream outFile{outFileName, std::ios_base::binary};
172 if (!outFile)
173 throw ROOT::RException(
174 R__FAIL(std::string("output path ") + options.fOutputPath + " does not exist or is not writable!"));
175
// NOTE(review): the stream state is not checked after the write, so a short or failed write
// goes unnoticed here.
176 outFile.write(reinterpret_cast<const char *>(pageBuf), pageBufSize);
177
178 res.fExportedFileNames.push_back(outFileName);
179
181
182 if (showProgress) {
// NOTE(review): the denominator is the number of files exported so far, not nPages. If
// pagesExported is incremented once per written page (the increment is not visible in this
// listing, but the assert after the loops expects both counts to match), this ratio is
// always 1 and the progress jumps straight to 100% -- confirm whether nPages was intended.
183 int intPercent = static_cast<int>(100.f * pagesExported / res.fExportedFileNames.size());
// Only redraw when the integer percentage changed; '\r' rewrites the same terminal line.
184 if (intPercent != prevIntPercent) {
185 fprintf(stderr, "\rExport progress: %02d%%", intPercent);
186 if (intPercent == 100)
187 fprintf(stderr, "\n");
189 }
190 }
191 }
192 }
194 }
195
196 assert(res.fExportedFileNames.size() == static_cast<size_t>(pagesExported));
// Build the summary: either the total column count, or "<filtered>/<total>" when a column type
// filter is in use.
197 std::ostringstream ss;
198 ss << "exported " << res.fExportedFileNames.size() << " pages (";
199 if (options.fColumnTypeFilter.fSet.empty()) {
200 ss << addColRes.fNColsTotal << " columns)";
201 } else {
// Columns visited minus columns kept = columns rejected by the filter.
202 auto nColsFilteredOut = addColRes.fNColsTotal - columnInfos.size();
203 ss << nColsFilteredOut << "/" << addColRes.fNColsTotal << " columns filtered out)";
204 }
206
207 return res;
208}
209
210} // namespace ROOT::Experimental::Internal
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:299
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition RLogger.hxx:360
#define R__LOG_INFO(...)
Definition RLogger.hxx:359
const ROOT::RFieldDescriptor * fFieldDesc
int fNColsTotal
std::string fQualifiedName
const ROOT::RColumnDescriptor * fColDesc
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const ROOT::RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const ROOT::RFieldDescriptor &srcFieldDesc, const ROOT::RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:486
Manages a set of clusters containing compressed and packed pages.
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:152
std::unordered_set< ROOT::DescriptorId_t > ColumnSet_t
Definition RCluster.hxx:154
@ kBlacklist
Don't export items contained in the filter's set.
static RPagesResult ExportPages(ROOT::Internal::RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
A page as being stored on disk, that is packed and compressed.
Definition RCluster.hxx:42
Abstract interface to read data from an ntuple.
Metadata stored for every column of an RNTuple.
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
Metadata stored for every field of an RNTuple.
A log configuration for a channel, e.g.
Definition RLogger.hxx:98
The on-storage metadata of an RNTuple.
ROOT::DescriptorId_t FindNextClusterId(ROOT::DescriptorId_t clusterId) const
ROOT::DescriptorId_t FindClusterId(ROOT::NTupleSize_t entryIdx) const
const RClusterDescriptor & GetClusterDescriptor(ROOT::DescriptorId_t clusterId) const
std::string GetQualifiedFieldName(ROOT::DescriptorId_t fieldId) const
Walks up the parents of the field ID and returns a field name of the form a.b.c.d In case of invalid ...
const RFieldDescriptor & GetFieldZero() const
const_iterator end() const
constexpr DescriptorId_t kInvalidDescriptorId
RFilter< ENTupleColumnType > fColumnTypeFilter
Optional filter that determines which columns are included or excluded from being exported.
@ kShowProgressBar
If enabled, the exporter will report the current progress on the stderr.
On-disk pages within a page source are identified by the column and page number.
Definition RCluster.hxx:52