Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleExporter.cxx
Go to the documentation of this file.
1/// \file RNTupleExporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Giacomo Parolini <giacomo.parolini@cern.ch>
4/// \date 2024-12-10
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
16#include <ROOT/RError.hxx>
18#include <ROOT/RPageStorage.hxx>
20#include <ROOT/RNTupleZip.hxx>
21#include <ROOT/RClusterPool.hxx>
22#include <ROOT/RLogger.hxx>
23#include <fstream>
24#include <sstream>
25#include <vector>
26
28
29namespace {
30
32{
33 static RLogChannel sLog("ROOT.RNTupleExporter");
34 return sLog;
35}
36
/// Per-column record built while walking the field tree: the address of the column's descriptor (fColDesc) and the
/// name fragment (fQualifiedName) that later identifies the column in exported page file names.
/// The constructor stores the address of `colDesc`, so the descriptor must outlive this object.
/// NOTE(review): listing lines 38, 39, 43 and 45 (member declarations and the rest of the constructor's parameter
/// and initializer list) are missing from this extract; only what is visible below is described.
37struct RColumnExportInfo {
 /// "<qualified field name>-<column index>", assembled in the constructor below.
40 std::string fQualifiedName;
41
42 RColumnExportInfo(const ROOT::RNTupleDescriptor &desc, const ROOT::RColumnDescriptor &colDesc,
44 : fColDesc(&colDesc),
46 // NOTE: we don't need to keep the column representation index into account because exactly 1 representation
47 // is active per page, so there is no risk of name collisions.
48 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) + '-' + std::to_string(colDesc.GetIndex()))
49 {
50 }
51};
52
/// Running tally returned by the column-collection walk; results of nested walks are merged with operator+=.
53struct RAddColumnsResult {
 /// Number of columns counted so far.
54 int fNColsTotal = 0;
55
 /// Adds the column count of `other` to this tally and returns *this so calls can be chained.
56 RAddColumnsResult &operator+=(const RAddColumnsResult &other)
57 {
58 fNColsTotal += other.fNColsTotal;
59 return *this;
60 }
61};
62
/// Tells whether `item` is rejected by `filter`.
/// \param filter The filter to consult; its fSet member is searched for `item`.
/// \param item The value to test.
/// \return The value of `isFiltered`.
/// NOTE(review): `isFiltered` is computed on listing line 67, which is missing from this extract. Presumably it
/// combines the set-membership result below with the filter's whitelist/blacklist mode; confirm against the
/// original file.
63template <typename T>
64bool ItemIsFilteredOut(const RNTupleExporter::RFilter<T> &filter, const T &item)
65{
 // True when `item` is a member of the filter's set.
66 bool filterHasType = filter.fSet.find(item) != filter.fSet.end();
68 return isFiltered;
69}
70
/// Recursively walks the sub-fields of `fieldDesc` and appends one RColumnExportInfo to `vec` for every column
/// whose type passes options.fColumnTypeFilter. Projected fields are skipped together with their whole subtree.
/// \param vec Output list; matching columns are appended in traversal order.
/// \param desc Descriptor used to enumerate fields and columns.
/// \param options Export options; only fColumnTypeFilter is read here.
/// \return A tally whose fNColsTotal counts every column visited, including those rejected by the filter.
/// NOTE(review): listing line 72 (the part of the signature declaring `fieldDesc`) is missing from this extract.
71RAddColumnsResult AddColumnsFromField(std::vector<RColumnExportInfo> &vec, const ROOT::RNTupleDescriptor &desc,
73 const RNTupleExporter::RPagesOptions &options)
74{
75 R__LOG_DEBUG(1, RNTupleExporterLog()) << "processing field \"" << desc.GetQualifiedFieldName(fieldDesc.GetId())
76 << "\"";
77
78 RAddColumnsResult res{};
79
80 for (const auto &subfieldDesc : desc.GetFieldIterable(fieldDesc)) {
 // Skipping here also skips the recursive call at the bottom of the loop body.
81 if (subfieldDesc.IsProjectedField())
82 continue;
83
84 for (const auto &colDesc : desc.GetColumnIterable(subfieldDesc)) {
85 // Filter columns by type
86 bool typeIsFiltered = ItemIsFilteredOut(options.fColumnTypeFilter, colDesc.GetType());
87 if (!typeIsFiltered)
88 vec.emplace_back(desc, colDesc, subfieldDesc);
 // Not guarded by the `if` above: filtered-out columns are counted as well.
89 res.fNColsTotal += 1;
90 }
 // Descend into the children of this sub-field and merge their tally.
91 res += AddColumnsFromField(vec, desc, subfieldDesc, options);
92 }
93
94 return res;
95}
96
/// Counts the pages belonging to the given columns by summing the sizes of their page ranges' page-info lists.
/// \param desc Descriptor used to look up clusters.
/// \param columns Columns whose pages are counted; each entry's fColDesc supplies the physical column id.
/// \return The accumulated page count, as an int.
/// NOTE(review): listing lines 101 and 107 are missing from this extract. Judging by the unmatched closing brace
/// below they presumably hold the loop over clusters and the step to the next cluster id; confirm against the
/// original file.
97int CountPages(const ROOT::RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
98{
99 int nPages = 0;
 // Start from the cluster returned by FindClusterId(0, 0).
100 auto clusterId = desc.FindClusterId(0, 0);
102 const auto &clusterDesc = desc.GetClusterDescriptor(clusterId);
103 for (const auto &colInfo : columns) {
104 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
105 nPages += pages.GetPageInfos().size();
106 }
108 }
109 return nPages;
110}
111
112} // namespace
113
/// Writes the pages of the selected columns of `source` to individual files below options.fOutputPath, one file per
/// page, and returns the list of written file names in RPagesResult::fExportedFileNames.
/// Each file is named
/// "<fOutputPath>/cluster_<clusterId>_<qualifiedColumnName>_page_<pageIdx>_elems_<nElements>_comp_<compression>.page".
/// \throws ROOT::RException if an output file cannot be opened, and for the flag conflict reported at the top of
/// the body.
/// NOTE(review): this extract is missing listing lines 115 (rest of the signature), 117, 126, 133, 149, 151, 159,
/// 166, 188, 200, 208, 213 and 225; comments below only describe what is visible and flag what is not.
114RNTupleExporter::RPagesResult
116{
 // NOTE(review): the condition guarding this throw is on the missing listing line 117; going by the message it
 // presumably fires when both checksum export and decompression are requested.
118 throw ROOT::RException(R__FAIL("exporting checksums is incompatible with decompressing the pages"));
119
120 RPagesResult res = {};
121
122 // make sure the source is attached
123 source.Attach();
124
125 auto desc = source.GetSharedDescriptorGuard();
127
128 // Collect column info
129 std::vector<RColumnExportInfo> columnInfos;
130 const RAddColumnsResult addColRes = AddColumnsFromField(columnInfos, desc.GetRef(), desc->GetFieldZero(), options);
131
132 // Collect ColumnSet for the cluster pool query
 // (the declaration of columnSet is on the missing listing line 133)
134 columnSet.reserve(columnInfos.size());
135 for (const auto &colInfo : columnInfos) {
136 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
137 }
138
139 const auto nPages = CountPages(desc.GetRef(), columnInfos);
140
141 const bool showProgress = (options.fFlags & RPagesOptions::kShowProgressBar) != 0;
 // reserve() only pre-allocates: size() stays 0 here and grows by one per push_back in the page loop.
142 res.fExportedFileNames.reserve(nPages);
143
144 // Iterate over the clusters in order and dump pages
 // With no pages to export the id starts out invalid.
145 auto clusterId = nPages > 0 ? desc->FindClusterId(0, 0) : ROOT::kInvalidDescriptorId;
146 int pagesExported = 0;
147 int prevIntPercent = 0;
148 std::vector<char> unzipBuf; // Only used when pages get decompressed
150 const auto &clusterDesc = desc->GetClusterDescriptor(clusterId);
152 for (const auto &colInfo : columnInfos) {
153 auto columnId = colInfo.fColDesc->GetPhysicalId();
154 const auto &pages = clusterDesc.GetPageRange(columnId);
155 const auto &colRange = clusterDesc.GetColumnRange(columnId);
156 auto colElement = ROOT::Internal::RColumnElementBase::Generate<void>(colInfo.fColDesc->GetType());
 // Index of the page within this column and cluster; it becomes part of the file name.
157 std::uint64_t pageIdx = 0;
158
160 << "exporting column \"" << colInfo.fQualifiedName << "\" (" << pages.GetPageInfos().size() << " pages)";
161
162 // We should never try to export a suppressed column range
163 assert(!colRange.IsSuppressed() || pages.GetPageInfos().empty());
164
165 for (const auto &pageInfo : pages.GetPageInfos()) {
 // `cluster` and `key` are defined on listing lines missing from this extract (151 and 166).
167 const ROOT::Internal::ROnDiskPage *onDiskPage = cluster->GetOnDiskPage(key);
168
169 // prepare the output file
 // std::ios_base::ate puts the write position at the end of the initial string, so the `<<` calls below
 // append to the output path instead of overwriting it.
170 std::ostringstream ss{options.fOutputPath, std::ios_base::ate};
 // The compression settings are dereferenced when building the file name, so they must be present.
171 assert(colRange.GetCompressionSettings());
172 ss << "/cluster_" << clusterDesc.GetId() << "_" << colInfo.fQualifiedName << "_page_" << pageIdx
173 << "_elems_" << pageInfo.GetNElements() << "_comp_" << *colRange.GetCompressionSettings() << ".page";
174 const auto outFileName = ss.str();
175 std::ofstream outFile{outFileName, std::ios_base::binary};
176 if (!outFile) {
177 throw ROOT::RException(
178 R__FAIL(std::string("output path ") + options.fOutputPath + " does not exist or is not writable!"));
179 }
180
181 // dump the page
182 const auto *pageBuf = static_cast<const char *>(onDiskPage->GetAddress());
183 if (options.fFlags & RPagesOptions::kDecompress) {
 // Decompressed output: nbytesPacked bytes are written from unzipBuf.
184 const auto nbytesPacked = colElement->GetPackedSize(pageInfo.GetNElements());
185 const auto nbytesData = pageInfo.GetLocator().GetNBytesOnStorage();
 // The scratch buffer only ever grows, so it is reused across pages.
186 if (unzipBuf.size() < nbytesPacked)
187 unzipBuf.resize(nbytesPacked);
 // NOTE(review): the call that fills unzipBuf is on the missing listing line 188.
189 outFile.write(unzipBuf.data(), nbytesPacked);
190 } else {
 // Raw output: the on-storage bytes, extended by 8 bytes when checksums are requested and the page has one.
 // NOTE(review): this assumes the checksum directly follows the page payload in pageBuf - confirm.
191 const bool includeChecksum =
192 (options.fFlags & RPagesOptions::kIncludeChecksums) != 0 && pageInfo.HasChecksum();
193 const std::size_t maybeChecksumSize = includeChecksum * 8;
194 const auto nbytesData = pageInfo.GetLocator().GetNBytesOnStorage() + maybeChecksumSize;
195 outFile.write(pageBuf, nbytesData);
196 }
197
198 res.fExportedFileNames.push_back(outFileName);
199
201
202 if (showProgress) {
 // NOTE(review): the denominator is the number of file names pushed so far, not the total page count
 // (the vector was only reserve()d to nPages). If pagesExported is incremented on the missing listing
 // line 200, as the assert after the loops requires, the ratio is always 1 and the progress jumps
 // straight to 100%. nPages looks like the intended denominator - confirm against the original file.
203 int intPercent = static_cast<int>(100.f * pagesExported / res.fExportedFileNames.size());
204 if (intPercent != prevIntPercent) {
205 fprintf(stderr, "\rExport progress: %02d%%", intPercent);
206 if (intPercent == 100)
207 fprintf(stderr, "\n");
209 }
210 }
211 }
212 }
214 }
215
 // Every exported page must have contributed exactly one file name.
216 assert(res.fExportedFileNames.size() == static_cast<size_t>(pagesExported));
 // Build the summary message.
 // NOTE(review): where `ss` is consumed is on the missing listing line 225.
217 std::ostringstream ss;
218 ss << "exported " << res.fExportedFileNames.size() << " pages (";
219 if (options.fColumnTypeFilter.fSet.empty()) {
220 ss << addColRes.fNColsTotal << " columns)";
221 } else {
 // int minus size_t: the subtraction is carried out in an unsigned type.
222 auto nColsFilteredOut = addColRes.fNColsTotal - columnInfos.size();
223 ss << nColsFilteredOut << "/" << addColRes.fNColsTotal << " columns filtered out)";
224 }
226
227 return res;
228}
229
230} // namespace ROOT::Experimental::Internal
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
Definition RError.hxx:300
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition RLogger.hxx:360
#define R__LOG_INFO(...)
Definition RLogger.hxx:359
const ROOT::RFieldDescriptor * fFieldDesc
int fNColsTotal
std::string fQualifiedName
const ROOT::RColumnDescriptor * fColDesc
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const ROOT::RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const ROOT::RFieldDescriptor &srcFieldDesc, const ROOT::RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
std::string & operator+=(std::string &left, const TString &right)
Definition TString.h:494
@ kBlacklist
Don't export items contained in the filter's set.
static RPagesResult ExportPages(ROOT::Internal::RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
Manages a set of clusters containing compressed and packed pages.
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:148
std::unordered_set< ROOT::DescriptorId_t > ColumnSet_t
Definition RCluster.hxx:150
static void Unzip(const void *from, size_t nbytes, size_t dataLen, void *to)
The nbytes parameter provides the size of the from buffer.
A page as being stored on disk, that is packed and compressed.
Definition RCluster.hxx:41
Abstract interface to read data from an ntuple.
Metadata stored for every column of an RNTuple.
Base class for all ROOT issued exceptions.
Definition RError.hxx:79
Metadata stored for every field of an RNTuple.
A log configuration for a channel, e.g.
Definition RLogger.hxx:98
The on-storage metadata of an RNTuple.
ROOT::DescriptorId_t FindNextClusterId(ROOT::DescriptorId_t clusterId) const
ROOT::DescriptorId_t FindClusterId(ROOT::NTupleSize_t entryIdx) const
const RClusterDescriptor & GetClusterDescriptor(ROOT::DescriptorId_t clusterId) const
std::string GetQualifiedFieldName(ROOT::DescriptorId_t fieldId) const
Walks up the parents of the field ID and returns a field name of the form a.b.c.d In case of invalid ...
const RFieldDescriptor & GetFieldZero() const
const_iterator end() const
constexpr DescriptorId_t kInvalidDescriptorId
RFilter< ENTupleColumnType > fColumnTypeFilter
Optional filter that determines which columns are included or excluded from being exported.
@ kDecompress
If enabled, uncompress (but don't unpack) the page (mutually exclusive with kIncludeChecksums)
@ kShowProgressBar
If enabled, the exporter will report the current progress on the stderr.
On-disk pages within a page source are identified by the column and page number.
Definition RCluster.hxx:51