Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RNTupleExporter.cxx
Go to the documentation of this file.
1/// \file RNTupleExporter.cxx
2/// \ingroup NTuple ROOT7
3/// \author Giacomo Parolini <giacomo.parolini@cern.ch>
4/// \date 2024-12-10
5/// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
6/// is welcome!
7
8/*************************************************************************
9 * Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
10 * All rights reserved. *
11 * *
12 * For the licensing terms see $ROOTSYS/LICENSE. *
13 * For the list of contributors see $ROOTSYS/README/CREDITS. *
14 *************************************************************************/
15
17#include <ROOT/RPageStorage.hxx>
19#include <ROOT/RClusterPool.hxx>
20#include <ROOT/RLogger.hxx>
21#include <fstream>
22#include <sstream>
23
25
26namespace {
27
28ROOT::Experimental::RLogChannel &RNTupleExporterLog()
29{
30 static RLogChannel sLog("ROOT.RNTupleExporter");
31 return sLog;
32}
33
34struct RColumnExportInfo {
35 const RColumnDescriptor *fColDesc;
36 const RFieldDescriptor *fFieldDesc;
37 std::string fQualifiedName;
38
39 RColumnExportInfo(const RNTupleDescriptor &desc, const RColumnDescriptor &colDesc, const RFieldDescriptor &fieldDesc)
40 : fColDesc(&colDesc),
41 fFieldDesc(&fieldDesc),
42 // NOTE: we don't need to keep the column representation index into account because exactly 1 representation
43 // is active per page, so there is no risk of name collisions.
44 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) + '-' + std::to_string(colDesc.GetIndex()))
45 {
46 }
47};
48
49void AddColumnsFromField(std::vector<RColumnExportInfo> &vec, const RNTupleDescriptor &desc,
50 const RFieldDescriptor &fieldDesc)
51{
52 R__LOG_DEBUG(1, RNTupleExporterLog()) << "processing field \"" << desc.GetQualifiedFieldName(fieldDesc.GetId())
53 << "\"";
54
55 for (const auto &subfieldDesc : desc.GetFieldIterable(fieldDesc)) {
56 if (subfieldDesc.IsProjectedField())
57 continue;
58
59 for (const auto &colDesc : desc.GetColumnIterable(subfieldDesc)) {
60 vec.emplace_back(desc, colDesc, subfieldDesc);
61 }
62 AddColumnsFromField(vec, desc, subfieldDesc);
63 }
64}
65
66int CountPages(const RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
67{
68 int nPages = 0;
69 DescriptorId_t clusterId = desc.FindClusterId(0, 0);
70 while (clusterId != kInvalidDescriptorId) {
71 const auto &clusterDesc = desc.GetClusterDescriptor(clusterId);
72 for (const auto &colInfo : columns) {
73 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
74 nPages += pages.fPageInfos.size();
75 }
76 clusterId = desc.FindNextClusterId(clusterId);
77 }
78 return nPages;
79}
80
81} // namespace
82
84{
85 RPagesResult res = {};
86
87 // make sure the source is attached
88 source.Attach();
89
90 auto desc = source.GetSharedDescriptorGuard();
91 RClusterPool clusterPool{source};
92
93 // Collect column info
94 std::vector<RColumnExportInfo> columnInfos;
95 AddColumnsFromField(columnInfos, desc.GetRef(), desc->GetFieldZero());
96
97 // Collect ColumnSet for the cluster pool query
98 RCluster::ColumnSet_t columnSet;
99 columnSet.reserve(columnInfos.size());
100 for (const auto &colInfo : columnInfos) {
101 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
102 }
103
104 const auto nPages = CountPages(desc.GetRef(), columnInfos);
105
106 const bool showProgress = (options.fFlags & RPagesOptions::kShowProgressBar) != 0;
107 res.fExportedFileNames.reserve(nPages);
108
109 // Iterate over the clusters in order and dump pages
110 DescriptorId_t clusterId = desc->FindClusterId(0, 0);
111 int pagesExported = 0;
112 int prevIntPercent = 0;
113 while (clusterId != kInvalidDescriptorId) {
114 const auto &clusterDesc = desc->GetClusterDescriptor(clusterId);
115 const RCluster *cluster = clusterPool.GetCluster(clusterId, columnSet);
116 for (const auto &colInfo : columnInfos) {
117 DescriptorId_t columnId = colInfo.fColDesc->GetPhysicalId();
118 const auto &pages = clusterDesc.GetPageRange(columnId);
119 const auto &colRange = clusterDesc.GetColumnRange(columnId);
120 std::uint64_t pageIdx = 0;
121
122 R__LOG_DEBUG(0, RNTupleExporterLog())
123 << "exporting column \"" << colInfo.fQualifiedName << "\" (" << pages.fPageInfos.size() << " pages)";
124
125 // We should never try to export a suppressed column range
126 assert(!colRange.fIsSuppressed || pages.fPageInfos.empty());
127
128 for (const auto &pageInfo : pages.fPageInfos) {
129 ROnDiskPage::Key key{columnId, pageIdx};
130 const ROnDiskPage *onDiskPage = cluster->GetOnDiskPage(key);
131
132 // dump the page
133 const void *pageBuf = onDiskPage->GetAddress();
134 const bool incChecksum = (options.fFlags & RPagesOptions::kIncludeChecksums) != 0 && pageInfo.fHasChecksum;
135 const std::size_t maybeChecksumSize = incChecksum * 8;
136 const std::uint64_t pageBufSize = pageInfo.fLocator.fBytesOnStorage + maybeChecksumSize;
137 std::ostringstream ss{options.fOutputPath, std::ios_base::ate};
138 ss << "/cluster_" << clusterDesc.GetId() << "_" << colInfo.fQualifiedName << "_page_" << pageIdx
139 << "_elems_" << pageInfo.fNElements << "_comp_" << colRange.fCompressionSettings << ".page";
140 const auto outFileName = ss.str();
141 std::ofstream outFile{outFileName, std::ios_base::binary};
142 outFile.write(reinterpret_cast<const char *>(pageBuf), pageBufSize);
143
144 res.fExportedFileNames.push_back(outFileName);
145
146 ++pageIdx, ++pagesExported;
147
148 if (showProgress) {
149 int intPercent = static_cast<int>(100.f * pagesExported / res.fExportedFileNames.size());
150 if (intPercent != prevIntPercent) {
151 fprintf(stderr, "\rExport progress: %02d%%", intPercent);
152 if (intPercent == 100)
153 fprintf(stderr, "\n");
154 prevIntPercent = intPercent;
155 }
156 }
157 }
158 }
159 clusterId = desc->FindNextClusterId(clusterId);
160 }
161
162 assert(res.fExportedFileNames.size() == static_cast<size_t>(pagesExported));
163 R__LOG_INFO(RNTupleExporterLog()) << "exported " << res.fExportedFileNames.size() << " pages.";
164
165 return res;
166}
167
168} // namespace ROOT::Experimental::Internal
#define R__LOG_DEBUG(DEBUGLEVEL,...)
Definition RLogger.hxx:365
#define R__LOG_INFO(...)
Definition RLogger.hxx:364
const RColumnDescriptor * fColDesc
const RFieldDescriptor * fFieldDesc
std::string fQualifiedName
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const RFieldDescriptor &srcFieldDesc, const RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
Manages a set of clusters containing compressed and packed pages.
An in-memory subset of the packed and compressed pages of a cluster.
Definition RCluster.hxx:152
const ROnDiskPage * GetOnDiskPage(const ROnDiskPage::Key &key) const
Definition RCluster.cxx:32
std::unordered_set< DescriptorId_t > ColumnSet_t
Definition RCluster.hxx:154
static RPagesResult ExportPages(RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
A page as being stored on disk, that is packed and compressed.
Definition RCluster.hxx:42
Abstract interface to read data from an ntuple.
const RSharedDescriptorGuard GetSharedDescriptorGuard() const
Takes the read lock for the descriptor.
void Attach()
Open the physical storage container and deserialize header and footer.
A log configuration for a channel, e.g.
Definition RLogger.hxx:101
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
@ kShowProgressBar
If enabled, the exporter will report the current progress on the stderr.
On-disk pages within a page source are identified by the column and page number.
Definition RCluster.hxx:52