30 static RLogChannel sLog(
"ROOT.RNTupleExporter");
34struct RColumnExportInfo {
39 RColumnExportInfo(
const RNTupleDescriptor &desc,
const RColumnDescriptor &colDesc,
const RFieldDescriptor &fieldDesc)
44 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) +
'-' + std::to_string(colDesc.GetIndex()))
50 const RFieldDescriptor &fieldDesc)
52 R__LOG_DEBUG(1, RNTupleExporterLog()) <<
"processing field \"" << desc.GetQualifiedFieldName(fieldDesc.GetId())
55 for (
const auto &subfieldDesc : desc.GetFieldIterable(fieldDesc)) {
56 if (subfieldDesc.IsProjectedField())
59 for (
const auto &colDesc : desc.GetColumnIterable(subfieldDesc)) {
60 vec.emplace_back(desc, colDesc, subfieldDesc);
66int CountPages(
const RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
71 const auto &clusterDesc = desc.GetClusterDescriptor(clusterId);
72 for (
const auto &colInfo : columns) {
73 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
74 nPages += pages.fPageInfos.size();
76 clusterId = desc.FindNextClusterId(clusterId);
94 std::vector<RColumnExportInfo> columnInfos;
99 columnSet.reserve(columnInfos.size());
100 for (
const auto &colInfo : columnInfos) {
101 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
104 const auto nPages = CountPages(desc.GetRef(), columnInfos);
111 int pagesExported = 0;
112 int prevIntPercent = 0;
114 const auto &clusterDesc = desc->GetClusterDescriptor(clusterId);
115 const RCluster *cluster = clusterPool.GetCluster(clusterId, columnSet);
116 for (
const auto &colInfo : columnInfos) {
118 const auto &pages = clusterDesc.GetPageRange(columnId);
119 const auto &colRange = clusterDesc.GetColumnRange(columnId);
120 std::uint64_t pageIdx = 0;
123 <<
"exporting column \"" << colInfo.fQualifiedName <<
"\" (" << pages.fPageInfos.size() <<
" pages)";
126 assert(!colRange.fIsSuppressed || pages.fPageInfos.empty());
128 for (
const auto &pageInfo : pages.fPageInfos) {
133 const void *pageBuf = onDiskPage->
GetAddress();
135 const std::size_t maybeChecksumSize = incChecksum * 8;
136 const std::uint64_t pageBufSize = pageInfo.fLocator.fBytesOnStorage + maybeChecksumSize;
137 std::ostringstream ss{options.
fOutputPath, std::ios_base::ate};
138 ss <<
"/cluster_" << clusterDesc.GetId() <<
"_" << colInfo.fQualifiedName <<
"_page_" << pageIdx
139 <<
"_elems_" << pageInfo.fNElements <<
"_comp_" << colRange.fCompressionSettings <<
".page";
140 const auto outFileName = ss.str();
141 std::ofstream outFile{outFileName, std::ios_base::binary};
142 outFile.write(
reinterpret_cast<const char *
>(pageBuf), pageBufSize);
146 ++pageIdx, ++pagesExported;
149 int intPercent =
static_cast<int>(100.f * pagesExported / res.
fExportedFileNames.size());
150 if (intPercent != prevIntPercent) {
151 fprintf(stderr,
"\rExport progress: %02d%%", intPercent);
152 if (intPercent == 100)
153 fprintf(stderr,
"\n");
154 prevIntPercent = intPercent;
159 clusterId = desc->FindNextClusterId(clusterId);
#define R__LOG_DEBUG(DEBUGLEVEL,...)
const RColumnDescriptor * fColDesc
const RFieldDescriptor * fFieldDesc
std::string fQualifiedName
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const RFieldDescriptor &srcFieldDesc, const RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
Manages a set of clusters containing compressed and packed pages.
An in-memory subset of the packed and compressed pages of a cluster.
const ROnDiskPage * GetOnDiskPage(const ROnDiskPage::Key &key) const
std::unordered_set< DescriptorId_t > ColumnSet_t
static RPagesResult ExportPages(RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
A page as being stored on disk, that is packed and compressed.
const void * GetAddress() const
Abstract interface to read data from an ntuple.
const RSharedDescriptorGuard GetSharedDescriptorGuard() const
Takes the read lock for the descriptor.
void Attach()
Open the physical storage container and deserialize header and footer.
A log configuration for a channel.
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
@ kShowProgressBar
If enabled, the exporter will report the current progress on stderr.
std::vector< std::string > fExportedFileNames
On-disk pages within a page source are identified by the column and page number.