30ROOT::RLogChannel &RNTupleExporterLog()
32 static RLogChannel sLog(
"ROOT.RNTupleExporter");
36struct RColumnExportInfo {
37 const ROOT::RColumnDescriptor *fColDesc;
38 const ROOT::RFieldDescriptor *fFieldDesc;
39 std::string fQualifiedName;
41 RColumnExportInfo(
const ROOT::RNTupleDescriptor &desc,
const ROOT::RColumnDescriptor &colDesc,
42 const ROOT::RFieldDescriptor &fieldDesc)
44 fFieldDesc(&fieldDesc),
47 fQualifiedName(desc.GetQualifiedFieldName(fieldDesc.GetId()) +
'-' + std::to_string(colDesc.GetIndex()))
52struct RAddColumnsResult {
55 RAddColumnsResult &
operator+=(
const RAddColumnsResult &other)
57 fNColsTotal += other.fNColsTotal;
65 bool filterHasType = filter.fSet.find(item) != filter.fSet.end();
70RAddColumnsResult
AddColumnsFromField(std::vector<RColumnExportInfo> &vec,
const ROOT::RNTupleDescriptor &desc,
71 const ROOT::RFieldDescriptor &fieldDesc,
77 RAddColumnsResult res{};
80 if (subfieldDesc.IsProjectedField())
85 bool typeIsFiltered = ItemIsFilteredOut(options.fColumnTypeFilter, colDesc.GetType());
87 vec.emplace_back(desc, colDesc, subfieldDesc);
96int CountPages(
const ROOT::RNTupleDescriptor &desc, std::span<const RColumnExportInfo> columns)
102 for (
const auto &colInfo : columns) {
103 const auto &pages = clusterDesc.GetPageRange(colInfo.fColDesc->GetPhysicalId());
104 nPages += pages.GetPageInfos().size();
128 std::vector<RColumnExportInfo> columnInfos;
133 columnSet.reserve(columnInfos.size());
134 for (
const auto &colInfo : columnInfos) {
135 columnSet.emplace(colInfo.fColDesc->GetPhysicalId());
138 const auto nPages = CountPages(desc.GetRef(), columnInfos);
145 int pagesExported = 0;
146 int prevIntPercent = 0;
147 std::vector<char> unzipBuf;
151 for (
const auto &colInfo : columnInfos) {
152 auto columnId = colInfo.fColDesc->GetPhysicalId();
153 const auto &pages = clusterDesc.GetPageRange(columnId);
154 const auto &colRange = clusterDesc.GetColumnRange(columnId);
156 colElement->SetBitsOnStorage(colInfo.fColDesc->GetBitsOnStorage());
158 std::uint64_t pageIdx = 0;
161 <<
"exporting column \"" << colInfo.fQualifiedName <<
"\" (" << pages.GetPageInfos().size() <<
" pages)";
164 assert(!colRange.IsSuppressed() || pages.GetPageInfos().empty());
166 for (
const auto &pageInfo : pages.GetPageInfos()) {
171 std::ostringstream ss{options.
fOutputPath, std::ios_base::ate};
172 assert(colRange.GetCompressionSettings());
173 ss <<
"/cluster_" << clusterDesc.GetId() <<
"_" << colInfo.fQualifiedName <<
"_page_" << pageIdx
174 <<
"_elems_" << pageInfo.GetNElements() <<
"_comp_" << *colRange.GetCompressionSettings() <<
".page";
175 const auto outFileName = ss.str();
176 std::ofstream outFile{outFileName, std::ios_base::binary};
179 R__FAIL(std::string(
"output path ") + options.
fOutputPath +
" does not exist or is not writable!"));
183 const auto *pageBuf =
static_cast<const char *
>(onDiskPage->
GetAddress());
185 const auto nbytesPacked = colElement->GetPackedSize(pageInfo.GetNElements());
186 const auto nbytesData = pageInfo.GetLocator().GetNBytesOnStorage();
187 if (unzipBuf.size() < nbytesPacked)
188 unzipBuf.resize(nbytesPacked);
190 outFile.write(unzipBuf.data(), nbytesPacked);
192 const bool includeChecksum =
194 const std::size_t maybeChecksumSize = includeChecksum * 8;
195 const auto nbytesData = pageInfo.GetLocator().GetNBytesOnStorage() + maybeChecksumSize;
196 outFile.write(pageBuf, nbytesData);
201 ++pageIdx, ++pagesExported;
204 int intPercent =
static_cast<int>(100.f * pagesExported / res.
fExportedFileNames.size());
205 if (intPercent != prevIntPercent) {
206 fprintf(stderr,
"\rExport progress: %02d%%", intPercent);
207 if (intPercent == 100)
208 fprintf(stderr,
"\n");
209 prevIntPercent = intPercent;
218 std::ostringstream ss;
221 ss << addColRes.fNColsTotal <<
" columns)";
223 auto nColsFilteredOut = addColRes.fNColsTotal - columnInfos.size();
224 ss << nColsFilteredOut <<
"/" << addColRes.fNColsTotal <<
" columns filtered out)";
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
#define R__LOG_DEBUG(DEBUGLEVEL,...)
static void AddColumnsFromField(std::vector< RColumnMergeInfo > &columns, const ROOT::RNTupleDescriptor &srcDesc, RNTupleMergeData &mergeData, const ROOT::RFieldDescriptor &srcFieldDesc, const ROOT::RFieldDescriptor &dstFieldDesc, const std::string &prefix="")
std::string & operator+=(std::string &left, const TString &right)
@ kBlacklist
Don't export items contained in the filter's set.
static RPagesResult ExportPages(ROOT::Internal::RPageSource &source, const RPagesOptions &options={})
Given a page source, writes all its pages to individual files (1 per page).
Manages a set of clusters containing compressed and packed pages.
RCluster * GetCluster(ROOT::DescriptorId_t clusterId, const RCluster::ColumnSet_t &physicalColumns)
Returns the requested cluster either from the pool or, in case of a cache miss, lets the I/O thread l...
An in-memory subset of the packed and compressed pages of a cluster.
std::unordered_set< ROOT::DescriptorId_t > ColumnSet_t
const ROnDiskPage * GetOnDiskPage(const ROnDiskPage::Key &key) const
static std::unique_ptr< RColumnElementBase > Generate(ROOT::ENTupleColumnType type)
If CppT == void, use the default C++ type for the given column type.
static void Unzip(const void *from, size_t nbytes, size_t dataLen, void *to)
The nbytes parameter provides the size of the from buffer.
A page as being stored on disk, that is packed and compressed.
const void * GetAddress() const
Abstract interface to read data from an ntuple.
void Attach(ROOT::Internal::RNTupleSerializer::EDescriptorDeserializeMode mode=ROOT::Internal::RNTupleSerializer::EDescriptorDeserializeMode::kForReading)
Open the physical storage container and deserialize header and footer.
const RSharedDescriptorGuard GetSharedDescriptorGuard() const
Takes the read lock for the descriptor.
Base class for all ROOT issued exceptions.
ROOT::DescriptorId_t GetId() const
ROOT::DescriptorId_t FindNextClusterId(ROOT::DescriptorId_t clusterId) const
RFieldDescriptorIterable GetFieldIterable(const RFieldDescriptor &fieldDesc) const
RColumnDescriptorIterable GetColumnIterable() const
ROOT::DescriptorId_t FindClusterId(ROOT::NTupleSize_t entryIdx) const
const RClusterDescriptor & GetClusterDescriptor(ROOT::DescriptorId_t clusterId) const
std::string GetQualifiedFieldName(ROOT::DescriptorId_t fieldId) const
Walks up the parents of the field ID and returns a field name of the form a.b.c.d In case of invalid ...
const RFieldDescriptor & GetFieldZero() const
constexpr DescriptorId_t kInvalidDescriptorId
std::unordered_set< T > fSet
RFilter< ENTupleColumnType > fColumnTypeFilter
Optional filter that determines which columns are included or excluded from being exported.
@ kDecompress
If enabled, uncompress (but don't unpack) the page (mutually exclusive with kIncludeChecksums).
@ kShowProgressBar
If enabled, the exporter will report the current progress on stderr.
std::vector< std::string > fExportedFileNames
On-disk pages within a page source are identified by the column and page number.