37 auto descriptorGuard =
fPageSource->GetSharedDescriptorGuard();
52 for (
const auto &colDesc :
fDescriptor.GetColumnIterable()) {
53 if (colDesc.IsAliasColumn())
56 auto colId = colDesc.GetPhysicalId();
61 std::uint64_t nElems = 0;
62 std::vector<std::uint64_t> compressedPageSizes{};
64 for (
const auto &clusterDescriptor :
fDescriptor.GetClusterIterable()) {
65 if (!clusterDescriptor.ContainsColumn(colId)) {
69 auto columnRange = clusterDescriptor.GetColumnRange(colId);
70 if (columnRange.IsSuppressed())
73 nElems += columnRange.GetNElements();
84 std::to_string(*columnRange.GetCompressionSettings()) +
85 ") for column with physical ID " + std::to_string(colId)));
88 const auto &pageRange = clusterDescriptor.GetPageRange(colId);
90 for (
const auto &page : pageRange.GetPageInfos()) {
91 compressedPageSizes.emplace_back(page.GetLocator().GetNBytesOnStorage());
97 std::accumulate(compressedPageSizes.begin(), compressedPageSizes.end(),
static_cast<std::uint64_t
>(0));
105 std::uint64_t compressedSize = 0;
106 std::uint64_t uncompressedSize = 0;
108 for (
const auto &colDescriptor :
fDescriptor.GetColumnIterable(fieldId)) {
110 compressedSize += colInfo.GetCompressedSize();
111 uncompressedSize += colInfo.GetUncompressedSize();
114 for (
const auto &subFieldDescriptor :
fDescriptor.GetFieldIterable(fieldId)) {
115 auto subFieldId = subFieldDescriptor.GetId();
119 compressedSize += subFieldInfo.GetCompressedSize();
120 uncompressedSize += subFieldInfo.GetUncompressedSize();
128std::vector<ROOT::DescriptorId_t>
131 std::vector<ROOT::DescriptorId_t> colIds;
132 std::deque<ROOT::DescriptorId_t> fieldIdQueue{fieldId};
134 while (!fieldIdQueue.empty()) {
135 auto currId = fieldIdQueue.front();
136 fieldIdQueue.pop_front();
138 for (
const auto &col :
fDescriptor.GetColumnIterable(currId)) {
139 if (col.IsAliasColumn()) {
143 colIds.emplace_back(col.GetPhysicalId());
146 for (
const auto &fld :
fDescriptor.GetFieldIterable(currId)) {
147 fieldIdQueue.push_back(fld.GetId());
154std::unique_ptr<ROOT::Experimental::RNTupleInspector>
158 return std::unique_ptr<RNTupleInspector>(
new RNTupleInspector(std::move(pageSource)));
161std::unique_ptr<ROOT::Experimental::RNTupleInspector>
165 return std::unique_ptr<RNTupleInspector>(
new RNTupleInspector(std::move(pageSource)));
177 " (level " + std::to_string(level) +
")";
185 if (physicalColumnId >
fDescriptor.GetNPhysicalColumns()) {
186 throw RException(
R__FAIL(
"No column with physical ID " + std::to_string(physicalColumnId) +
" present"));
194 size_t typeCount = 0;
197 if (colInfo.GetType() == colType) {
205std::vector<ROOT::DescriptorId_t>
208 std::vector<ROOT::DescriptorId_t> colIds;
211 if (colInfo.GetType() == colType)
212 colIds.emplace_back(colId);
220 std::set<ROOT::ENTupleColumnType> colTypes;
223 colTypes.emplace(colInfo.GetType());
226 return std::vector(colTypes.begin(), colTypes.end());
231 struct ColumnTypeInfo {
232 std::uint64_t nElems = 0;
233 std::uint64_t compressedSize = 0;
234 std::uint64_t uncompressedSize = 0;
235 std::uint64_t nPages = 0;
236 std::uint32_t count = 0;
250 if (compressedSize == 0)
252 return static_cast<float>(uncompressedSize) /
static_cast<float>(compressedSize);
256 std::map<ENTupleColumnType, ColumnTypeInfo> colTypeInfo;
260 colTypeInfo[colInfo.GetType()] += colInfo;
265 output <<
" column type | count | # elements | compressed bytes | uncompressed bytes | compression ratio | "
267 <<
"----------------|---------|-------------|------------------|--------------------|-------------------|-"
270 for (
const auto &[colType, typeInfo] : colTypeInfo)
272 << typeInfo.count <<
" |" << std::setw(12) << typeInfo.nElems <<
" |" << std::setw(17)
273 << typeInfo.compressedSize <<
" |" << std::setw(19) << typeInfo.uncompressedSize <<
" |" << std::fixed
274 << std::setprecision(3) << std::setw(18) << typeInfo.GetCompressionFactor() <<
" |" << std::setw(6)
275 << typeInfo.nPages <<
" " << std::endl;
278 output <<
"columnType,count,nElements,compressedSize,uncompressedSize,compressionFactor,nPages" << std::endl;
279 for (
const auto &[colType, typeInfo] : colTypeInfo) {
281 <<
"," << typeInfo.compressedSize <<
"," << typeInfo.uncompressedSize <<
"," << std::fixed
282 << std::setprecision(3) << typeInfo.GetCompressionFactor() <<
"," << typeInfo.nPages << std::endl;
285 default:
R__ASSERT(
false &&
"Invalid print format");
291 std::string_view histName, std::string_view histTitle)
293 if (histName.empty()) {
303 if (histTitle.empty()) {
313 auto hist = std::make_unique<TH1D>(std::string(histName).c_str(), std::string(histTitle).c_str(), 1, 0, 1);
333 std::string histName, std::string histTitle,
size_t nBins)
335 if (histTitle.empty())
336 histTitle =
"Page size distribution for column with ID " + std::to_string(physicalColumnId);
342 std::string histName,
343 std::string histTitle,
size_t nBins)
345 if (histName.empty())
347 if (histTitle.empty())
353 if (perTypeHist->GetNhists() < 1)
354 return std::make_unique<TH1D>(histName.c_str(), histTitle.c_str(), 64, 0, 0);
356 auto hist = std::unique_ptr<TH1D>(
dynamic_cast<TH1D *
>(perTypeHist->GetHists()->First()));
358 hist->SetName(histName.c_str());
359 hist->SetTitle(histTitle.c_str());
360 hist->SetXTitle(
"Page size (B)");
361 hist->SetYTitle(
"N_{pages}");
367 std::string histName, std::string histTitle,
size_t nBins)
369 auto hist = std::make_unique<TH1D>();
371 if (histName.empty())
372 histName =
"pageSizeHist";
373 hist->SetName(histName.c_str());
374 if (histTitle.empty())
375 histTitle =
"Page size distribution";
376 hist->SetTitle(histTitle.c_str());
377 hist->SetXTitle(
"Page size (B)");
378 hist->SetYTitle(
"N_{pages}");
380 std::vector<std::uint64_t> pageSizes;
381 std::for_each(colIds.begin(), colIds.end(), [
this, &pageSizes](
const auto colId) {
382 auto colInfo = GetColumnInspector(colId);
383 pageSizes.insert(pageSizes.end(), colInfo.GetCompressedPageSizes().begin(),
384 colInfo.GetCompressedPageSizes().end());
387 if (!pageSizes.empty()) {
388 auto histMinMax = std::minmax_element(pageSizes.begin(), pageSizes.end());
389 hist->SetBins(nBins, *histMinMax.first,
390 *histMinMax.second + ((*histMinMax.second - *histMinMax.first) /
static_cast<double>(nBins)));
392 for (
const auto pageSize : pageSizes) {
393 hist->Fill(pageSize);
400std::unique_ptr<THStack>
402 std::string histName, std::string histTitle,
size_t nBins)
404 if (histName.empty())
405 histName =
"pageSizeHist";
406 if (histTitle.empty())
407 histTitle =
"Per-column type page size distribution";
409 auto stackedHist = std::make_unique<THStack>(histName.c_str(), histTitle.c_str());
411 double histMin = std::numeric_limits<double>::max();
413 std::map<ROOT::ENTupleColumnType, std::vector<std::uint64_t>> pageSizes;
415 std::vector<ROOT::ENTupleColumnType> colTypeVec = colTypes;
416 if (std::empty(colTypes)) {
420 for (
const auto colType : colTypeVec) {
426 std::vector<std::uint64_t> pageSizesForColType;
427 std::for_each(colIds.cbegin(), colIds.cend(), [
this, &pageSizesForColType](
const auto colId) {
428 auto colInfo = GetColumnInspector(colId);
429 pageSizesForColType.insert(pageSizesForColType.end(), colInfo.GetCompressedPageSizes().begin(),
430 colInfo.GetCompressedPageSizes().end());
432 if (pageSizesForColType.empty())
435 pageSizes.emplace(colType, pageSizesForColType);
437 auto histMinMax = std::minmax_element(pageSizesForColType.begin(), pageSizesForColType.end());
438 histMin = std::min(histMin,
static_cast<double>(*histMinMax.first));
439 histMax = std::max(histMax,
static_cast<double>(*histMinMax.second));
442 for (
const auto &[colType, pageSizesForColType] : pageSizes) {
443 auto hist = std::make_unique<TH1D>(
446 histMax + ((histMax - histMin) /
static_cast<double>(nBins)));
448 for (
const auto pageSize : pageSizesForColType) {
449 hist->Fill(pageSize);
452 stackedHist->Add(hist.release());
464 throw RException(
R__FAIL(
"No field with ID " + std::to_string(fieldId) +
" present"));
476 throw RException(
R__FAIL(
"Could not find field `" + std::string(fieldName) +
"`"));
483 bool includeSubfields)
const
485 size_t typeCount = 0;
488 if (!includeSubfields && fldInfo.GetDescriptor().GetParentId() !=
fDescriptor.GetFieldZeroId()) {
492 if (std::regex_match(fldInfo.GetDescriptor().GetTypeName(), typeNamePattern)) {
500std::vector<ROOT::DescriptorId_t>
503 std::vector<ROOT::DescriptorId_t> fieldIds;
507 if (!searchInSubfields && fldInfo.GetDescriptor().GetParentId() !=
fDescriptor.GetFieldZeroId()) {
511 if (std::regex_match(fldInfo.GetDescriptor().GetFieldName(), fieldNamePattern)) {
512 fieldIds.emplace_back(fldId);
520 std::ostream &output)
const
525 output <<
"digraph D {\n";
526 output <<
"node[shape=box]\n";
528 const std::string &nodeId = (isZeroField) ?
"0" : std::to_string(fieldDescriptor.
GetId() + 1);
532 auto htmlEscape = [&](
const std::string &in) -> std::string {
534 out.reserve(in.size());
535 for (
const char &
c : in) {
537 case '&': out +=
"&";
break;
538 case '<': out +=
"<";
break;
539 case '>': out +=
">";
break;
540 case '\"': out +=
""";
break;
541 case '\'': out +=
"'";
break;
542 default: out +=
c;
break;
548 output << nodeId <<
"[label=<";
550 output <<
"<b>Name: </b>" << htmlEscape(fieldDescriptor.
GetFieldName()) <<
"<br></br>";
551 output <<
"<b>Type: </b>" << htmlEscape(fieldDescriptor.
GetTypeName()) <<
"<br></br>";
552 output <<
"<b>ID: </b>" << std::to_string(fieldDescriptor.
GetId()) <<
"<br></br>";
553 if (description !=
"")
554 output <<
"<b>Description: </b>" << htmlEscape(description) <<
"<br></br>";
556 output <<
"<b>Version: </b>" << version <<
"<br></br>";
558 output <<
"<b>RFieldZero</b>";
560 for (
const auto &childFieldId : fieldDescriptor.
GetLinkIds()) {
561 const auto &childFieldDescriptor = tupleDescriptor.GetFieldDescriptor(childFieldId);
562 output << nodeId +
"->" + std::to_string(childFieldDescriptor.GetId() + 1) +
"\n";
#define R__FAIL(msg)
Short-hand to return an RResult<T> in an error state; the RError is implicitly converted into RResult...
#define R__ASSERT(e)
Checks condition e and reports a fatal error if it's false.
std::string & operator+=(std::string &left, const TString &right)
Provides column-level storage information.
std::uint64_t GetCompressedSize() const
std::uint64_t GetUncompressedSize() const
std::uint64_t GetNPages() const
std::uint64_t GetNElements() const
Provides field-level storage information.
float GetCompressionFactor() const
Get the compression factor of the RNTuple being inspected.
std::vector< ROOT::DescriptorId_t > GetFieldsByName(const std::regex &fieldNamePattern, bool searchInSubfields=true) const
Get the IDs of (sub-)fields whose name matches the given regular expression.
const RFieldTreeInspector & GetFieldTreeInspector(ROOT::DescriptorId_t fieldId) const
Get storage information for a given (sub)field by ID.
std::unique_ptr< TH1D > GetPageSizeDistribution(ROOT::DescriptorId_t physicalColumnId, std::string histName="", std::string histTitle="", size_t nBins=64)
Get a histogram containing the size distribution of the compressed pages for an individual column.
const ROOT::RNTupleDescriptor & GetDescriptor() const
Get the descriptor for the RNTuple being inspected.
size_t GetColumnCountByType(ROOT::ENTupleColumnType colType) const
Get the number of columns of a given type present in the RNTuple.
std::optional< std::uint32_t > fCompressionSettings
The compression settings are unknown for an empty ntuple.
std::vector< ROOT::ENTupleColumnType > GetColumnTypes()
Get all column types present in the RNTuple being inspected.
size_t GetFieldCountByType(const std::regex &typeNamePattern, bool searchInSubfields=true) const
Get the number of fields of a given type or class present in the RNTuple.
std::vector< ROOT::DescriptorId_t > GetColumnsByType(ROOT::ENTupleColumnType colType)
Get the IDs of all columns with the given type.
std::string GetCompressionSettingsAsString() const
Get a string describing compression settings of the RNTuple being inspected.
RFieldTreeInspector CollectFieldTreeInfo(ROOT::DescriptorId_t fieldId)
Recursively gather field-level information.
std::uint64_t fCompressedSize
RNTupleInspector(std::unique_ptr< ROOT::Internal::RPageSource > pageSource)
void PrintColumnTypeInfo(ENTupleInspectorPrintFormat format=ENTupleInspectorPrintFormat::kTable, std::ostream &output=std::cout)
Print storage information per column type.
std::uint64_t fUncompressedSize
const RColumnInspector & GetColumnInspector(ROOT::DescriptorId_t physicalColumnId) const
Get storage information for a given column.
std::unique_ptr< ROOT::Internal::RPageSource > fPageSource
static std::unique_ptr< RNTupleInspector > Create(const RNTuple &sourceNTuple)
Create a new RNTupleInspector.
std::unordered_map< int, RFieldTreeInspector > fFieldTreeInfo
void CollectColumnInfo()
Gather column-level and RNTuple-level information.
std::unordered_map< int, RColumnInspector > fColumnInfo
void PrintFieldTreeAsDot(const ROOT::RFieldDescriptor &fieldDescriptor, std::ostream &output=std::cout) const
Print a .dot string that represents the tree of the (sub)fields of an RNTuple.
std::vector< ROOT::DescriptorId_t > GetAllColumnsOfField(ROOT::DescriptorId_t fieldId) const
Get the columns that make up the given field, including its subfields.
std::unique_ptr< TH1D > GetColumnTypeInfoAsHist(ENTupleInspectorHist histKind, std::string_view histName="", std::string_view histTitle="")
Get a histogram showing information for each column type present.
ROOT::RNTupleDescriptor fDescriptor
A column element encapsulates the translation between basic C++ types and their column representation...
static const char * GetColumnTypeName(ROOT::ENTupleColumnType type)
static std::unique_ptr< RColumnElementBase > Generate(ROOT::ENTupleColumnType type)
If CppT == void, use the default C++ type for the given column type.
static std::unique_ptr< RPageSourceFile > CreateFromAnchor(const RNTuple &anchor, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Used from the RNTuple class to build a datasource if the anchor is already available.
static std::unique_ptr< RPageSource > Create(std::string_view ntupleName, std::string_view location, const ROOT::RNTupleReadOptions &options=ROOT::RNTupleReadOptions())
Guess the concrete derived page source from the file name (location).
Base class for all ROOT issued exceptions.
Metadata stored for every field of an RNTuple.
ROOT::DescriptorId_t GetId() const
std::uint32_t GetFieldVersion() const
const std::vector< ROOT::DescriptorId_t > & GetLinkIds() const
ROOT::DescriptorId_t GetParentId() const
const std::string & GetFieldDescription() const
const std::string & GetFieldName() const
const std::string & GetTypeName() const
Representation of an RNTuple data set in a ROOT file.
1-D histogram with a double per channel (see TH1 documentation)
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
ENTupleInspectorPrintFormat
std::uint64_t DescriptorId_t
Distinguishes elements of the same type within a descriptor, e.g. different fields.
constexpr DescriptorId_t kInvalidDescriptorId
EValues
Note: this is only temporarily a struct and will become an enum class, hence the naming convention used.
static std::string AlgorithmToString(EAlgorithm::EValues algorithm)