98std::string RCsvDS::AsString()
100 return "CSV data source";
104const TRegexp RCsvDS::fgIntRegex(
"^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1(
"^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2(
"^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3(
"^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex(
"^true$");
109const TRegexp RCsvDS::fgFalseRegex(
"^false$");
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{
'O',
"bool"}, {
'D',
"double"}, {
'L',
"Long64_t"}, {
'T',
"std::string"}});
114bool RCsvDS::Readln(std::string &
line)
117 const auto N = s.size();
127 for (
auto itr = s.rbegin();
itr != s.rend() && std::isspace(*
itr); ++
itr, ++
nTrim)
130 s.resize(s.size() -
nTrim);
159void RCsvDS::RewindToData()
165void RCsvDS::FillHeaders(
const std::string &
line)
172 " column names for a CSV file containing " + std::to_string(
columns.size()) +
" columns!";
173 throw std::runtime_error(
msg);
196 record.emplace_back(
new double((col !=
"nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
212 std::istringstream(col) >> std::boolalpha >> *
b;
220 record.emplace_back(
new std::string(col));
228void RCsvDS::GenerateHeaders(
size_t size)
233 " column names for a CSV file containing " + std::to_string(
size) +
" columns!";
234 throw std::runtime_error(
msg);
241 for (
size_t i = 0
u; i <
size; ++i) {
242 fHeaders.push_back(
"Col" + std::to_string(i));
246std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view,
const std::type_info &)
251void RCsvDS::ValidateColTypes(std::vector<std::string> &
columns)
const
255 std::string
msg =
"There is no column with name \"" + col.first +
"\".";
257 msg +=
"\nSince the input csv file does not contain headers, valid column names";
258 msg +=
" are [\"Col0\", ..., \"Col" + std::to_string(
columns.size() - 1) +
"\"].";
260 throw std::runtime_error(
msg);
262 if (std::string(
"ODLT").find(col.second) == std::string::npos) {
263 std::string
msg =
"Type alias '" + std::string(1, col.second) +
"' is not supported.\n";
264 msg +=
"Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
265 throw std::runtime_error(
msg);
270void RCsvDS::InferColTypes(std::vector<std::string> &
columns)
274 for (
auto i = 0
u; i <
columns.size(); ++i) {
303void RCsvDS::InferType(
const std::string &col,
unsigned int idxCol)
324std::vector<std::string> RCsvDS::ParseColumns(
const std::string &
line)
326 std::vector<std::string>
columns;
328 for (
size_t i = 0; i <
line.size(); ++i) {
335size_t RCsvDS::ParseValue(
const std::string &
line, std::vector<std::string> &
columns,
size_t i)
341 for (; i <
line.size(); ++i) {
344 }
else if (
line[i] ==
'"') {
346 if (
line[i + 1] !=
'"') {
356 if (
prevPos == i || val ==
"nan" || val ==
"NaN")
359 columns.emplace_back(std::move(val));
369void RCsvDS::Construct()
380 std::string
msg =
"Error: too many footer lines to skip in CSV file ";
382 throw std::runtime_error(
msg);
399 std::string
msg =
"Error reading headers of CSV file ";
401 throw std::runtime_error(
msg);
424 std::string
msg =
"Could not infer column types of CSV file ";
426 throw std::runtime_error(
msg);
434RCsvDS::RCsvDS(std::string_view fileName,
const ROptions &options)
435 : fOptions(options), fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName))
453 std::unordered_map<std::string, char> &&
colTypes)
463void RCsvDS::FreeRecords()
466 for (
size_t i = 0; i <
record.size(); ++i) {
471 delete static_cast<double *
>(
p);
479 delete static_cast<bool *
>(
p);
483 delete static_cast<std::string *
>(
p);
499void RCsvDS::Finalize()
507const std::vector<std::string> &RCsvDS::GetColumnNames()
const
512std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
526 std::string
msg =
"";
529 msg +=
"Column \"" + col +
"\" of type " +
colT +
" contains empty cell(s) or NaN(s).\n";
530 msg +=
"There is no `nan` equivalent for type " +
colT +
", hence ";
531 msg += std::string(
colT ==
"Long64_t" ?
"`0`" :
"`false`") +
" is stored.\n";
533 msg +=
"Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
539 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
541 Info(
"GetEntryRanges",
"Attempted to read chunk of %" PRId64 " lines of CSV file into memory, %zu lines read",
546 std::vector<std::pair<ULong64_t, ULong64_t>>
entryRanges;
573 std::string
msg =
"The dataset does not have column ";
575 throw std::runtime_error(
msg);
581std::string RCsvDS::GetTypeName(std::string_view
colName)
const
586bool RCsvDS::HasColumn(std::string_view
colName)
const
622void RCsvDS::SetNSlots(
unsigned int nSlots)
624 assert(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
639std::string RCsvDS::GetLabel()
651 std::unordered_map<std::string, char> &&
colTypes)
662std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
669 std::string err =
"The type selected for column \"";
671 err +=
"\" does not correspond to column type, which is ";
673 throw std::runtime_error(err);
680 if (
tid ==
typeid(
double)) {
684 }
else if (
tid ==
typeid(std::string)) {
690 return std::make_unique<ROOT::Internal::RDF::RCsvDSColumnReader>(val);
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Portable signed long integer 8 bytes.
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Global variable setting the debug level. Set to 0 to disable, increase it in steps of 1 to increase t...
The RRawFile provides read-only access to local and remote files.
std::int64_t fDataLineNumber
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
void FillRecord(const std::string &, Record_t &)
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
bool Readln(std::string &line)
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
std::int64_t fMaxLineNumber
ULong64_t fProcessedLines
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void InferColTypes(std::vector< std::string > &)
std::unordered_map< std::string, ColType_t > fColTypes
std::vector< std::vector< Long64_t > > fLong64EvtValues
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
std::set< std::string > fColContainingEmpty
static const TRegexp fgFalseRegex
static const TRegexp fgDoubleRegex3
void ValidateColTypes(std::vector< std::string > &) const
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
const_iterator begin() const
const_iterator end() const
Regular expression class.
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Namespace for new ROOT classes and functions.
TSeq< unsigned int > TSeqU
Options that control how the CSV file is parsed.
bool fHeaders
The first line describes the columns.
bool fRightTrim
Trailing whitespaces are removed.
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
std::vector< std::string > fColumnNames
Impose column names.
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
char fDelimiter
Column delimiter character.
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
bool fLeftTrim
Leading whitespaces are removed.
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.