98std::string RCsvDS::AsString()
100 return "CSV data source";
104const TRegexp RCsvDS::fgIntRegex(
"^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1(
"^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2(
"^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3(
"^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex(
"^true$");
109const TRegexp RCsvDS::fgFalseRegex(
"^false$");
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{
'O',
"bool"}, {
'D',
"double"}, {
'L',
"Long64_t"}, {
'T',
"std::string"}});
114bool RCsvDS::Readln(std::string &
line)
117 const auto N = s.size();
127 for (
auto itr = s.rbegin();
itr != s.rend() && std::isspace(*
itr); ++
itr, ++
nTrim)
130 s.resize(s.size() -
nTrim);
159void RCsvDS::RewindToData()
165void RCsvDS::FillHeaders(
const std::string &
line)
172 " column names for a CSV file containing " + std::to_string(
columns.size()) +
" columns!";
173 throw std::runtime_error(
msg);
196 record.emplace_back(
new double((col !=
"nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
212 std::istringstream(col) >> std::boolalpha >> *
b;
220 record.emplace_back(
new std::string(col));
228void RCsvDS::GenerateHeaders(
size_t size)
233 " column names for a CSV file containing " + std::to_string(
size) +
" columns!";
234 throw std::runtime_error(
msg);
241 for (
size_t i = 0
u; i <
size; ++i) {
242 fHeaders.push_back(
"Col" + std::to_string(i));
246std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view
colName,
const std::type_info &
ti)
251 (
colType ==
'T' &&
typeid(std::string) !=
ti) || (
colType ==
'O' &&
typeid(
bool) !=
ti)) {
252 std::string err =
"The type selected for column \"";
254 err +=
"\" does not correspond to column type, which is ";
256 throw std::runtime_error(err);
264 if (
ti ==
typeid(
double)) {
268 }
else if (
ti ==
typeid(std::string)) {
278void RCsvDS::ValidateColTypes(std::vector<std::string> &
columns)
const
282 std::string
msg =
"There is no column with name \"" + col.first +
"\".";
284 msg +=
"\nSince the input csv file does not contain headers, valid column names";
285 msg +=
" are [\"Col0\", ..., \"Col" + std::to_string(
columns.size() - 1) +
"\"].";
287 throw std::runtime_error(
msg);
289 if (std::string(
"ODLT").find(col.second) == std::string::npos) {
290 std::string
msg =
"Type alias '" + std::string(1, col.second) +
"' is not supported.\n";
291 msg +=
"Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
292 throw std::runtime_error(
msg);
297void RCsvDS::InferColTypes(std::vector<std::string> &
columns)
301 for (
auto i = 0
u; i <
columns.size(); ++i) {
330void RCsvDS::InferType(
const std::string &col,
unsigned int idxCol)
351std::vector<std::string> RCsvDS::ParseColumns(
const std::string &
line)
353 std::vector<std::string>
columns;
355 for (
size_t i = 0; i <
line.size(); ++i) {
362size_t RCsvDS::ParseValue(
const std::string &
line, std::vector<std::string> &
columns,
size_t i)
368 for (; i <
line.size(); ++i) {
371 }
else if (
line[i] ==
'"') {
373 if (
line[i + 1] !=
'"') {
383 if (
prevPos == i || val ==
"nan" || val ==
"NaN")
386 columns.emplace_back(std::move(val));
396void RCsvDS::Construct()
407 std::string
msg =
"Error: too many footer lines to skip in CSV file ";
409 throw std::runtime_error(
msg);
426 std::string
msg =
"Error reading headers of CSV file ";
428 throw std::runtime_error(
msg);
451 std::string
msg =
"Could not infer column types of CSV file ";
453 throw std::runtime_error(
msg);
461RCsvDS::RCsvDS(std::string_view fileName,
const ROptions &options)
462 : fOptions(options), fCsvFile(
ROOT::Internal::
RRawFile::Create(fileName))
480 std::unordered_map<std::string, char> &&
colTypes)
490void RCsvDS::FreeRecords()
493 for (
size_t i = 0; i <
record.size(); ++i) {
498 delete static_cast<double *
>(
p);
506 delete static_cast<bool *
>(
p);
510 delete static_cast<std::string *
>(
p);
526void RCsvDS::Finalize()
534const std::vector<std::string> &RCsvDS::GetColumnNames()
const
539std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
553 std::string
msg =
"";
556 msg +=
"Column \"" + col +
"\" of type " +
colT +
" contains empty cell(s) or NaN(s).\n";
557 msg +=
"There is no `nan` equivalent for type " +
colT +
", hence ";
558 msg += std::string(
colT ==
"Long64_t" ?
"`0`" :
"`false`") +
" is stored.\n";
560 msg +=
"Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
566 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
568 Info(
"GetEntryRanges",
"Attempted to read chunk of %" PRId64 " lines of CSV file into memory, %zu lines read",
573 std::vector<std::pair<ULong64_t, ULong64_t>>
entryRanges;
600 std::string
msg =
"The dataset does not have column ";
602 throw std::runtime_error(
msg);
608std::string RCsvDS::GetTypeName(std::string_view
colName)
const
613bool RCsvDS::HasColumn(std::string_view
colName)
const
649void RCsvDS::SetNSlots(
unsigned int nSlots)
651 assert(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
666std::string RCsvDS::GetLabel()
678 std::unordered_map<std::string, char> &&
colTypes)
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
unsigned long long ULong64_t
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
The RRawFile provides read-only access to local and remote files.
std::int64_t fDataLineNumber
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
void FillRecord(const std::string &, Record_t &)
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
bool Readln(std::string &line)
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
std::int64_t fMaxLineNumber
ULong64_t fProcessedLines
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void InferColTypes(std::vector< std::string > &)
std::unordered_map< std::string, ColType_t > fColTypes
std::vector< std::vector< Long64_t > > fLong64EvtValues
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
std::set< std::string > fColContainingEmpty
static const TRegexp fgFalseRegex
static const TRegexp fgDoubleRegex3
void ValidateColTypes(std::vector< std::string > &) const
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
const_iterator begin() const
const_iterator end() const
Regular expression class.
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
TSeq< unsigned int > TSeqU
Options that control how the CSV file is parsed.
bool fHeaders
The first line describes the columns.
bool fRightTrim
Trailing whitespaces are removed.
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
std::vector< std::string > fColumnNames
Impose column names.
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
char fDelimiter
Column delimiter character.
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
bool fLeftTrim
Leading whitespaces are removed.
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.