100 return "CSV data source";
// Lookup table from each one-character column type alias to the name of the C++ type it denotes:
// 'O' -> "bool", 'D' -> "double", 'L' -> "Long64_t", 'T' -> "std::string".
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{
'O',
"bool"}, {
'D',
"double"}, {
'L',
"Long64_t"}, {
'T',
"std::string"}});
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart <
N && std::isspace(s[idxStart]); ++idxStart)
122 s.erase(0, idxStart);
125 auto fnRightTrim = [](std::string &s) {
127 for (
auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
130 s.resize(s.size() - nTrim);
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
169 if (!
fOptions.fColumnNames.empty()) {
170 if (
fOptions.fColumnNames.size() != columns.size()) {
171 auto msg = std::string(
"Error: passed ") + std::to_string(
fOptions.fColumnNames.size()) +
172 " column names for a CSV file containing " + std::to_string(columns.size()) +
" columns!";
173 throw std::runtime_error(msg);
180 for (
auto &col : columns) {
191 for (
auto &col : columns) {
196 record.emplace_back(
new double((col !=
"nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
201 record.emplace_back(
new Long64_t(std::stoll(col)));
204 record.emplace_back(
new Long64_t(0));
210 record.emplace_back(
b);
212 std::istringstream(col) >> std::boolalpha >> *
b;
220 record.emplace_back(
new std::string(col));
230 if (!
fOptions.fColumnNames.empty()) {
232 auto msg = std::string(
"Error: passed ") + std::to_string(
fOptions.fColumnNames.size()) +
233 " column names for a CSV file containing " + std::to_string(
size) +
" columns!";
234 throw std::runtime_error(msg);
241 for (
size_t i = 0u; i <
size; ++i) {
242 fHeaders.push_back(
"Col" + std::to_string(i));
255 std::string msg =
"There is no column with name \"" + col.first +
"\".";
257 msg +=
"\nSince the input csv file does not contain headers, valid column names";
258 msg +=
" are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) +
"\"].";
260 throw std::runtime_error(msg);
262 if (std::string(
"ODLT").find(col.second) == std::string::npos) {
263 std::string msg =
"Type alias '" + std::string(1, col.second) +
"' is not supported.\n";
264 msg +=
"Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
265 throw std::runtime_error(msg);
272 const auto second_line =
fCsvFile->GetFilePos();
274 for (
auto i = 0u; i < columns.size(); ++i) {
276 if (userSpecifiedType !=
fColTypes.end()) {
282 for (
auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] ==
"nan"; ++extraRowsRead) {
287 if (temp_columns[i] !=
"nan")
288 columns[i] = temp_columns[i];
293 if (columns[i] ==
"nan") {
326 std::vector<std::string> columns;
328 for (
size_t i = 0; i <
line.size(); ++i) {
339 const size_t prevPos = i;
341 for (; i <
line.size(); ++i) {
344 }
else if (
line[i] ==
'"') {
346 if (
line[i + 1] !=
'"') {
356 if (prevPos == i || val ==
"nan" || val ==
"NaN")
357 columns.emplace_back(
"nan");
359 columns.emplace_back(std::move(val));
364 columns.emplace_back(
"nan");
375 std::int64_t nLines = 0;
379 if (nLines <
fOptions.fSkipLastNLines) {
380 std::string msg =
"Error: too many footer lines to skip in CSV file ";
382 throw std::runtime_error(msg);
388 for (std::int64_t i = 0; i <
fOptions.fSkipFirstNLines; ++i) {
399 std::string msg =
"Error reading headers of CSV file ";
401 throw std::runtime_error(msg);
424 std::string msg =
"Could not infer column types of CSV file ";
426 throw std::runtime_error(msg);
453 std::unordered_map<std::string, char> &&colTypes)
458 fOptions.fLinesChunkSize = linesChunkSize;
466 for (
size_t i = 0; i < record.size(); ++i) {
471 delete static_cast<double *
>(p);
479 delete static_cast<bool *
>(p);
483 delete static_cast<std::string *
>(p);
515 auto linesToRead =
fOptions.fLinesChunkSize;
526 std::string msg =
"";
529 msg +=
"Column \"" + col +
"\" of type " + colT +
" contains empty cell(s) or NaN(s).\n";
530 msg +=
"There is no `nan` equivalent for type " + colT +
", hence ";
531 msg += std::string(colT ==
"Long64_t" ?
"`0`" :
"`false`") +
" is stored.\n";
533 msg +=
"Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
534 Warning(
"RCsvDS",
"%s", msg.c_str());
538 if (
fOptions.fLinesChunkSize == -1LL) {
539 Info(
"GetEntryRanges",
"Attempted to read entire CSV file into memory, %zu lines read",
fRecords.size());
541 Info(
"GetEntryRanges",
"Attempted to read chunk of %" PRId64
" lines of CSV file into memory, %zu lines read",
546 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
547 const auto nRecords =
fRecords.size();
551 const auto chunkSize = nRecords /
fNSlots;
559 entryRanges.emplace_back(
start, end);
562 entryRanges.back().second += remainder;
573 std::string msg =
"The dataset does not have column ";
575 throw std::runtime_error(msg);
595 const auto recordPos = entry - offset;
598 auto dataPtr =
fRecords[recordPos][colIndex];
624 assert(0U ==
fNSlots &&
"Setting the number of slots even if the number of slots is different from zero.");
628 const auto nColumns =
fHeaders.size();
651 std::unordered_map<std::string, char> &&colTypes)
654 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
662std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
665 const auto colType =
GetType(colName);
667 if ((colType ==
'D' &&
typeid(
double) != tid) || (colType ==
'L' &&
typeid(
Long64_t) != tid) ||
668 (colType ==
'T' &&
typeid(std::string) != tid) || (colType ==
'O' &&
typeid(
bool) != tid)) {
669 std::string
err =
"The type selected for column \"";
671 err +=
"\" does not correspond to column type, which is ";
673 throw std::runtime_error(
err);
677 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
680 if (tid ==
typeid(
double)) {
682 }
else if (tid ==
typeid(
Long64_t)) {
684 }
else if (tid ==
typeid(std::string)) {
690 return std::make_unique<ROOT::Internal::RDF::RCsvDSColumnReader>(val);
size_t size(const MatrixT &matrix)
Retrieve the size of a square matrix.
long long Long64_t
Portable signed long integer 8 bytes.
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
The RRawFile provides read-only access to local and remote files.
std::int64_t fDataLineNumber
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g. "double" or "std::string".
void FillRecord(const std::string &, Record_t &)
void Finalize() final
Convenience method called after concluding an event-loop.
ColType_t GetType(std::string_view colName) const
std::vector< std::vector< double > > fDoubleEvtValues
void InferType(const std::string &, unsigned int)
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
static const TRegexp fgTrueRegex
void GenerateHeaders(size_t)
std::vector< std::vector< void * > > fColAddresses
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
std::string AsString() final
bool Readln(std::string &line)
std::vector< std::string > fHeaders
ULong64_t fEntryRangesRequested
std::int64_t fMaxLineNumber
ULong64_t fProcessedLines
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
void InferColTypes(std::vector< std::string > &)
std::unordered_map< std::string, ColType_t > fColTypes
std::vector< std::vector< Long64_t > > fLong64EvtValues
RCsvDS(std::string_view fileName, const ROptions &options)
Constructor to create a CSV RDataSource for RDataFrame.
static const TRegexp fgDoubleRegex2
std::vector< Record_t > fRecords
std::set< std::string > fColContainingEmpty
~RCsvDS() final
Destructor.
static const TRegexp fgFalseRegex
static const TRegexp fgDoubleRegex3
void ValidateColTypes(std::vector< std::string > &) const
static const TRegexp fgIntRegex
std::vector< std::string > ParseColumns(const std::string &)
void FillHeaders(const std::string &)
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
std::string GetLabel() final
Return a string representation of the datasource type.
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
Type-erased vector of pointers to pointers to column values, one per slot.
static const TRegexp fgDoubleRegex1
std::vector< std::vector< std::string > > fStringEvtValues
std::vector< std::deque< bool > > fBoolEvtValues
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
std::list< ColType_t > fColTypesList
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, CSV and other data formats, in C++ or Python.
Regular expression class.
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
TSeq< unsigned int > TSeqU
Options that control how the CSV file is parsed.