Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18An RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::FromCSV (ROOT::RDF::MakeCsvDataFrame is an equivalent alias), which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
71By default (chunk size -1), RCsvDS reads the entire CSV file content into memory before RDataFrame starts
72processing it; with a positive chunk size, only that many lines are held in memory at a time. Therefore, when
73reading the whole file at once, it is important to check both how much memory is available and the size of the CSV file.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as zeros.
78*/
79// clang-format on
80
81#include <ROOT/TSeq.hxx>
82#include <ROOT/RCsvDS.hxx>
83#include <ROOT/RRawFile.hxx>
84#include <TError.h>
85
86#include <algorithm>
87#include <memory>
88#include <sstream>
89#include <string>
90
91namespace ROOT {
92
93namespace RDF {
94
95std::string RCsvDS::AsString()
96{
97 return "CSV data source";
98}
99
100// Regular expressions for type inference
101const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
102const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
103const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
104const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
105const TRegexp RCsvDS::fgTrueRegex("^true$");
106const TRegexp RCsvDS::fgFalseRegex("^false$");
107
108const std::unordered_map<RCsvDS::ColType_t, std::string>
109 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
110
111void RCsvDS::FillHeaders(const std::string &line)
112{
113 auto columns = ParseColumns(line);
114 fHeaders.reserve(columns.size());
115 for (auto &col : columns) {
116 fHeaders.emplace_back(col);
117 }
118}
119
120void RCsvDS::FillRecord(const std::string &line, Record_t &record)
121{
122 auto i = 0U;
123
124 auto columns = ParseColumns(line);
125
126 for (auto &col : columns) {
127 auto colType = fColTypes[fHeaders[i]];
128
129 switch (colType) {
130 case 'D': {
131 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
132 break;
133 }
134 case 'L': {
135 if (col != "nan") {
136 record.emplace_back(new Long64_t(std::stoll(col)));
137 } else {
138 fColContainingEmpty.insert(fHeaders[i]);
139 record.emplace_back(new Long64_t(0));
140 }
141 break;
142 }
143 case 'O': {
144 auto b = new bool();
145 record.emplace_back(b);
146 if (col != "nan") {
147 std::istringstream(col) >> std::boolalpha >> *b;
148 } else {
149 fColContainingEmpty.insert(fHeaders[i]);
150 *b = false;
151 }
152 break;
153 }
154 case 'T': {
155 record.emplace_back(new std::string(col));
156 break;
157 }
158 }
159 ++i;
160 }
161}
162
164{
165 fHeaders.reserve(size);
166 for (size_t i = 0u; i < size; ++i) {
167 fHeaders.push_back("Col" + std::to_string(i));
168 }
169}
170
171std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
172{
173 const auto colType = GetType(colName);
174
175 if ((colType == 'D' && typeid(double) != ti) || (colType == 'L' && typeid(Long64_t) != ti) ||
176 (colType == 'T' && typeid(std::string) != ti) || (colType == 'O' && typeid(bool) != ti)) {
177 std::string err = "The type selected for column \"";
178 err += colName;
179 err += "\" does not correspond to column type, which is ";
180 err += fgColTypeMap.at(colType);
181 throw std::runtime_error(err);
182 }
183
184 const auto &colNames = GetColumnNames();
185 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
186 std::vector<void *> ret(fNSlots);
187 for (auto slot : ROOT::TSeqU(fNSlots)) {
188 auto &val = fColAddresses[index][slot];
189 if (ti == typeid(double)) {
190 val = &fDoubleEvtValues[index][slot];
191 } else if (ti == typeid(Long64_t)) {
192 val = &fLong64EvtValues[index][slot];
193 } else if (ti == typeid(std::string)) {
194 val = &fStringEvtValues[index][slot];
195 } else {
196 val = &fBoolEvtValues[index][slot];
197 }
198 ret[slot] = &val;
199 }
200 return ret;
201}
202
203void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
204{
205 for (const auto &col : fColTypes) {
206 if (!HasColumn(col.first)) {
207 std::string msg = "There is no column with name \"" + col.first + "\".";
208 if (!fReadHeaders) {
209 msg += "\nSince the input csv file does not contain headers, valid column names";
210 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
211 }
212 throw std::runtime_error(msg);
213 }
214 if (std::string("ODLT").find(col.second) == std::string::npos) {
215 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
216 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
217 throw std::runtime_error(msg);
218 }
219 }
220}
221
222void RCsvDS::InferColTypes(std::vector<std::string> &columns)
223{
224 const auto second_line = fCsvFile->GetFilePos();
225
226 for (auto i = 0u; i < columns.size(); ++i) {
227 const auto userSpecifiedType = fColTypes.find(fHeaders[i]);
228 if (userSpecifiedType != fColTypes.end()) {
229 fColTypesList.push_back(userSpecifiedType->second);
230 continue;
231 }
232
233 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
234 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
235 std::string line;
236 if (!fCsvFile->Readln(line))
237 break; // EOF
238 const auto temp_columns = ParseColumns(line);
239 if (temp_columns[i] != "nan")
240 columns[i] = temp_columns[i]; // will break the loop in the next iteration
241 }
242 // reset the reading from the second line, because the first line is already loaded in `columns`
243 fCsvFile->Seek(second_line);
244
245 if (columns[i] == "nan") {
246 // could not find a non-empty value, default to double
247 fColTypes[fHeaders[i]] = 'D';
248 fColTypesList.push_back('D');
249 } else {
250 InferType(columns[i], i);
251 }
252 }
253}
254
255void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
256{
258 int dummy;
259
260 if (fgIntRegex.Index(col, &dummy) != -1) {
261 type = 'L'; // Long64_t
262 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
263 fgDoubleRegex3.Index(col, &dummy) != -1) {
264 type = 'D'; // double
265 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
266 type = 'O'; // bool
267 } else { // everything else is a string
268 type = 'T'; // std::string
269 }
270 // TODO: Date
271
272 fColTypes[fHeaders[idxCol]] = type;
273 fColTypesList.push_back(type);
274}
275
276std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
277{
278 std::vector<std::string> columns;
279
280 for (size_t i = 0; i < line.size(); ++i) {
281 i = ParseValue(line, columns, i);
282 }
283
284 return columns;
285}
286
287size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
288{
289 std::string val;
290 bool quoted = false;
291 const size_t prevPos = i; // used to check if cell is empty
292
293 for (; i < line.size(); ++i) {
294 if (line[i] == fDelimiter && !quoted) {
295 break;
296 } else if (line[i] == '"') {
297 // Keep just one quote for escaped quotes, none for the normal quotes
298 if (line[i + 1] != '"') {
299 quoted = !quoted;
300 } else {
301 val += line[++i];
302 }
303 } else {
304 val += line[i];
305 }
306 }
307
308 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
309 columns.emplace_back("nan");
310 else
311 columns.emplace_back(std::move(val));
312
313 // if the line ends with the delimiter, we need to append the default column value
314 // for the _next_, last column that won't be parsed (because we are out of characters)
315 if (i == line.size() - 1 && line[i] == fDelimiter)
316 columns.emplace_back("nan");
317
318 return i;
319}
320
321////////////////////////////////////////////////////////////////////////
322/// Constructor to create a CSV RDataSource for RDataFrame.
323/// \param[in] fileName Path or URL of the CSV file.
324/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
325/// (default `true`).
326/// \param[in] delimiter Delimiter character (default ',').
327/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
328/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
329/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
330/// Long64_t, 'T' for std::string)
331RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
332 std::unordered_map<std::string, char> &&colTypes)
333 : fReadHeaders(readHeaders), fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fDelimiter(delimiter),
334 fLinesChunkSize(linesChunkSize), fColTypes(std::move(colTypes))
335{
336 std::string line;
337
338 // Read the headers if present
339 if (fReadHeaders) {
340 if (fCsvFile->Readln(line)) {
342 } else {
343 std::string msg = "Error reading headers of CSV file ";
344 msg += fileName;
345 throw std::runtime_error(msg);
346 }
347 }
348
349 fDataPos = fCsvFile->GetFilePos();
350 bool eof = false;
351 do {
352 eof = !fCsvFile->Readln(line);
353 } while (line.empty() && !eof);
354 if (!eof) {
355 auto columns = ParseColumns(line);
356
357 // Generate headers if not present
358 if (!fReadHeaders) {
359 GenerateHeaders(columns.size());
360 }
361
362 // Ensure user is trying to set types only of existing columns
363 ValidateColTypes(columns);
364
365 // Infer types of columns with first record
366 InferColTypes(columns);
367
368 // rewind
369 fCsvFile->Seek(fDataPos);
370 } else {
371 std::string msg = "Could not infer column types of CSV file ";
372 msg += fileName;
373 throw std::runtime_error(msg);
374 }
375}
376
378{
379 for (auto &record : fRecords) {
380 for (size_t i = 0; i < record.size(); ++i) {
381 void *p = record[i];
382 const auto colType = fColTypes[fHeaders[i]];
383 switch (colType) {
384 case 'D': {
385 delete static_cast<double *>(p);
386 break;
387 }
388 case 'L': {
389 delete static_cast<Long64_t *>(p);
390 break;
391 }
392 case 'O': {
393 delete static_cast<bool *>(p);
394 break;
395 }
396 case 'T': {
397 delete static_cast<std::string *>(p);
398 break;
399 }
400 }
401 }
402 }
403 fRecords.clear();
404}
405
406////////////////////////////////////////////////////////////////////////
407/// Destructor.
409{
410 FreeRecords();
411}
412
414{
415 fCsvFile->Seek(fDataPos);
416 fProcessedLines = 0ULL;
418 FreeRecords();
419}
420
421const std::vector<std::string> &RCsvDS::GetColumnNames() const
422{
423 return fHeaders;
424}
425
426std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
427{
428 // Read records and store them in memory
429 auto linesToRead = fLinesChunkSize;
430 FreeRecords();
431
432 std::string line;
433 while ((-1LL == fLinesChunkSize || 0 != linesToRead) && fCsvFile->Readln(line)) {
434 if (line.empty()) continue; // skip empty lines
435 fRecords.emplace_back();
436 FillRecord(line, fRecords.back());
437 --linesToRead;
438 }
439
440 if (!fColContainingEmpty.empty()) {
441 std::string msg = "";
442 for (const auto &col : fColContainingEmpty) {
443 const auto colT = GetTypeName(col);
444 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
445 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
446 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
447 }
448 msg += "Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
449 Warning("RCsvDS", "%s", msg.c_str());
450 }
451
452 if (gDebug > 0) {
453 if (fLinesChunkSize == -1LL) {
454 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
455 } else {
456 Info("GetEntryRanges", "Attempted to read chunk of %lld lines of CSV file into memory, %zu lines read", fLinesChunkSize, fRecords.size());
457 }
458 }
459
460 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
461 const auto nRecords = fRecords.size();
462 if (0 == nRecords)
463 return entryRanges;
464
465 const auto chunkSize = nRecords / fNSlots;
466 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
467 auto start = fProcessedLines;
468 auto end = start;
469
470 for (auto i : ROOT::TSeqU(fNSlots)) {
471 start = end;
472 end += chunkSize;
473 entryRanges.emplace_back(start, end);
474 (void)i;
475 }
476 entryRanges.back().second += remainder;
477
478 fProcessedLines += nRecords;
480
481 return entryRanges;
482}
483
484RCsvDS::ColType_t RCsvDS::GetType(std::string_view colName) const
485{
486 if (!HasColumn(colName)) {
487 std::string msg = "The dataset does not have column ";
488 msg += colName;
489 throw std::runtime_error(msg);
490 }
491
492 return fColTypes.at(colName.data());
493}
494
495std::string RCsvDS::GetTypeName(std::string_view colName) const
496{
497 return fgColTypeMap.at(GetType(colName));
498}
499
500bool RCsvDS::HasColumn(std::string_view colName) const
501{
502 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
503}
504
505bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
506{
507 // Here we need to normalise the entry to the number of lines we already processed.
508 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
509 const auto recordPos = entry - offset;
510 int colIndex = 0;
511 for (auto &colType : fColTypesList) {
512 auto dataPtr = fRecords[recordPos][colIndex];
513 switch (colType) {
514 case 'D': {
515 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
516 break;
517 }
518 case 'L': {
519 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
520 break;
521 }
522 case 'O': {
523 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
524 break;
525 }
526 case 'T': {
527 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
528 break;
529 }
530 }
531 colIndex++;
532 }
533 return true;
534}
535
536void RCsvDS::SetNSlots(unsigned int nSlots)
537{
538 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
539
540 fNSlots = nSlots;
541
542 const auto nColumns = fHeaders.size();
543 // Initialize the entire set of addresses
544 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
545
546 // Initialize the per event data holders
547 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
548 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
549 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
550 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
551}
552
553std::string RCsvDS::GetLabel()
554{
555 return "RCsv";
556}
557
558RDataFrame FromCSV(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
559 std::unordered_map<std::string, char> &&colTypes)
560{
562 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
563 return rdf;
564}
565
566RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
567 std::unordered_map<std::string, char> &&colTypes)
568{
569 return FromCSV(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes));
570}
571
572} // ns RDF
573
574} // ns ROOT
#define b(i)
Definition RSha256.hxx:100
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:230
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:241
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Definition TROOT.cxx:585
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RCsvDS.cxx:495
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:120
void Finalize() final
Convenience method called after concluding an event-loop.
Definition RCsvDS.cxx:413
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:484
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:59
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:255
std::uint64_t fDataPos
Definition RCsvDS.hxx:45
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition RCsvDS.cxx:536
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:40
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:287
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:43
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:163
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:57
unsigned int fNSlots
Definition RCsvDS.hxx:47
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:421
const Long64_t fLinesChunkSize
Definition RCsvDS.hxx:50
std::string AsString() final
Definition RCsvDS.cxx:95
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:53
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:51
ULong64_t fProcessedLines
Definition RCsvDS.hxx:52
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:500
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:222
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:54
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:60
const char fDelimiter
Definition RCsvDS.hxx:49
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:43
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:58
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Constructor to create a CSV RDataSource for RDataFrame.
Definition RCsvDS.cxx:331
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:55
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:43
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:43
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:203
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:43
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:276
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:111
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:48
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition RCsvDS.cxx:426
std::string GetLabel() final
Return a string representation of the datasource type.
Definition RCsvDS.cxx:553
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition RCsvDS.cxx:171
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:43
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:61
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:64
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RCsvDS.cxx:505
~RCsvDS()
Destructor.
Definition RCsvDS.cxx:408
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:56
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition TRegexp.cxx:209
TLine * line
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize, std::unordered_map< std::string, char > &&colTypes)
Definition RCsvDS.cxx:566
RDataFrame FromCSV(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:558
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204