Logo ROOT  
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18An RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::MakeCsvDataFrame, which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
71By default (chunk size -1), RCsvDS reads the entire CSV file content into memory before
72RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame, it is important to
73check both how much memory is available and the size of the CSV file, or to set a chunk size so that only that many lines are held in memory at a time.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as `0` and `false`, respectively.
78*/
79// clang-format on
80
81#include <ROOT/TSeq.hxx>
82#include <ROOT/RCsvDS.hxx>
83#include <ROOT/RRawFile.hxx>
84#include <TError.h>
85
86#include <algorithm>
87#include <memory>
88#include <sstream>
89#include <string>
90
91namespace ROOT {
92
93namespace RDF {
94
95std::string RCsvDS::AsString()
96{
97 return "CSV data source";
98}
99
100// Regular expressions for type inference
101const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
102const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
103const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
104const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
105const TRegexp RCsvDS::fgTrueRegex("^true$");
106const TRegexp RCsvDS::fgFalseRegex("^false$");
107
108const std::unordered_map<RCsvDS::ColType_t, std::string>
109 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
110
111void RCsvDS::FillHeaders(const std::string &line)
112{
113 auto columns = ParseColumns(line);
114 fHeaders.reserve(columns.size());
115 for (auto &col : columns) {
116 fHeaders.emplace_back(col);
117 }
118}
119
120void RCsvDS::FillRecord(const std::string &line, Record_t &record)
121{
122 auto i = 0U;
123
124 auto columns = ParseColumns(line);
125
126 for (auto &col : columns) {
127 auto colType = fColTypes[fHeaders[i]];
128
129 switch (colType) {
130 case 'D': {
131 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
132 break;
133 }
134 case 'L': {
135 if (col != "nan") {
136 record.emplace_back(new Long64_t(std::stoll(col)));
137 } else {
138 fColContainingEmpty.insert(fHeaders[i]);
139 record.emplace_back(new Long64_t(0));
140 }
141 break;
142 }
143 case 'O': {
144 auto b = new bool();
145 record.emplace_back(b);
146 if (col != "nan") {
147 std::istringstream(col) >> std::boolalpha >> *b;
148 } else {
149 fColContainingEmpty.insert(fHeaders[i]);
150 *b = false;
151 }
152 break;
153 }
154 case 'T': {
155 record.emplace_back(new std::string(col));
156 break;
157 }
158 }
159 ++i;
160 }
161}
162
164{
165 fHeaders.reserve(size);
166 for (size_t i = 0u; i < size; ++i) {
167 fHeaders.push_back("Col" + std::to_string(i));
168 }
169}
170
171std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
172{
173 const auto colType = GetType(colName);
174
175 if ((colType == 'D' && typeid(double) != ti) || (colType == 'L' && typeid(Long64_t) != ti) ||
176 (colType == 'T' && typeid(std::string) != ti) || (colType == 'O' && typeid(bool) != ti)) {
177 std::string err = "The type selected for column \"";
178 err += colName;
179 err += "\" does not correspond to column type, which is ";
180 err += fgColTypeMap.at(colType);
181 throw std::runtime_error(err);
182 }
183
184 const auto &colNames = GetColumnNames();
185 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
186 std::vector<void *> ret(fNSlots);
187 for (auto slot : ROOT::TSeqU(fNSlots)) {
188 auto &val = fColAddresses[index][slot];
189 if (ti == typeid(double)) {
190 val = &fDoubleEvtValues[index][slot];
191 } else if (ti == typeid(Long64_t)) {
192 val = &fLong64EvtValues[index][slot];
193 } else if (ti == typeid(std::string)) {
194 val = &fStringEvtValues[index][slot];
195 } else {
196 val = &fBoolEvtValues[index][slot];
197 }
198 ret[slot] = &val;
199 }
200 return ret;
201}
202
203void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
204{
205 for (const auto &col : fColTypes) {
206 if (!HasColumn(col.first)) {
207 std::string msg = "There is no column with name \"" + col.first + "\".";
208 if (!fReadHeaders) {
209 msg += "\nSince the input csv file does not contain headers, valid column names";
210 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
211 }
212 throw std::runtime_error(msg);
213 }
214 if (std::string("ODLT").find(col.second) == std::string::npos) {
215 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
216 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
217 throw std::runtime_error(msg);
218 }
219 }
220}
221
222void RCsvDS::InferColTypes(std::vector<std::string> &columns)
223{
224 const auto second_line = fCsvFile->GetFilePos();
225
226 for (auto i = 0u; i < columns.size(); ++i) {
227
228 if (fColTypes.find(fHeaders[i]) != fColTypes.end())
229 continue; // type was manually specified, nothing to do
230
231 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
232 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
233 std::string line;
234 if (!fCsvFile->Readln(line))
235 break; // EOF
236 const auto temp_columns = ParseColumns(line);
237 if (temp_columns[i] != "nan")
238 columns[i] = temp_columns[i]; // will break the loop in the next iteration
239 }
240 // reset the reading from the second line, because the first line is already loaded in `columns`
241 fCsvFile->Seek(second_line);
242
243 if (columns[i] == "nan") {
244 // could not find a non-empty value, default to double
245 fColTypes[fHeaders[i]] = 'D';
246 fColTypesList.push_back('D');
247 } else {
248 InferType(columns[i], i);
249 }
250 }
251}
252
253void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
254{
256 int dummy;
257
258 if (fgIntRegex.Index(col, &dummy) != -1) {
259 type = 'L'; // Long64_t
260 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
261 fgDoubleRegex3.Index(col, &dummy) != -1) {
262 type = 'D'; // double
263 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
264 type = 'O'; // bool
265 } else { // everything else is a string
266 type = 'T'; // std::string
267 }
268 // TODO: Date
269
270 fColTypes[fHeaders[idxCol]] = type;
271 fColTypesList.push_back(type);
272}
273
274std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
275{
276 std::vector<std::string> columns;
277
278 for (size_t i = 0; i < line.size(); ++i) {
279 i = ParseValue(line, columns, i);
280 }
281
282 return columns;
283}
284
285size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
286{
287 std::string val;
288 bool quoted = false;
289 const size_t prevPos = i; // used to check if cell is empty
290
291 for (; i < line.size(); ++i) {
292 if (line[i] == fDelimiter && !quoted) {
293 break;
294 } else if (line[i] == '"') {
295 // Keep just one quote for escaped quotes, none for the normal quotes
296 if (line[i + 1] != '"') {
297 quoted = !quoted;
298 } else {
299 val += line[++i];
300 }
301 } else {
302 val += line[i];
303 }
304 }
305
306 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
307 columns.emplace_back("nan");
308 else
309 columns.emplace_back(std::move(val));
310
311 // if the line ends with the delimiter, we need to append the default column value
312 // for the _next_, last column that won't be parsed (because we are out of characters)
313 if (i == line.size() - 1 && line[i] == fDelimiter)
314 columns.emplace_back("nan");
315
316 return i;
317}
318
319////////////////////////////////////////////////////////////////////////
320/// Constructor to create a CSV RDataSource for RDataFrame.
321/// \param[in] fileName Path or URL of the CSV file.
322/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
323/// (default `true`).
324/// \param[in] delimiter Delimiter character (default ',').
325/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
326/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
327/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
328/// Long64_t, 'T' for std::string)
329RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
330 std::unordered_map<std::string, char> &&colTypes)
331 : fReadHeaders(readHeaders), fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fDelimiter(delimiter),
332 fLinesChunkSize(linesChunkSize), fColTypes(std::move(colTypes))
333{
334 std::string line;
335
336 // Read the headers if present
337 if (fReadHeaders) {
338 if (fCsvFile->Readln(line)) {
340 } else {
341 std::string msg = "Error reading headers of CSV file ";
342 msg += fileName;
343 throw std::runtime_error(msg);
344 }
345 }
346
347 fDataPos = fCsvFile->GetFilePos();
348 bool eof = false;
349 do {
350 eof = !fCsvFile->Readln(line);
351 } while (line.empty() && !eof);
352 if (!eof) {
353 auto columns = ParseColumns(line);
354
355 // Generate headers if not present
356 if (!fReadHeaders) {
357 GenerateHeaders(columns.size());
358 }
359
360 // Ensure user is trying to set types only of existing columns
361 ValidateColTypes(columns);
362
363 // Infer types of columns with first record
364 InferColTypes(columns);
365
366 // rewind
367 fCsvFile->Seek(fDataPos);
368 } else {
369 std::string msg = "Could not infer column types of CSV file ";
370 msg += fileName;
371 throw std::runtime_error(msg);
372 }
373}
374
376{
377 for (auto &record : fRecords) {
378 for (size_t i = 0; i < record.size(); ++i) {
379 void *p = record[i];
380 const auto colType = fColTypes[fHeaders[i]];
381 switch (colType) {
382 case 'D': {
383 delete static_cast<double *>(p);
384 break;
385 }
386 case 'L': {
387 delete static_cast<Long64_t *>(p);
388 break;
389 }
390 case 'O': {
391 delete static_cast<bool *>(p);
392 break;
393 }
394 case 'T': {
395 delete static_cast<std::string *>(p);
396 break;
397 }
398 }
399 }
400 }
401 fRecords.clear();
402}
403
404////////////////////////////////////////////////////////////////////////
405/// Destructor.
407{
408 FreeRecords();
409}
410
412{
413 fCsvFile->Seek(fDataPos);
414 fProcessedLines = 0ULL;
416 FreeRecords();
417}
418
419const std::vector<std::string> &RCsvDS::GetColumnNames() const
420{
421 return fHeaders;
422}
423
424std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
425{
426 // Read records and store them in memory
427 auto linesToRead = fLinesChunkSize;
428 FreeRecords();
429
430 std::string line;
431 while ((-1LL == fLinesChunkSize || 0 != linesToRead) && fCsvFile->Readln(line)) {
432 if (line.empty()) continue; // skip empty lines
433 fRecords.emplace_back();
434 FillRecord(line, fRecords.back());
435 --linesToRead;
436 }
437
438 if (!fColContainingEmpty.empty()) {
439 std::string msg = "";
440 for (const auto &col : fColContainingEmpty) {
441 const auto colT = GetTypeName(col);
442 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
443 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
444 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
445 }
446 msg += "Please manually set the column type to `double` (with `D`) in `MakeCsvDataFrame` to read NaNs instead.\n";
447 Warning("RCsvDS", "%s", msg.c_str());
448 }
449
450 if (gDebug > 0) {
451 if (fLinesChunkSize == -1LL) {
452 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
453 } else {
454 Info("GetEntryRanges", "Attempted to read chunk of %lld lines of CSV file into memory, %zu lines read", fLinesChunkSize, fRecords.size());
455 }
456 }
457
458 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
459 const auto nRecords = fRecords.size();
460 if (0 == nRecords)
461 return entryRanges;
462
463 const auto chunkSize = nRecords / fNSlots;
464 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
465 auto start = fProcessedLines;
466 auto end = start;
467
468 for (auto i : ROOT::TSeqU(fNSlots)) {
469 start = end;
470 end += chunkSize;
471 entryRanges.emplace_back(start, end);
472 (void)i;
473 }
474 entryRanges.back().second += remainder;
475
476 fProcessedLines += nRecords;
478
479 return entryRanges;
480}
481
483{
484 if (!HasColumn(colName)) {
485 std::string msg = "The dataset does not have column ";
486 msg += colName;
487 throw std::runtime_error(msg);
488 }
489
490 return fColTypes.at(colName.data());
491}
492
493std::string RCsvDS::GetTypeName(std::string_view colName) const
494{
495 return fgColTypeMap.at(GetType(colName));
496}
497
499{
500 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
501}
502
503bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
504{
505 // Here we need to normalise the entry to the number of lines we already processed.
506 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
507 const auto recordPos = entry - offset;
508 int colIndex = 0;
509 for (auto &colType : fColTypesList) {
510 auto dataPtr = fRecords[recordPos][colIndex];
511 switch (colType) {
512 case 'D': {
513 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
514 break;
515 }
516 case 'L': {
517 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
518 break;
519 }
520 case 'O': {
521 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
522 break;
523 }
524 case 'T': {
525 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
526 break;
527 }
528 }
529 colIndex++;
530 }
531 return true;
532}
533
534void RCsvDS::SetNSlots(unsigned int nSlots)
535{
536 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
537
538 fNSlots = nSlots;
539
540 const auto nColumns = fHeaders.size();
541 // Initialize the entire set of addresses
542 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
543
544 // Initialize the per event data holders
545 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
546 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
547 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
548 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
549}
550
551std::string RCsvDS::GetLabel()
552{
553 return "RCsv";
554}
555
556RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
557 std::unordered_map<std::string, char> &&colTypes)
558{
560 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
561 return rdf;
562}
563
564} // ns RDF
565
566} // ns ROOT
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition: RtypesCore.h:80
unsigned long long ULong64_t
Definition: RtypesCore.h:81
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition: TError.cxx:221
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition: TError.cxx:232
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t b
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Definition: TROOT.cxx:585
The RRawFile provides read-only access to local and remote files.
Definition: RRawFile.hxx:43
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:493
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:120
void Finalize() final
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:411
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:482
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:59
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:253
std::uint64_t fDataPos
Definition: RCsvDS.hxx:45
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:534
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:40
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:285
static const TRegexp fgTrueRegex
Definition: RCsvDS.hxx:43
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:163
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:57
unsigned int fNSlots
Definition: RCsvDS.hxx:47
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:419
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:50
std::string AsString() final
Definition: RCsvDS.cxx:95
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:53
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:51
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:52
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:498
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:222
std::unordered_map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:54
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:60
const char fDelimiter
Definition: RCsvDS.hxx:49
static const TRegexp fgDoubleRegex2
Definition: RCsvDS.hxx:43
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:58
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:329
std::set< std::string > fColContainingEmpty
Definition: RCsvDS.hxx:55
static const TRegexp fgFalseRegex
Definition: RCsvDS.hxx:43
static const TRegexp fgDoubleRegex3
Definition: RCsvDS.hxx:43
void ValidateColTypes(std::vector< std::string > &) const
Definition: RCsvDS.cxx:203
static const TRegexp fgIntRegex
Definition: RCsvDS.hxx:43
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:274
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:111
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition: RCsvDS.hxx:48
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:424
std::string GetLabel() final
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:551
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:171
static const TRegexp fgDoubleRegex1
Definition: RCsvDS.hxx:43
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:61
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:64
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:503
void FreeRecords()
Definition: RCsvDS.cxx:375
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:406
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:56
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Definition: RDataFrame.hxx:40
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:67
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TLine * line
RVec< PromoteTypes< T0, T1 > > remainder(const T0 &x, const RVec< T1 > &v)
Definition: RVec.hxx:1772
basic_string_view< char > string_view
void(off) SmallVectorTemplateBase< T
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:556
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.