Logo ROOT  
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18An RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::MakeCsvDataFrame, which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
71The current implementation of RCsvDS reads the entire CSV file content into memory before
72RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame, it is
73important to check both how much memory is available and the size of the CSV file.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as zeros.
78*/
79// clang-format on
80
81#include <ROOT/RDF/Utils.hxx>
82#include <ROOT/TSeq.hxx>
83#include <ROOT/RCsvDS.hxx>
84#include <ROOT/RRawFile.hxx>
85#include <TError.h>
86
87#include <algorithm>
88#include <iostream>
89#include <memory>
90#include <sstream>
91#include <string>
92
93namespace ROOT {
94
95namespace RDF {
96
97std::string RCsvDS::AsString()
98{
99 return "CSV data source";
100}
101
102// Regular expressions for type inference
103const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
104const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
105const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
106const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
107const TRegexp RCsvDS::fgTrueRegex("^true$");
108const TRegexp RCsvDS::fgFalseRegex("^false$");
109
110const std::unordered_map<RCsvDS::ColType_t, std::string>
111 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
112
113void RCsvDS::FillHeaders(const std::string &line)
114{
115 auto columns = ParseColumns(line);
116 fHeaders.reserve(columns.size());
117 for (auto &col : columns) {
118 fHeaders.emplace_back(col);
119 }
120}
121
122void RCsvDS::FillRecord(const std::string &line, Record_t &record)
123{
124 auto i = 0U;
125
126 auto columns = ParseColumns(line);
127
128 for (auto &col : columns) {
129 auto colType = fColTypes[fHeaders[i]];
130
131 switch (colType) {
132 case 'D': {
133 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
134 break;
135 }
136 case 'L': {
137 if (col != "nan") {
138 record.emplace_back(new Long64_t(std::stoll(col)));
139 } else {
140 fColContainingEmpty.insert(fHeaders[i]);
141 record.emplace_back(new Long64_t(0));
142 }
143 break;
144 }
145 case 'O': {
146 auto b = new bool();
147 record.emplace_back(b);
148 if (col != "nan") {
149 std::istringstream(col) >> std::boolalpha >> *b;
150 } else {
151 fColContainingEmpty.insert(fHeaders[i]);
152 *b = false;
153 }
154 break;
155 }
156 case 'T': {
157 record.emplace_back(new std::string(col));
158 break;
159 }
160 }
161 ++i;
162 }
163}
164
166{
167 fHeaders.reserve(size);
168 for (size_t i = 0u; i < size; ++i) {
169 fHeaders.push_back("Col" + std::to_string(i));
170 }
171}
172
173std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
174{
175 const auto colType = GetType(colName);
176
177 if ((colType == 'D' && typeid(double) != ti) || (colType == 'L' && typeid(Long64_t) != ti) ||
178 (colType == 'T' && typeid(std::string) != ti) || (colType == 'O' && typeid(bool) != ti)) {
179 std::string err = "The type selected for column \"";
180 err += colName;
181 err += "\" does not correspond to column type, which is ";
182 err += fgColTypeMap.at(colType);
183 throw std::runtime_error(err);
184 }
185
186 const auto &colNames = GetColumnNames();
187 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
188 std::vector<void *> ret(fNSlots);
189 for (auto slot : ROOT::TSeqU(fNSlots)) {
190 auto &val = fColAddresses[index][slot];
191 if (ti == typeid(double)) {
192 val = &fDoubleEvtValues[index][slot];
193 } else if (ti == typeid(Long64_t)) {
194 val = &fLong64EvtValues[index][slot];
195 } else if (ti == typeid(std::string)) {
196 val = &fStringEvtValues[index][slot];
197 } else {
198 val = &fBoolEvtValues[index][slot];
199 }
200 ret[slot] = &val;
201 }
202 return ret;
203}
204
205void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
206{
207 for (const auto &col : fColTypes) {
208 if (!HasColumn(col.first)) {
209 std::string msg = "There is no column with name \"" + col.first + "\".";
210 if (!fReadHeaders) {
211 msg += "\nSince the input csv file does not contain headers, valid column names";
212 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
213 }
214 throw std::runtime_error(msg);
215 }
216 if (std::string("ODLT").find(col.second) == std::string::npos) {
217 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
218 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
219 throw std::runtime_error(msg);
220 }
221 }
222}
223
224void RCsvDS::InferColTypes(std::vector<std::string> &columns)
225{
226 const auto second_line = fCsvFile->GetFilePos();
227
228 for (auto i = 0u; i < columns.size(); ++i) {
229
230 if (fColTypes.find(fHeaders[i]) != fColTypes.end())
231 continue; // type was manually specified, nothing to do
232
233 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
234 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
235 std::string line;
236 if (!fCsvFile->Readln(line))
237 break; // EOF
238 const auto temp_columns = ParseColumns(line);
239 if (temp_columns[i] != "nan")
240 columns[i] = temp_columns[i]; // will break the loop in the next iteration
241 }
242 // reset the reading from the second line, because the first line is already loaded in `columns`
243 fCsvFile->Seek(second_line);
244
245 if (columns[i] == "nan") {
246 // could not find a non-empty value, default to double
247 fColTypes[fHeaders[i]] = 'D';
248 fColTypesList.push_back('D');
249 } else {
250 InferType(columns[i], i);
251 }
252 }
253}
254
255void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
256{
258 int dummy;
259
260 if (fgIntRegex.Index(col, &dummy) != -1) {
261 type = 'L'; // Long64_t
262 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
263 fgDoubleRegex3.Index(col, &dummy) != -1) {
264 type = 'D'; // double
265 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
266 type = 'O'; // bool
267 } else { // everything else is a string
268 type = 'T'; // std::string
269 }
270 // TODO: Date
271
272 fColTypes[fHeaders[idxCol]] = type;
273 fColTypesList.push_back(type);
274}
275
276std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
277{
278 std::vector<std::string> columns;
279
280 for (size_t i = 0; i < line.size(); ++i) {
281 i = ParseValue(line, columns, i);
282 }
283
284 return columns;
285}
286
287size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
288{
289 std::string val;
290 bool quoted = false;
291 const size_t prevPos = i; // used to check if cell is empty
292
293 for (; i < line.size(); ++i) {
294 if (line[i] == fDelimiter && !quoted) {
295 break;
296 } else if (line[i] == '"') {
297 // Keep just one quote for escaped quotes, none for the normal quotes
298 if (line[i + 1] != '"') {
299 quoted = !quoted;
300 } else {
301 val += line[++i];
302 }
303 } else {
304 val += line[i];
305 }
306 }
307
308 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
309 columns.emplace_back("nan");
310 else
311 columns.emplace_back(std::move(val));
312
313 // if the line ends with the delimiter, we need to append the default column value
314 // for the _next_, last column that won't be parsed (because we are out of characters)
315 if (i == line.size() - 1 && line[i] == fDelimiter)
316 columns.emplace_back("nan");
317
318 return i;
319}
320
321////////////////////////////////////////////////////////////////////////
322/// Constructor to create a CSV RDataSource for RDataFrame.
323/// \param[in] fileName Path or URL of the CSV file.
324/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
325/// (default `true`).
326/// \param[in] delimiter Delimiter character (default ',').
327/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
328/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
329/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
330/// Long64_t, 'T' for std::string)
331RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
332 std::unordered_map<std::string, char> &&colTypes)
333 : fReadHeaders(readHeaders), fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fDelimiter(delimiter),
334 fLinesChunkSize(linesChunkSize), fColTypes(std::move(colTypes))
335{
336 std::string line;
337
338 // Read the headers if present
339 if (fReadHeaders) {
340 if (fCsvFile->Readln(line)) {
342 } else {
343 std::string msg = "Error reading headers of CSV file ";
344 msg += fileName;
345 throw std::runtime_error(msg);
346 }
347 }
348
349 fDataPos = fCsvFile->GetFilePos();
350 bool eof = false;
351 do {
352 eof = !fCsvFile->Readln(line);
353 } while (line.empty() && !eof);
354 if (!eof) {
355 auto columns = ParseColumns(line);
356
357 // Generate headers if not present
358 if (!fReadHeaders) {
359 GenerateHeaders(columns.size());
360 }
361
362 // Ensure user is trying to set types only of existing columns
363 ValidateColTypes(columns);
364
365 // Infer types of columns with first record
366 InferColTypes(columns);
367
368 // rewind
369 fCsvFile->Seek(fDataPos);
370 } else {
371 std::string msg = "Could not infer column types of CSV file ";
372 msg += fileName;
373 throw std::runtime_error(msg);
374 }
375}
376
378{
379 for (auto &record : fRecords) {
380 for (size_t i = 0; i < record.size(); ++i) {
381 void *p = record[i];
382 const auto colType = fColTypes[fHeaders[i]];
383 switch (colType) {
384 case 'D': {
385 delete static_cast<double *>(p);
386 break;
387 }
388 case 'L': {
389 delete static_cast<Long64_t *>(p);
390 break;
391 }
392 case 'O': {
393 delete static_cast<bool *>(p);
394 break;
395 }
396 case 'T': {
397 delete static_cast<std::string *>(p);
398 break;
399 }
400 }
401 }
402 }
403 fRecords.clear();
404}
405
406////////////////////////////////////////////////////////////////////////
407/// Destructor.
409{
410 FreeRecords();
411}
412
414{
415 fCsvFile->Seek(fDataPos);
416 fProcessedLines = 0ULL;
418 FreeRecords();
419}
420
421const std::vector<std::string> &RCsvDS::GetColumnNames() const
422{
423 return fHeaders;
424}
425
426std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
427{
428
429 // Read records and store them in memory
430 auto linesToRead = fLinesChunkSize;
431 FreeRecords();
432
433 std::string line;
434 while ((-1LL == fLinesChunkSize || 0 != linesToRead) && fCsvFile->Readln(line)) {
435 if (line.empty()) continue; // skip empty lines
436 fRecords.emplace_back();
437 FillRecord(line, fRecords.back());
438 --linesToRead;
439 }
440
441 if (!fColContainingEmpty.empty()) {
442 std::string msg = "";
443 for (const auto &col : fColContainingEmpty) {
444 const auto colT = GetTypeName(col);
445 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
446 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
447 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
448 }
449 msg += "Please manually set the column type to `double` (with `D`) in `MakeCsvDataFrame` to read NaNs instead.\n";
450 Warning("RCsvDS", "%s", msg.c_str());
451 }
452
453 if (gDebug > 0) {
454 if (fLinesChunkSize == -1LL) {
455 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
456 } else {
457 Info("GetEntryRanges", "Attempted to read chunk of %lld lines of CSV file into memory, %zu lines read", fLinesChunkSize, fRecords.size());
458 }
459 }
460
461 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
462 const auto nRecords = fRecords.size();
463 if (0 == nRecords)
464 return entryRanges;
465
466 const auto chunkSize = nRecords / fNSlots;
467 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
468 auto start = 0ULL == fEntryRangesRequested ? 0ULL : fProcessedLines;
469 auto end = start;
470
471 for (auto i : ROOT::TSeqU(fNSlots)) {
472 start = end;
473 end += chunkSize;
474 entryRanges.emplace_back(start, end);
475 (void)i;
476 }
477 entryRanges.back().second += remainder;
478
479 fProcessedLines += nRecords;
481
482 return entryRanges;
483}
484
486{
487 if (!HasColumn(colName)) {
488 std::string msg = "The dataset does not have column ";
489 msg += colName;
490 throw std::runtime_error(msg);
491 }
492
493 return fColTypes.at(colName.data());
494}
495
496std::string RCsvDS::GetTypeName(std::string_view colName) const
497{
498 return fgColTypeMap.at(GetType(colName));
499}
500
502{
503 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
504}
505
506bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
507{
508 // Here we need to normalise the entry to the number of lines we already processed.
509 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
510 const auto recordPos = entry - offset;
511 int colIndex = 0;
512 for (auto &colType : fColTypesList) {
513 auto dataPtr = fRecords[recordPos][colIndex];
514 switch (colType) {
515 case 'D': {
516 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
517 break;
518 }
519 case 'L': {
520 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
521 break;
522 }
523 case 'O': {
524 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
525 break;
526 }
527 case 'T': {
528 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
529 break;
530 }
531 }
532 colIndex++;
533 }
534 return true;
535}
536
537void RCsvDS::SetNSlots(unsigned int nSlots)
538{
539 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
540
541 fNSlots = nSlots;
542
543 const auto nColumns = fHeaders.size();
544 // Initialize the entire set of addresses
545 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
546
547 // Initialize the per event data holders
548 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
549 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
550 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
551 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
552}
553
554std::string RCsvDS::GetLabel()
555{
556 return "RCsv";
557}
558
559RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
560 std::unordered_map<std::string, char> &&colTypes)
561{
563 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
564 return rdf;
565}
566
567} // ns RDF
568
569} // ns ROOT
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Definition: RtypesCore.h:80
unsigned long long ULong64_t
Definition: RtypesCore.h:81
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition: TError.cxx:220
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition: TError.cxx:231
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t b
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Definition: TROOT.cxx:585
The RRawFile provides read-only access to local and remote files.
Definition: RRawFile.hxx:43
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:122
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:485
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:59
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:255
std::uint64_t fDataPos
Definition: RCsvDS.hxx:45
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:40
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:287
static const TRegexp fgTrueRegex
Definition: RCsvDS.hxx:43
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:165
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:57
unsigned int fNSlots
Definition: RCsvDS.hxx:47
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:554
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:50
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:53
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:51
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:421
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:52
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:173
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:224
std::unordered_map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:54
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:60
const char fDelimiter
Definition: RCsvDS.hxx:49
static const TRegexp fgDoubleRegex2
Definition: RCsvDS.hxx:43
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:58
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:331
std::set< std::string > fColContainingEmpty
Definition: RCsvDS.hxx:55
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:426
static const TRegexp fgFalseRegex
Definition: RCsvDS.hxx:43
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:506
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:501
static const TRegexp fgDoubleRegex3
Definition: RCsvDS.hxx:43
void Finalize()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:413
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:537
void ValidateColTypes(std::vector< std::string > &) const
Definition: RCsvDS.cxx:205
static const TRegexp fgIntRegex
Definition: RCsvDS.hxx:43
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:496
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:276
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:113
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition: RCsvDS.hxx:48
static const TRegexp fgDoubleRegex1
Definition: RCsvDS.hxx:43
std::string AsString()
Definition: RCsvDS.cxx:97
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:61
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:64
void FreeRecords()
Definition: RCsvDS.cxx:377
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:408
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:56
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Definition: RDataFrame.hxx:40
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:67
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TLine * line
RVec< PromoteTypes< T0, T1 > > remainder(const T0 &x, const RVec< T1 > &v)
Definition: RVec.hxx:1742
basic_string_view< char > string_view
void(off) SmallVectorTemplateBase< T
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:559
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.