Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
18A RDataFrame that reads from a CSV file can be constructed using the factory method
19ROOT::RDF::FromCSV, which accepts five parameters:
201. Path to the CSV file.
212. Boolean that specifies whether the first row of the CSV file contains headers or
22not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
233. Delimiter (optional, default ',').
244. Chunk size (optional, default is -1 to read all) - number of lines to read at a time
255. Column Types (optional, default is an empty map). A map with column names as keys and their type
26(expressed as a single character, see below) as values.
27
28The type of columns that do not appear in the map is inferred from the data.
29The supported types are:
30- Integer: stored as a 64-bit long long int; can be specified in the column types map with 'L'.
31- Floating point number: stored with double precision; specified with 'D'.
32- Boolean: matches the literals `true` and `false`; specified with 'O'.
33- String: stored as an std::string, matches anything that does not fall into any of the
34previous types; specified with 'T'.
35
36These are some formatting rules expected by the RCsvDS implementation:
37- All records must have the same number of fields, in the same order.
38- Any field may be quoted.
39~~~
40 "1997","Ford","E350"
41~~~
42- Fields with embedded delimiters (e.g. comma) must be quoted.
43~~~
44 1997,Ford,E350,"Super, luxurious truck"
45~~~
46- Fields with double-quote characters must be quoted, and each of the embedded
47double-quote characters must be represented by a pair of double-quote characters.
48~~~
49 1997,Ford,E350,"Super, ""luxurious"" truck"
50~~~
51- Fields with embedded line breaks are not supported, even when quoted.
52~~~
53 1997,Ford,E350,"Go get one now
54 they are going fast"
55~~~
56- Spaces are considered part of a field and are not ignored.
57~~~
58 1997, Ford , E350
59 not same as
60 1997,Ford,E350
61 but same as
62 1997, "Ford" , E350
63~~~
64- If a header row is provided, it must contain column names for each of the fields.
65~~~
66 Year,Make,Model
67 1997,Ford,E350
68 2000,Mercury,Cougar
69~~~
70
By default (chunk size -1), RCsvDS reads the entire CSV file content into memory before
RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame, it is
important to check both how much memory is available and the size of the CSV file.
When a positive chunk size is specified, only that many lines are held in memory at a time.
74
75RCsvDS can handle empty cells and also allows the usage of the special keywords "NaN" and "nan" to
76indicate `nan` values. If the column is of type double, these cells are stored internally as `nan`.
77Empty cells and explicit `nan`-s inside columns of type Long64_t/bool are stored as zeros.
78*/
79// clang-format on
80
81#include <ROOT/TSeq.hxx>
82#include <ROOT/RCsvDS.hxx>
83#include <ROOT/RRawFile.hxx>
84#include <TError.h>
85
86#include <algorithm>
87#include <cctype>
88#include <cinttypes>
89#include <iterator>
90#include <memory>
91#include <sstream>
92#include <string>
93
94namespace ROOT {
95
96namespace RDF {
97
98std::string RCsvDS::AsString()
99{
100 return "CSV data source";
101}
102
103// Regular expressions for type inference
104const TRegexp RCsvDS::fgIntRegex("^[-+]?[0-9]+$");
105const TRegexp RCsvDS::fgDoubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
106const TRegexp RCsvDS::fgDoubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
107const TRegexp RCsvDS::fgDoubleRegex3("^[-+]?[0-9]*\\.[0-9]+[eEdDqQ][-+]?[0-9]+$");
108const TRegexp RCsvDS::fgTrueRegex("^true$");
109const TRegexp RCsvDS::fgFalseRegex("^false$");
110
111const std::unordered_map<RCsvDS::ColType_t, std::string>
112 RCsvDS::fgColTypeMap({{'O', "bool"}, {'D', "double"}, {'L', "Long64_t"}, {'T', "std::string"}});
113
114bool RCsvDS::Readln(std::string &line)
115{
116 auto fnLeftTrim = [](std::string &s) {
117 const auto N = s.size();
118 std::size_t idxStart = 0;
119 for (; idxStart < N && std::isspace(s[idxStart]); ++idxStart)
120 ;
121 if (idxStart)
122 s.erase(0, idxStart);
123 };
124
125 auto fnRightTrim = [](std::string &s) {
126 size_t nTrim = 0;
127 for (auto itr = s.rbegin(); itr != s.rend() && std::isspace(*itr); ++itr, ++nTrim)
128 ;
129 if (nTrim)
130 s.resize(s.size() - nTrim);
131 };
132
133 while (true) {
134 const bool eof = !fCsvFile->Readln(line);
135 if (eof)
136 return false;
137 fLineNumber++;
138 if ((fMaxLineNumber >= 0) && (fLineNumber > fMaxLineNumber))
139 return false;
140
143 if (fOptions.fComment) {
144 auto idxComment = line.find(fOptions.fComment);
145 if (idxComment == 0)
146 continue;
147 if (idxComment != std::string::npos)
148 line.resize(idxComment);
149 }
152 if (fOptions.fSkipBlankLines && line.empty())
153 continue;
154
155 return true;
156 }
157}
158
159void RCsvDS::RewindToData()
160{
161 fCsvFile->Seek(fDataPos);
163}
164
165void RCsvDS::FillHeaders(const std::string &line)
166{
167 const auto columns = ParseColumns(line);
168
169 if (!fOptions.fColumnNames.empty()) {
170 if (fOptions.fColumnNames.size() != columns.size()) {
171 auto msg = std::string("Error: passed ") + std::to_string(fOptions.fColumnNames.size()) +
172 " column names for a CSV file containing " + std::to_string(columns.size()) + " columns!";
173 throw std::runtime_error(msg);
174 }
175 std::swap(fHeaders, fOptions.fColumnNames);
176 return;
177 }
178
179 fHeaders.reserve(columns.size());
180 for (auto &col : columns) {
181 fHeaders.emplace_back(col);
182 }
183}
184
185void RCsvDS::FillRecord(const std::string &line, Record_t &record)
186{
187 auto i = 0U;
188
189 auto columns = ParseColumns(line);
190
191 for (auto &col : columns) {
192 auto colType = fColTypes[fHeaders[i]];
193
194 switch (colType) {
195 case 'D': {
196 record.emplace_back(new double((col != "nan") ? std::stod(col) : std::numeric_limits<double>::quiet_NaN()));
197 break;
198 }
199 case 'L': {
200 if (col != "nan") {
201 record.emplace_back(new Long64_t(std::stoll(col)));
202 } else {
203 fColContainingEmpty.insert(fHeaders[i]);
204 record.emplace_back(new Long64_t(0));
205 }
206 break;
207 }
208 case 'O': {
209 auto b = new bool();
210 record.emplace_back(b);
211 if (col != "nan") {
212 std::istringstream(col) >> std::boolalpha >> *b;
213 } else {
214 fColContainingEmpty.insert(fHeaders[i]);
215 *b = false;
216 }
217 break;
218 }
219 case 'T': {
220 record.emplace_back(new std::string(col));
221 break;
222 }
223 }
224 ++i;
225 }
226}
227
228void RCsvDS::GenerateHeaders(size_t size)
229{
230 if (!fOptions.fColumnNames.empty()) {
231 if (fOptions.fColumnNames.size() != size) {
232 auto msg = std::string("Error: passed ") + std::to_string(fOptions.fColumnNames.size()) +
233 " column names for a CSV file containing " + std::to_string(size) + " columns!";
234 throw std::runtime_error(msg);
235 }
236 std::swap(fHeaders, fOptions.fColumnNames);
237 return;
238 }
239
240 fHeaders.reserve(size);
241 for (size_t i = 0u; i < size; ++i) {
242 fHeaders.push_back("Col" + std::to_string(i));
243 }
244}
245
246std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view, const std::type_info &)
247{
248 return {};
249}
250
251void RCsvDS::ValidateColTypes(std::vector<std::string> &columns) const
252{
253 for (const auto &col : fColTypes) {
254 if (!HasColumn(col.first)) {
255 std::string msg = "There is no column with name \"" + col.first + "\".";
256 if (!fOptions.fHeaders) {
257 msg += "\nSince the input csv file does not contain headers, valid column names";
258 msg += " are [\"Col0\", ..., \"Col" + std::to_string(columns.size() - 1) + "\"].";
259 }
260 throw std::runtime_error(msg);
261 }
262 if (std::string("ODLT").find(col.second) == std::string::npos) {
263 std::string msg = "Type alias '" + std::string(1, col.second) + "' is not supported.\n";
264 msg += "Supported type aliases are 'O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string.";
265 throw std::runtime_error(msg);
266 }
267 }
268}
269
270void RCsvDS::InferColTypes(std::vector<std::string> &columns)
271{
272 const auto second_line = fCsvFile->GetFilePos();
273
274 for (auto i = 0u; i < columns.size(); ++i) {
275 const auto userSpecifiedType = fColTypes.find(fHeaders[i]);
276 if (userSpecifiedType != fColTypes.end()) {
277 fColTypesList.push_back(userSpecifiedType->second);
278 continue;
279 }
280
281 // read <=10 extra lines until a non-empty cell on this column is found, so that type is determined
282 for (auto extraRowsRead = 0u; extraRowsRead < 10u && columns[i] == "nan"; ++extraRowsRead) {
283 std::string line;
284 if (!Readln(line))
285 break; // EOF
286 const auto temp_columns = ParseColumns(line);
287 if (temp_columns[i] != "nan")
288 columns[i] = temp_columns[i]; // will break the loop in the next iteration
289 }
290 // reset the reading from the second line, because the first line is already loaded in `columns`
291 fCsvFile->Seek(second_line);
292
293 if (columns[i] == "nan") {
294 // could not find a non-empty value, default to double
295 fColTypes[fHeaders[i]] = 'D';
296 fColTypesList.push_back('D');
297 } else {
298 InferType(columns[i], i);
299 }
300 }
301}
302
303void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
304{
306 int dummy;
307
308 if (fgIntRegex.Index(col, &dummy) != -1) {
309 type = 'L'; // Long64_t
310 } else if (fgDoubleRegex1.Index(col, &dummy) != -1 || fgDoubleRegex2.Index(col, &dummy) != -1 ||
311 fgDoubleRegex3.Index(col, &dummy) != -1) {
312 type = 'D'; // double
313 } else if (fgTrueRegex.Index(col, &dummy) != -1 || fgFalseRegex.Index(col, &dummy) != -1) {
314 type = 'O'; // bool
315 } else { // everything else is a string
316 type = 'T'; // std::string
317 }
318 // TODO: Date
319
321 fColTypesList.push_back(type);
322}
323
324std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
325{
326 std::vector<std::string> columns;
327
328 for (size_t i = 0; i < line.size(); ++i) {
329 i = ParseValue(line, columns, i);
330 }
331
332 return columns;
333}
334
335size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
336{
337 std::string val;
338 bool quoted = false;
339 const size_t prevPos = i; // used to check if cell is empty
340
341 for (; i < line.size(); ++i) {
342 if (line[i] == fOptions.fDelimiter && !quoted) {
343 break;
344 } else if (line[i] == '"') {
345 // Keep just one quote for escaped quotes, none for the normal quotes
346 if (line[i + 1] != '"') {
347 quoted = !quoted;
348 } else {
349 val += line[++i];
350 }
351 } else {
352 val += line[i];
353 }
354 }
355
356 if (prevPos == i || val == "nan" || val == "NaN") // empty cell or explicit nan/NaN
357 columns.emplace_back("nan");
358 else
359 columns.emplace_back(std::move(val));
360
361 // if the line ends with the delimiter, we need to append the default column value
362 // for the _next_, last column that won't be parsed (because we are out of characters)
363 if (i == line.size() - 1 && line[i] == fOptions.fDelimiter)
364 columns.emplace_back("nan");
365
366 return i;
367}
368
369void RCsvDS::Construct()
370{
371 std::string line;
372
374 // It is possible to not read the file twice, but the implementation would be more complicated
375 std::int64_t nLines = 0;
376 std::string tmp;
377 while (fCsvFile->Readln(tmp))
378 nLines++;
380 std::string msg = "Error: too many footer lines to skip in CSV file ";
381 msg += fCsvFile->GetUrl();
382 throw std::runtime_error(msg);
383 }
384 fCsvFile->Seek(0);
386 }
387
388 for (std::int64_t i = 0; i < fOptions.fSkipFirstNLines; ++i) {
389 if (!fCsvFile->Readln(line))
390 break;
391 fLineNumber++;
392 }
393
394 // Read the headers if present
395 if (fOptions.fHeaders) {
396 if (Readln(line)) {
398 } else {
399 std::string msg = "Error reading headers of CSV file ";
400 msg += fCsvFile->GetUrl();
401 throw std::runtime_error(msg);
402 }
403 }
404
405 fDataPos = fCsvFile->GetFilePos();
407 if (Readln(line)) {
408 auto columns = ParseColumns(line);
409
410 // Generate headers if not present
411 if (!fOptions.fHeaders) {
412 GenerateHeaders(columns.size());
413 }
414
415 // Ensure user is trying to set types only of existing columns
417
418 // Infer types of columns with first record
420
421 // rewind
422 RewindToData();
423 } else {
424 std::string msg = "Could not infer column types of CSV file ";
425 msg += fCsvFile->GetUrl();
426 throw std::runtime_error(msg);
427 }
428}
429
430////////////////////////////////////////////////////////////////////////
431/// Constructor to create a CSV RDataSource for RDataFrame.
432/// \param[in] fileName Path or URL of the CSV file.
433/// \param[in] options File parsing settings
434RCsvDS::RCsvDS(std::string_view fileName, const ROptions &options)
435 : fOptions(options), fCsvFile(ROOT::Internal::RRawFile::Create(fileName))
436{
437 std::swap(fColTypes, fOptions.fColumnTypes);
438
439 Construct();
440}
441
442////////////////////////////////////////////////////////////////////////
443/// Constructor to create a CSV RDataSource for RDataFrame.
444/// \param[in] fileName Path or URL of the CSV file.
445/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
446/// (default `true`).
447/// \param[in] delimiter Delimiter character (default ',').
448/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
449/// \param[in] colTypes Allows users to manually specify column types. Accepts an unordered map with keys being
450/// column names, values being type specifiers ('O' for boolean, 'D' for double, 'L' for
451/// Long64_t, 'T' for std::string)
452RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
453 std::unordered_map<std::string, char> &&colTypes)
454 : fCsvFile(ROOT::Internal::RRawFile::Create(fileName)), fColTypes(std::move(colTypes))
455{
459
460 Construct();
461}
462
463void RCsvDS::FreeRecords()
464{
465 for (auto &record : fRecords) {
466 for (size_t i = 0; i < record.size(); ++i) {
467 void *p = record[i];
468 const auto colType = fColTypes[fHeaders[i]];
469 switch (colType) {
470 case 'D': {
471 delete static_cast<double *>(p);
472 break;
473 }
474 case 'L': {
475 delete static_cast<Long64_t *>(p);
476 break;
477 }
478 case 'O': {
479 delete static_cast<bool *>(p);
480 break;
481 }
482 case 'T': {
483 delete static_cast<std::string *>(p);
484 break;
485 }
486 }
487 }
488 }
489 fRecords.clear();
490}
491
492////////////////////////////////////////////////////////////////////////
493/// Destructor.
494RCsvDS::~RCsvDS()
495{
496 FreeRecords();
497}
498
499void RCsvDS::Finalize()
500{
501 RewindToData();
504 FreeRecords();
505}
506
507const std::vector<std::string> &RCsvDS::GetColumnNames() const
508{
509 return fHeaders;
510}
511
512std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
513{
514 // Read records and store them in memory
516 FreeRecords();
517
518 std::string line;
519 while ((-1LL == fOptions.fLinesChunkSize || 0 != linesToRead) && Readln(line)) {
520 fRecords.emplace_back();
521 FillRecord(line, fRecords.back());
522 --linesToRead;
523 }
524
525 if (!fColContainingEmpty.empty()) {
526 std::string msg = "";
527 for (const auto &col : fColContainingEmpty) {
528 const auto colT = GetTypeName(col);
529 msg += "Column \"" + col + "\" of type " + colT + " contains empty cell(s) or NaN(s).\n";
530 msg += "There is no `nan` equivalent for type " + colT + ", hence ";
531 msg += std::string(colT == "Long64_t" ? "`0`" : "`false`") + " is stored.\n";
532 }
533 msg += "Please manually set the column type to `double` (with `D`) in `FromCSV` to read NaNs instead.\n";
534 Warning("RCsvDS", "%s", msg.c_str());
535 }
536
537 if (gDebug > 0) {
538 if (fOptions.fLinesChunkSize == -1LL) {
539 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %zu lines read", fRecords.size());
540 } else {
541 Info("GetEntryRanges", "Attempted to read chunk of %" PRId64 " lines of CSV file into memory, %zu lines read",
543 }
544 }
545
546 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
547 const auto nRecords = fRecords.size();
548 if (0 == nRecords)
549 return entryRanges;
550
551 const auto chunkSize = nRecords / fNSlots;
552 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
553 auto start = fProcessedLines;
554 auto end = start;
555
556 for (auto i : ROOT::TSeqU(fNSlots)) {
557 start = end;
558 end += chunkSize;
559 entryRanges.emplace_back(start, end);
560 (void)i;
561 }
562 entryRanges.back().second += remainder;
563
566
567 return entryRanges;
568}
569
570RCsvDS::ColType_t RCsvDS::GetType(std::string_view colName) const
571{
572 if (!HasColumn(colName)) {
573 std::string msg = "The dataset does not have column ";
574 msg += colName;
575 throw std::runtime_error(msg);
576 }
577
578 return fColTypes.at(colName.data());
579}
580
581std::string RCsvDS::GetTypeName(std::string_view colName) const
582{
583 return fgColTypeMap.at(GetType(colName));
584}
585
586bool RCsvDS::HasColumn(std::string_view colName) const
587{
588 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
589}
590
591bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
592{
593 // Here we need to normalise the entry to the number of lines we already processed.
595 const auto recordPos = entry - offset;
596 int colIndex = 0;
597 for (auto &colType : fColTypesList) {
599 switch (colType) {
600 case 'D': {
601 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
602 break;
603 }
604 case 'L': {
605 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
606 break;
607 }
608 case 'O': {
609 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
610 break;
611 }
612 case 'T': {
613 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
614 break;
615 }
616 }
617 colIndex++;
618 }
619 return true;
620}
621
622void RCsvDS::SetNSlots(unsigned int nSlots)
623{
624 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
625
626 fNSlots = nSlots;
627
628 const auto nColumns = fHeaders.size();
629 // Initialize the entire set of addresses
630 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
631
632 // Initialize the per event data holders
633 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
634 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
635 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
636 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
637}
638
639std::string RCsvDS::GetLabel()
640{
641 return "RCsv";
642}
643
644RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
645{
646 ROOT::RDataFrame rdf(std::make_unique<RCsvDS>(fileName, options));
647 return rdf;
648}
649
650RDataFrame FromCSV(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize,
651 std::unordered_map<std::string, char> &&colTypes)
652{
654 std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize, std::move(colTypes)));
655 return rdf;
656}
657
658} // namespace RDF
659
660} // namespace ROOT
661
662std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
663ROOT::RDF::RCsvDS::GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid)
664{
665 const auto colType = GetType(colName);
666
667 if ((colType == 'D' && typeid(double) != tid) || (colType == 'L' && typeid(Long64_t) != tid) ||
668 (colType == 'T' && typeid(std::string) != tid) || (colType == 'O' && typeid(bool) != tid)) {
669 std::string err = "The type selected for column \"";
670 err += colName;
671 err += "\" does not correspond to column type, which is ";
672 err += fgColTypeMap.at(colType);
673 throw std::runtime_error(err);
674 }
675
676 const auto &colNames = GetColumnNames();
677 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
678 auto &val = fColAddresses[index][slot];
679
680 if (tid == typeid(double)) {
681 val = &fDoubleEvtValues[index][slot];
682 } else if (tid == typeid(Long64_t)) {
683 val = &fLong64EvtValues[index][slot];
684 } else if (tid == typeid(std::string)) {
685 val = &fStringEvtValues[index][slot];
686 } else {
687 val = &fBoolEvtValues[index][slot];
688 }
689
690 return std::make_unique<ROOT::Internal::RDF::RCsvDSColumnReader>(val);
691}
#define b(i)
Definition RSha256.hxx:100
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:241
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252
#define N
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Int_t gDebug
Global variable setting the debug level. Set to 0 to disable, increase it in steps of 1 to increase t...
Definition TROOT.cxx:627
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:83
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RCsvDS.cxx:581
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:185
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:570
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:95
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:303
std::uint64_t fDataPos
Definition RCsvDS.hxx:82
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:76
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:335
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:79
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:228
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:93
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition RCsvDS.cxx:663
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:507
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:89
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:87
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:85
ULong64_t fProcessedLines
Definition RCsvDS.hxx:88
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:586
std::int64_t fLineNumber
Definition RCsvDS.hxx:84
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:270
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:90
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:96
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:79
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:94
ROptions fOptions
Definition RCsvDS.hxx:81
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:91
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:79
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:79
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:251
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:79
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:324
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:86
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:79
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:97
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:100
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:92
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
const_iterator begin() const
const_iterator end() const
Regular expression class.
Definition TRegexp.h:31
TLine * line
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:644
Namespace for new ROOT classes and functions.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:48
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:52
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:55
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:57
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:67
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:58
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:70
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
Definition RCsvDS.hxx:56
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:53
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:63
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:54
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:59