Logo ROOT   6.18/05
Reference Guide
RCsvDS.cxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RCsvDS
13 \ingroup dataframe
14 \brief RDataFrame data source class for reading CSV files.
15
16The RCsvDS class implements a CSV file reader for RDataFrame.
17
An RDataFrame that reads from a CSV file can be constructed using the factory method
ROOT::RDF::MakeCsvDataFrame, which accepts four parameters:
1. Path to the CSV file.
2. Boolean that specifies whether the first row of the CSV file contains headers or
not (optional, default `true`). If `false`, header names will be automatically generated as Col0, Col1, ..., ColN.
3. Delimiter (optional, default ',').
4. Chunk size (optional, default `-1`): maximum number of lines read into memory at a time. With `-1` the whole file is read in one go.
24
25The types of the columns in the CSV file are automatically inferred. The supported
26types are:
27- Integer: stored as a 64-bit long long int.
28- Floating point number: stored with double precision.
29- Boolean: matches the literals `true` and `false`.
30- String: stored as an std::string, matches anything that does not fall into any of the
31previous types.
32
33These are some formatting rules expected by the RCsvDS implementation:
34- All records must have the same number of fields, in the same order.
35- Any field may be quoted.
36~~~
37 "1997","Ford","E350"
38~~~
39- Fields with embedded delimiters (e.g. comma) must be quoted.
40~~~
41 1997,Ford,E350,"Super, luxurious truck"
42~~~
43- Fields with double-quote characters must be quoted, and each of the embedded
44double-quote characters must be represented by a pair of double-quote characters.
45~~~
46 1997,Ford,E350,"Super, ""luxurious"" truck"
47~~~
48- Fields with embedded line breaks are not supported, even when quoted.
49~~~
50 1997,Ford,E350,"Go get one now
51 they are going fast"
52~~~
53- Spaces are considered part of a field and are not ignored.
54~~~
55 1997, Ford , E350
56 not same as
57 1997,Ford,E350
58 but same as
59 1997, "Ford" , E350
60~~~
61- If a header row is provided, it must contain column names for each of the fields.
62~~~
63 Year,Make,Model
64 1997,Ford,E350
65 2000,Mercury,Cougar
66~~~
67
By default, the current implementation of RCsvDS reads the entire CSV file content into memory before
RDataFrame starts processing it. Therefore, before creating a CSV RDataFrame, it is
important to check both how much memory is available and the size of the CSV file.
If the file is too large, a chunk size can be passed so that only that many lines are held in memory at a time.
71*/
72// clang-format on
73
74#include <ROOT/RDF/Utils.hxx>
75#include <ROOT/TSeq.hxx>
76#include <ROOT/RCsvDS.hxx>
77#include <ROOT/RMakeUnique.hxx>
78#include <TError.h>
79
80#include <algorithm>
81#include <iostream>
82#include <sstream>
83#include <string>
84
85namespace ROOT {
86
87namespace RDF {
88
89std::string RCsvDS::AsString()
90{
91 return "CSV data source";
92}
93
94// Regular expressions for type inference
95TRegexp RCsvDS::intRegex("^[-+]?[0-9]+$");
96TRegexp RCsvDS::doubleRegex1("^[-+]?[0-9]+\\.[0-9]*$");
97TRegexp RCsvDS::doubleRegex2("^[-+]?[0-9]*\\.[0-9]+$");
100
101const std::map<RCsvDS::ColType_t, std::string>
102 RCsvDS::fgColTypeMap({{'b', "bool"}, {'d', "double"}, {'l', "Long64_t"}, {'s', "std::string"}});
103
104void RCsvDS::FillHeaders(const std::string &line)
105{
106 auto columns = ParseColumns(line);
107 for (auto &col : columns) {
108 fHeaders.emplace_back(col);
109 }
110}
111
112void RCsvDS::FillRecord(const std::string &line, Record_t &record)
113{
114 std::istringstream lineStream(line);
115 auto i = 0U;
116
117 auto columns = ParseColumns(line);
118
119 for (auto &col : columns) {
120 auto colType = fColTypes[fHeaders[i]];
121
122 switch (colType) {
123 case 'd': {
124 record.emplace_back(new double(std::stod(col)));
125 break;
126 }
127 case 'l': {
128 record.emplace_back(new Long64_t(std::stoll(col)));
129 break;
130 }
131 case 'b': {
132 auto b = new bool();
133 record.emplace_back(b);
134 std::istringstream is(col);
135 is >> std::boolalpha >> *b;
136 break;
137 }
138 case 's': {
139 record.emplace_back(new std::string(col));
140 break;
141 }
142 }
143 ++i;
144 }
145}
146
147void RCsvDS::GenerateHeaders(size_t size)
148{
149 for (size_t i = 0; i < size; ++i) {
150 fHeaders.push_back("Col" + std::to_string(i));
151 }
152}
153
154std::vector<void *> RCsvDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &ti)
155{
156 const auto colType = GetType(colName);
157
158 if ((colType == 'd' && typeid(double) != ti) || (colType == 'l' && typeid(Long64_t) != ti) ||
159 (colType == 's' && typeid(std::string) != ti) || (colType == 'b' && typeid(bool) != ti)) {
160 std::string err = "The type selected for column \"";
161 err += colName;
162 err += "\" does not correspond to column type, which is ";
163 err += fgColTypeMap.at(colType);
164 throw std::runtime_error(err);
165 }
166
167 const auto &colNames = GetColumnNames();
168 const auto index = std::distance(colNames.begin(), std::find(colNames.begin(), colNames.end(), colName));
169 std::vector<void *> ret(fNSlots);
170 for (auto slot : ROOT::TSeqU(fNSlots)) {
171 auto &val = fColAddresses[index][slot];
172 if (ti == typeid(double)) {
173 val = &fDoubleEvtValues[index][slot];
174 } else if (ti == typeid(Long64_t)) {
175 val = &fLong64EvtValues[index][slot];
176 } else if (ti == typeid(std::string)) {
177 val = &fStringEvtValues[index][slot];
178 } else {
179 val = &fBoolEvtValues[index][slot];
180 }
181 ret[slot] = &val;
182 }
183 return ret;
184}
185
186void RCsvDS::InferColTypes(std::vector<std::string> &columns)
187{
188 auto i = 0U;
189 for (auto &col : columns) {
190 InferType(col, i);
191 ++i;
192 }
193}
194
195void RCsvDS::InferType(const std::string &col, unsigned int idxCol)
196{
198 int dummy;
199
200 if (intRegex.Index(col, &dummy) != -1) {
201 type = 'l'; // Long64_t
202 } else if (doubleRegex1.Index(col, &dummy) != -1 || doubleRegex2.Index(col, &dummy) != -1) {
203 type = 'd'; // double
204 } else if (trueRegex.Index(col, &dummy) != -1 || falseRegex.Index(col, &dummy) != -1) {
205 type = 'b'; // bool
206 } else { // everything else is a string
207 type = 's'; // std::string
208 }
209 // TODO: Date
210
211 fColTypes[fHeaders[idxCol]] = type;
212 fColTypesList.push_back(type);
213}
214
215std::vector<std::string> RCsvDS::ParseColumns(const std::string &line)
216{
217 std::vector<std::string> columns;
218
219 for (size_t i = 0; i < line.size(); ++i) {
220 i = ParseValue(line, columns, i);
221 }
222
223 return columns;
224}
225
226size_t RCsvDS::ParseValue(const std::string &line, std::vector<std::string> &columns, size_t i)
227{
228 std::stringstream val;
229 bool quoted = false;
230
231 for (; i < line.size(); ++i) {
232 if (line[i] == fDelimiter && !quoted) {
233 break;
234 } else if (line[i] == '"') {
235 // Keep just one quote for escaped quotes, none for the normal quotes
236 if (line[i + 1] != '"') {
237 quoted = !quoted;
238 } else {
239 val << line[++i];
240 }
241 } else {
242 val << line[i];
243 }
244 }
245
246 columns.emplace_back(val.str());
247
248 return i;
249}
250
251////////////////////////////////////////////////////////////////////////
252/// Constructor to create a CSV RDataSource for RDataFrame.
253/// \param[in] fileName Path of the CSV file.
254/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
255/// (default `true`).
256/// \param[in] delimiter Delimiter character (default ',').
257RCsvDS::RCsvDS(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize) // TODO: Let users specify types?
258 : fReadHeaders(readHeaders),
259 fStream(std::string(fileName)),
260 fDelimiter(delimiter),
261 fLinesChunkSize(linesChunkSize)
262{
263 std::string line;
264
265 // Read the headers if present
266 if (fReadHeaders) {
267 if (std::getline(fStream, line)) {
269 } else {
270 std::string msg = "Error reading headers of CSV file ";
271 msg += fileName;
272 throw std::runtime_error(msg);
273 }
274 }
275
276 fDataPos = fStream.tellg();
277 if (std::getline(fStream, line)) {
278 auto columns = ParseColumns(line);
279
280 // Generate headers if not present
281 if (!fReadHeaders) {
282 GenerateHeaders(columns.size());
283 }
284
285 // Infer types of columns with first record
286 InferColTypes(columns);
287
288 // rewind one line
289 fStream.seekg(fDataPos);
290 }
291}
292
294{
295 for (auto &record : fRecords) {
296 for (size_t i = 0; i < record.size(); ++i) {
297 void *p = record[i];
298 const auto colType = fColTypes[fHeaders[i]];
299 switch (colType) {
300 case 'd': {
301 delete static_cast<double *>(p);
302 break;
303 }
304 case 'l': {
305 delete static_cast<Long64_t *>(p);
306 break;
307 }
308 case 'b': {
309 delete static_cast<bool *>(p);
310 break;
311 }
312 case 's': {
313 delete static_cast<std::string *>(p);
314 break;
315 }
316 }
317 }
318 }
319 fRecords.clear();
320}
321
322////////////////////////////////////////////////////////////////////////
323/// Destructor.
325{
326 FreeRecords();
327}
328
330{
331 fStream.clear();
332 fStream.seekg(fDataPos);
333 fProcessedLines = 0ULL;
335 FreeRecords();
336}
337
338const std::vector<std::string> &RCsvDS::GetColumnNames() const
339{
340 return fHeaders;
341}
342
343std::vector<std::pair<ULong64_t, ULong64_t>> RCsvDS::GetEntryRanges()
344{
345
346 // Read records and store them in memory
347 auto linesToRead = fLinesChunkSize;
348 FreeRecords();
349
350 std::string line;
351 while ((-1LL == fLinesChunkSize || 0 != linesToRead--) && std::getline(fStream, line)) {
352 fRecords.emplace_back();
353 FillRecord(line, fRecords.back());
354 }
355
356 if (gDebug > 0) {
357 if (fLinesChunkSize == -1LL) {
358 Info("GetEntryRanges", "Attempted to read entire CSV file into memory, %lu lines read", fRecords.size());
359 } else {
360 Info("GetEntryRanges", "Attempted to read chunk of %lld lines of CSV file into memory, %lu lines read", fLinesChunkSize, fRecords.size());
361 }
362 }
363
364 std::vector<std::pair<ULong64_t, ULong64_t>> entryRanges;
365 const auto nRecords = fRecords.size();
366 if (0 == nRecords)
367 return entryRanges;
368
369 const auto chunkSize = nRecords / fNSlots;
370 const auto remainder = 1U == fNSlots ? 0 : nRecords % fNSlots;
371 auto start = 0ULL == fEntryRangesRequested ? 0ULL : fProcessedLines;
372 auto end = start;
373
374 for (auto i : ROOT::TSeqU(fNSlots)) {
375 start = end;
376 end += chunkSize;
377 entryRanges.emplace_back(start, end);
378 (void)i;
379 }
380 entryRanges.back().second += remainder;
381
382 fProcessedLines += nRecords;
384
385 return entryRanges;
386}
387
389{
390 if (!HasColumn(colName)) {
391 std::string msg = "The dataset does not have column ";
392 msg += colName;
393 throw std::runtime_error(msg);
394 }
395
396 return fColTypes.at(colName.data());
397}
398
399std::string RCsvDS::GetTypeName(std::string_view colName) const
400{
401 return fgColTypeMap.at(GetType(colName));
402}
403
405{
406 return fHeaders.end() != std::find(fHeaders.begin(), fHeaders.end(), colName);
407}
408
409bool RCsvDS::SetEntry(unsigned int slot, ULong64_t entry)
410{
411 // Here we need to normalise the entry to the number of lines we already processed.
412 const auto offset = (fEntryRangesRequested - 1) * fLinesChunkSize;
413 const auto recordPos = entry - offset;
414 int colIndex = 0;
415 for (auto &colType : fColTypesList) {
416 auto dataPtr = fRecords[recordPos][colIndex];
417 switch (colType) {
418 case 'd': {
419 fDoubleEvtValues[colIndex][slot] = *static_cast<double *>(dataPtr);
420 break;
421 }
422 case 'l': {
423 fLong64EvtValues[colIndex][slot] = *static_cast<Long64_t *>(dataPtr);
424 break;
425 }
426 case 'b': {
427 fBoolEvtValues[colIndex][slot] = *static_cast<bool *>(dataPtr);
428 break;
429 }
430 case 's': {
431 fStringEvtValues[colIndex][slot] = *static_cast<std::string *>(dataPtr);
432 break;
433 }
434 }
435 colIndex++;
436 }
437 return true;
438}
439
440void RCsvDS::SetNSlots(unsigned int nSlots)
441{
442 R__ASSERT(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
443
444 fNSlots = nSlots;
445
446 const auto nColumns = fHeaders.size();
447 // Initialise the entire set of addresses
448 fColAddresses.resize(nColumns, std::vector<void *>(fNSlots, nullptr));
449
450 // Initialize the per event data holders
451 fDoubleEvtValues.resize(nColumns, std::vector<double>(fNSlots));
452 fLong64EvtValues.resize(nColumns, std::vector<Long64_t>(fNSlots));
453 fStringEvtValues.resize(nColumns, std::vector<std::string>(fNSlots));
454 fBoolEvtValues.resize(nColumns, std::deque<bool>(fNSlots));
455}
456
457std::string RCsvDS::GetLabel()
458{
459 return "RCsv";
460}
461
462RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders, char delimiter, Long64_t linesChunkSize)
463{
464 ROOT::RDataFrame tdf(std::make_unique<RCsvDS>(fileName, readHeaders, delimiter, linesChunkSize));
465 return tdf;
466}
467
468} // ns RDF
469
470} // ns ROOT
#define b(i)
Definition: RSha256.hxx:100
static RooMathCoreReg dummy
long long Long64_t
Definition: RtypesCore.h:69
unsigned long long ULong64_t
Definition: RtypesCore.h:70
R__EXTERN Int_t gDebug
Definition: Rtypes.h:91
#define R__ASSERT(e)
Definition: TError.h:96
void Info(const char *location, const char *msgfmt,...)
int type
Definition: TGX11.cxx:120
typedef void((*Func_t)())
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:257
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:44
static TRegexp falseRegex
Definition: RCsvDS.hxx:55
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:112
std::ifstream fStream
Definition: RCsvDS.hxx:38
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:388
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:48
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:195
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:226
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:147
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:46
unsigned int fNSlots
Definition: RCsvDS.hxx:37
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:457
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:40
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:43
static TRegexp doubleRegex2
Definition: RCsvDS.hxx:55
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:41
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:338
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:42
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:154
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:186
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:49
static TRegexp trueRegex
Definition: RCsvDS.hxx:55
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:33
const char fDelimiter
Definition: RCsvDS.hxx:39
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:47
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:343
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:409
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:404
std::streampos fDataPos
Definition: RCsvDS.hxx:35
static TRegexp doubleRegex1
Definition: RCsvDS.hxx:55
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e.
Definition: RCsvDS.cxx:440
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g.
Definition: RCsvDS.cxx:399
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:215
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:104
std::string AsString()
Definition: RCsvDS.cxx:89
void Finalise()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:329
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:50
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:53
static TRegexp intRegex
Definition: RCsvDS.hxx:55
void FreeRecords()
Definition: RCsvDS.cxx:293
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:324
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:45
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:42
A pseudo container class which is a generator of indices.
Definition: TSeq.hxx:66
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
TLine * line
basic_string_view< char > string_view
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:462
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21