Logo ROOT  
Reference Guide
Loading...
Searching...
No Matches
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVDS
12#define ROOT_RCSVDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <cstdint>
18#include <deque>
19#include <list>
20#include <unordered_map>
21#include <set>
22#include <memory>
23#include <vector>
24
25#include <TRegexp.h>
26
27namespace ROOT::Internal::RDF {
28class R__CLING_PTRCHECK(off) RCsvDSColumnReader final : public ROOT::Detail::RDF::RColumnReaderBase { // column reader that hands out one fixed, type-erased value pointer
29 void *fValuePtr; ///< Pointer handed back by GetImpl; stored as-is, this class never frees it
30 void *GetImpl(Long64_t) final { return fValuePtr; } // the entry number is ignored: the same pointer is returned on every call
31
32public:
33 RCsvDSColumnReader(void *valuePtr) : fValuePtr(valuePtr) {} ///< \param valuePtr Pointer to store and later return from GetImpl
34};
35} // namespace ROOT::Internal::RDF
36
37namespace ROOT {
38
39namespace Internal {
40class RRawFile;
41}
42
43namespace RDF {
44
45class RCsvDS final : public ROOT::RDF::RDataSource { // RDataFrame data source for reading CSV files
46public:
47 /// Options that control how the CSV file is parsed
48 struct ROptions {
49 /// The first line describes the columns. The names are used as RDF column names
50 /// unless fColumnNames is not empty, in which case it replaces the given names.
51 /// If fHeaders is false and fColumnNames is empty, generic column names Col1, ..., ColN are used.
52 bool fHeaders = true;
53 char fDelimiter = ','; ///< Column delimiter character
54 bool fLeftTrim = false; ///< Leading whitespaces are removed
55 bool fRightTrim = false; ///< Trailing whitespaces are removed
56 bool fSkipBlankLines = true; ///< Ignore empty lines (after trimming, if trimming is enabled)
57 std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
58 std::int64_t fSkipLastNLines = 0; ///< Ignore the last N lines of the file
59 std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
60 /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
61 /// If it is the first character of the line (after trimming), the line is ignored altogether.
62 /// Note that the comment character must not be part of the data, e.g. in strings.
63 char fComment = '\0';
64 /// Impose column names. This can be used if a header is missing or if the header has unparsable or
65 /// unwanted column names. If this list is not empty, it must contain exactly as many elements as
66 /// the number of columns in the CSV file.
67 std::vector<std::string> fColumnNames;
68 /// Specify custom column types, accepts an unordered map with keys being column name, values being type alias
69 /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
70 std::unordered_map<std::string, char> fColumnTypes;
71 };
72
73private:
74 // Possible values are D, O, L, T (double, bool, Long64_t, std::string). A single char suffices only because these are the only column types we treat.
75 using ColType_t = char;
76 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
77
78 // Regular expressions for type inference
80 // NOTE(review): listing lines 79, 81 and 87 are missing from this rendering; the page index places the static TRegexp members at 79, fOptions at 81 and fEntryRangesRequested at 87
82 std::uint64_t fDataPos = 0; // NOTE(review): presumably the file position where the data rows begin (cf. RewindToData) - confirm in RCsvDS.cxx
83 std::int64_t fDataLineNumber = 0; // NOTE(review): presumably the line number of the first data row - confirm in RCsvDS.cxx
84 std::int64_t fLineNumber = 0; // used to skip the last lines
85 std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
86 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
88 ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
89 std::vector<std::string> fHeaders; // the column names
90 std::unordered_map<std::string, ColType_t> fColTypes;
91 std::set<std::string> fColContainingEmpty; // store columns which had empty entry
92 std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
93 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot] (same ordering as fHeaders)
94 std::vector<Record_t> fRecords; // fRecords[entry][column] (same ordering as fHeaders)
95 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
96 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
97 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
98 // This must be a deque to avoid the specialisation vector<bool>. That specialisation would not
99 // work here because a pointer to an individual boolean cannot be taken from it
100 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
101
102 void Construct();
103
104 bool Readln(std::string &line);
105 void RewindToData();
106 void FillHeaders(const std::string &);
107 void FillRecord(const std::string &, Record_t &);
108 void GenerateHeaders(size_t);
109 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
110 void ValidateColTypes(std::vector<std::string> &) const;
111 void InferColTypes(std::vector<std::string> &);
112 void InferType(const std::string &, unsigned int);
113 std::vector<std::string> ParseColumns(const std::string &);
114 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
115 ColType_t GetType(std::string_view colName) const;
116 void FreeRecords();
117
118protected:
119 std::string AsString() final;
120
121public:
122 RCsvDS(std::string_view fileName, const ROptions &options);
123 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
124 std::unordered_map<std::string, char> &&colTypes = {});
125 // Rule of five: the data source is neither copyable nor movable
126 RCsvDS(const RCsvDS &) = delete;
127 RCsvDS &operator=(const RCsvDS &) = delete;
128 RCsvDS(RCsvDS &&) = delete;
129 RCsvDS &operator=(RCsvDS &&) = delete;
130 ~RCsvDS() final;
131
132 void Finalize() final; ///< Called after concluding an event loop
133 std::size_t GetNFiles() const final { return 1; } ///< Number of files the dataset is constructed from; always 1 here
134 const std::vector<std::string> &GetColumnNames() const final; ///< Names of the dataset's columns
135 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final; ///< Ranges of entries to distribute to tasks
136 std::string GetTypeName(std::string_view colName) const final; ///< Type of column colName as a string
137 bool HasColumn(std::string_view colName) const final; ///< Whether the dataset has a column named colName
138 bool SetEntry(unsigned int slot, ULong64_t entry) final; ///< Move the column readers of the given slot to the given entry
139 void SetNSlots(unsigned int nSlots) final; ///< Inform the data source of the number of processing slots
140 std::string GetLabel() final; ///< String representation of the data source type
141 // NOTE(review): listing line 142 is missing from this rendering; per the page index it holds the return type std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase> of the declaration below
143 GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final;
144};
145
146////////////////////////////////////////////////////////////////////////////////////////////////
147/// \brief Factory method to create a CSV RDataFrame.
148/// \param[in] fileName Path of the CSV file.
149/// \param[in] options File parsing settings (header handling, delimiter, trimming, skipped lines, comment character, column names and types); see RCsvDS::ROptions.
150RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
151
152////////////////////////////////////////////////////////////////////////////////////////////////
153/// \brief Factory method to create a CSV RDataFrame.
154/// \param[in] fileName Path of the CSV file.
155/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
156/// (default `true`).
157/// \param[in] delimiter Delimiter character (default ',').
158/// \param[in] linesChunkSize Number of lines to read, use -1 to read all (default -1).
159/// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
160/// column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
161/// Long64_t, 'T' for std::string)
162RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
163 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
164
165} // ns RDF
166
167} // ns ROOT
168
169#endif
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
Pure virtual base class for all column reader types.
void * GetImpl(Long64_t) final
Definition RCsvDS.hxx:30
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
RDataFrame data source class for reading CSV files.
Definition RCsvDS.hxx:45
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:83
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string.
Definition RCsvDS.cxx:581
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:185
void Finalize() final
Convenience method called after concluding an event-loop.
Definition RCsvDS.cxx:499
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
Definition RCsvDS.hxx:133
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:570
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:95
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:303
std::uint64_t fDataPos
Definition RCsvDS.hxx:82
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots.
Definition RCsvDS.cxx:622
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:76
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:335
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:79
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:228
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:93
std::unique_ptr< ROOT::Detail::RDF::RColumnReaderBase > GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final
If the other GetColumnReaders overload returns an empty vector, this overload will be called instead.
Definition RCsvDS.cxx:663
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:507
std::string AsString() final
Definition RCsvDS.cxx:98
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:89
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:87
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:85
RCsvDS & operator=(RCsvDS &&)=delete
ULong64_t fProcessedLines
Definition RCsvDS.hxx:88
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:586
std::int64_t fLineNumber
Definition RCsvDS.hxx:84
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:270
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:90
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:96
RCsvDS(std::string_view fileName, const ROptions &options)
Constructor to create a CSV RDataSource for RDataFrame.
Definition RCsvDS.cxx:434
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:79
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:94
RCsvDS(RCsvDS &&)=delete
ROptions fOptions
Definition RCsvDS.hxx:81
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:91
~RCsvDS() final
Destructor.
Definition RCsvDS.cxx:494
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:79
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:79
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:251
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:79
RCsvDS(const RCsvDS &)=delete
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:324
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:86
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition RCsvDS.cxx:512
std::string GetLabel() final
Return a string representation of the datasource type.
Definition RCsvDS.cxx:639
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition RCsvDS.cxx:246
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:79
RCsvDS & operator=(const RCsvDS &)=delete
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:97
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:100
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RCsvDS.cxx:591
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:92
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, ...
Regular expression class.
Definition TRegexp.h:31
STL class.
STL class.
STL class.
STL class.
TLine * line
Special implementation of ROOT::RRangeCast for TCollection, including a check that the cast target ty...
Definition TObject.h:395
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:644
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:48
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:52
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:55
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:57
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:67
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:58
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:70
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled).
Definition RCsvDS.hxx:56
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:53
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:63
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:54
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:59