Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVDS
12#define ROOT_RCSVDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <cstdint>
18#include <deque>
19#include <list>
20#include <unordered_map>
21#include <set>
22#include <memory>
23#include <vector>
24
25#include <TRegexp.h>
26
27namespace ROOT::Internal::RDF {
29 void *fValuePtr;
30 void *GetImpl(Long64_t) final { return fValuePtr; }
31
32public:
33 RCsvDSColumnReader(void *valuePtr) : fValuePtr(valuePtr) {}
34};
35} // namespace ROOT::Internal::RDF
36
37namespace ROOT {
38
39namespace Internal {
40class RRawFile;
41}
42
43namespace RDF {
44
46public:
47 /// Options that control how the CSV file is parsed
48 struct ROptions {
49 /// The first line describes the columns. The names are used as RDF column names
50 /// unless fColumnNames is not empty, in which case fColumnNames replaces the names given in the file.
51 /// If fHeaders is false and fColumnNames is empty, generic column names Col1, ..., Col$n$ are used.
52 bool fHeaders = true;
53 char fDelimiter = ','; ///< Column delimiter character
54 bool fLeftTrim = false; ///< If true, leading whitespace is removed
55 bool fRightTrim = false; ///< If true, trailing whitespace is removed
56 bool fSkipBlankLines = true; ///< Ignore empty lines (after trimming, if trimming is enabled)
57 std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
58 std::int64_t fSkipLastNLines = 0; ///< Ignore the last N lines of the file
59 std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
60 /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
61 /// If it is the first character of the line (after trimming), the line is ignored altogether.
62 /// Note that the comment character must not be part of the data, e.g. in strings.
63 char fComment = '\0';
64 /// Impose column names. This can be used if a header is missing or if the header has unparsable or
65 /// unwanted column names. If this list is not empty, it must contain exactly as many elements as
66 /// the number of columns in the CSV file.
67 std::vector<std::string> fColumnNames;
68 /// Specify custom column types: an unordered map whose keys are column names and whose values are type aliases
69 /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
70 std::unordered_map<std::string, char> fColumnTypes;
71 };
72
73private:
74 // Possible values are D, O, L, T. A single character suffices only because the only column types we treat are double, bool, Long64_t and string
75 using ColType_t = char;
76 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
77
78 // Regular expressions for type inference
79 static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
80
82 std::uint64_t fDataPos = 0;
83 std::int64_t fDataLineNumber = 0;
84 std::int64_t fLineNumber = 0; // used to skip the last lines
85 std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
86 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
87 ULong64_t fEntryRangesRequested = 0ULL;
88 ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
89 std::vector<std::string> fHeaders; // the column names
90 std::unordered_map<std::string, ColType_t> fColTypes;
91 std::set<std::string> fColContainingEmpty; // store columns which had empty entry
92 std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
93 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot] (same ordering as fHeaders)
94 std::vector<Record_t> fRecords; // fRecords[entry][column] (same ordering as fHeaders)
95 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
96 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
97 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
98 // This must be a deque to avoid the specialisation vector<bool>. This would not
99 // work given that the pointer to the boolean in that case cannot be taken
100 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
101
102 void Construct();
103
104 bool Readln(std::string &line);
105 void RewindToData();
106 void FillHeaders(const std::string &);
107 void FillRecord(const std::string &, Record_t &);
108 void GenerateHeaders(size_t);
109 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
110 void ValidateColTypes(std::vector<std::string> &) const;
111 void InferColTypes(std::vector<std::string> &);
112 void InferType(const std::string &, unsigned int);
113 std::vector<std::string> ParseColumns(const std::string &);
114 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
115 ColType_t GetType(std::string_view colName) const;
116 void FreeRecords();
117
118protected:
119 std::string AsString() final;
120
121public:
122 RCsvDS(std::string_view fileName, const ROptions &options);
123 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
124 std::unordered_map<std::string, char> &&colTypes = {});
125 // Rule of five
126 RCsvDS(const RCsvDS &) = delete;
127 RCsvDS &operator=(const RCsvDS &) = delete;
128 RCsvDS(RCsvDS &&) = delete;
129 RCsvDS &operator=(RCsvDS &&) = delete;
130 ~RCsvDS() final;
131
132 void Finalize() final;
133 std::size_t GetNFiles() const final { return 1; }
134 const std::vector<std::string> &GetColumnNames() const final;
135 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
136 std::string GetTypeName(std::string_view colName) const final;
137 bool HasColumn(std::string_view colName) const final;
138 bool SetEntry(unsigned int slot, ULong64_t entry) final;
139 void SetNSlots(unsigned int nSlots) final;
140 std::string GetLabel() final;
141
142 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
143 GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final;
144};
145
146////////////////////////////////////////////////////////////////////////////////////////////////
147/// \brief Factory method to create a CSV RDataFrame.
148/// \param[in] fileName Path of the CSV file.
149/// \param[in] options File parsing settings.
150RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
151
152////////////////////////////////////////////////////////////////////////////////////////////////
153/// \brief Factory method to create a CSV RDataFrame.
154/// \param[in] fileName Path of the CSV file.
155/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
156/// (default `true`).
157/// \param[in] delimiter Delimiter character (default ',').
158/// \param[in] linesChunkSize bunch of lines to read, use -1 to read all
159/// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
160/// column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
161/// Long64_t, 'T' for std::string)
162RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
163 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
164
165} // ns RDF
166
167} // ns ROOT
168
169#endif
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
unsigned long long ULong64_t
Portable unsigned long integer 8 bytes.
Definition RtypesCore.h:84
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Pure virtual base class for all column reader types.
void * GetImpl(Long64_t) final
Definition RCsvDS.hxx:30
The RRawFile provides read-only access to local and remote files.
Definition RRawFile.hxx:43
RDataFrame data source class for reading CSV files.
Definition RCsvDS.hxx:45
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:95
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:76
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:93
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:89
RCsvDS & operator=(RCsvDS &&)=delete
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:90
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:96
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:94
RCsvDS(RCsvDS &&)=delete
ROptions fOptions
Definition RCsvDS.hxx:81
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:91
RCsvDS(const RCsvDS &)=delete
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:86
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:79
RCsvDS & operator=(const RCsvDS &)=delete
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:97
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:100
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:92
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
TLine * line
Namespace for new ROOT classes and functions.
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:48
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:67
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:70