Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVTDS
12#define ROOT_RCSVTDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <cstdint>
18#include <deque>
19#include <list>
20#include <unordered_map>
21#include <set>
22#include <memory>
23#include <vector>
24
25#include <TRegexp.h>
26
27namespace ROOT {
28
29namespace Internal {
30class RRawFile;
31}
32
33namespace RDF {
34
35class RCsvDS final : public ROOT::RDF::RDataSource { // RDataFrame data source class for reading CSV files
36public:
37 /// Options that control how the CSV file is parsed
38 struct ROptions {
39 /// The first line describes the columns. The names are used as RDF column names
40 /// unless fColumnNames is not empty, in which case fColumnNames replaces the names given in the file.
41 /// If fHeaders is false and fColumnNames is empty, generic column names Col1, ..., Col$n$ are used.
42 bool fHeaders = true;
43 char fDelimiter = ','; ///< Column delimiter character
44 bool fLeftTrim = false; ///< Leading whitespaces are removed
45 bool fRightTrim = false; ///< Trailing whitespaces are removed
46 bool fSkipBlankLines = true; ///< Ignore empty lines (after trimming, if trimming is enabled)
47 std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
48 std::int64_t fSkipLastNLines = 0; ///< Ignore the last N lines of the file
49 std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
50 /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
51 /// If it is the first character of the line (after trimming), the line is ignored altogether.
52 /// Note that the comment character must not be part of the data, e.g. in strings.
53 char fComment = '\0';
54 /// Impose column names. This can be used if a header is missing or if the header has unparsable or
55 /// unwanted column names. If this list is not empty, it must contain exactly as many elements as
56 /// the number of columns in the CSV file.
57 std::vector<std::string> fColumnNames;
58 /// Specify custom column types, accepts an unordered map with keys being column name, values being type alias
59 /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
60 std::unordered_map<std::string, char> fColumnTypes;
61 };
62
63private:
64 // Possible values are D, O, L, T. A single char suffices only because we treat just double, bool, Long64_t and string columns
65 using ColType_t = char;
66 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
67
68 // Regular expressions for type inference
70
72 std::uint64_t fDataPos = 0;
73 std::int64_t fDataLineNumber = 0;
74 std::int64_t fLineNumber = 0; // used to skip the last lines
75 std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
76 unsigned int fNSlots = 0U; // number of processing slots (presumably set by SetNSlots; confirm in RCsvDS.cxx)
77 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
79 ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
80 std::vector<std::string> fHeaders; // the column names
81 std::unordered_map<std::string, ColType_t> fColTypes;
82 std::set<std::string> fColContainingEmpty; // store columns which had empty entry
83 std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
84 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot] (same ordering as fHeaders)
85 std::vector<Record_t> fRecords; // fRecords[entry][column] (same ordering as fHeaders)
86 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
87 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
88 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
89 // This must be a deque to avoid the std::vector<bool> specialisation, which would not
90 // work here because a pointer to an individual boolean cannot be taken in that case
91 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
92
93 void Construct();
94
95 bool Readln(std::string &line);
96 void RewindToData();
97 void FillHeaders(const std::string &);
98 void FillRecord(const std::string &, Record_t &);
99 void GenerateHeaders(size_t);
100 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final; // type-erased vector of pointers to pointers to column values, one per slot
101 void ValidateColTypes(std::vector<std::string> &) const;
102 void InferColTypes(std::vector<std::string> &);
103 void InferType(const std::string &, unsigned int);
104 std::vector<std::string> ParseColumns(const std::string &);
105 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
106 ColType_t GetType(std::string_view colName) const;
107 void FreeRecords();
108
109protected:
110 std::string AsString() final;
111
112public:
113 RCsvDS(std::string_view fileName, const ROptions &options);
114 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
115 std::unordered_map<std::string, char> &&colTypes = {});
116 // Rule of five: copy and move operations are deleted, so the data source is neither copyable nor movable
117 RCsvDS(const RCsvDS &) = delete;
118 RCsvDS &operator=(const RCsvDS &) = delete;
119 RCsvDS(RCsvDS &&) = delete;
120 RCsvDS &operator=(RCsvDS &&) = delete;
121 ~RCsvDS() final;
122
123 void Finalize() final; // convenience method called after concluding an event-loop
124 std::size_t GetNFiles() const final { return 1; } // number of files the dataset is constructed from: always one
125 const std::vector<std::string> &GetColumnNames() const final; // reference to the collection of the dataset's column names
126 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final; // ranges of entries to distribute to tasks
127 std::string GetTypeName(std::string_view colName) const final; // type of a column as a string
128 bool HasColumn(std::string_view colName) const final; // checks if the dataset has a certain column
129 bool SetEntry(unsigned int slot, ULong64_t entry) final; // advance the column "cursors" to the selected entry for a particular slot
130 void SetNSlots(unsigned int nSlots) final; // inform the data source of the number of processing slots
131 std::string GetLabel() final; // string representation of the datasource type
132};
133
134////////////////////////////////////////////////////////////////////////////////////////////////
135/// \brief Factory method to create a CSV RDataFrame.
136/// \param[in] fileName Path of the CSV file.
137/// \param[in] options File parsing settings (see RCsvDS::ROptions).
138RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
139
140////////////////////////////////////////////////////////////////////////////////////////////////
141/// \brief Factory method to create a CSV RDataFrame.
142/// \param[in] fileName Path of the CSV file.
143/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
144/// (default `true`).
145/// \param[in] delimiter Delimiter character (default ',').
146/// \param[in] linesChunkSize Number of lines to read, use -1 to read all (default -1).
147/// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
148/// column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
149/// Long64_t, 'T' for std::string)
150RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
151 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
152
153} // ns RDF
154
155} // ns ROOT
156
157#endif
long long Long64_t
Definition RtypesCore.h:69
unsigned long long ULong64_t
Definition RtypesCore.h:70
RDataFrame data source class for reading CSV files.
Definition RCsvDS.hxx:35
std::int64_t fDataLineNumber
Definition RCsvDS.hxx:73
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string.
Definition RCsvDS.cxx:608
void FillRecord(const std::string &, Record_t &)
Definition RCsvDS.cxx:185
void Finalize() final
Convenience method called after concluding an event-loop.
Definition RCsvDS.cxx:526
std::size_t GetNFiles() const final
Returns the number of files from which the dataset is constructed.
Definition RCsvDS.hxx:124
ColType_t GetType(std::string_view colName) const
Definition RCsvDS.cxx:597
std::vector< std::vector< double > > fDoubleEvtValues
Definition RCsvDS.hxx:86
void InferType(const std::string &, unsigned int)
Definition RCsvDS.cxx:330
std::uint64_t fDataPos
Definition RCsvDS.hxx:72
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots.
Definition RCsvDS.cxx:649
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition RCsvDS.hxx:66
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition RCsvDS.cxx:362
static const TRegexp fgTrueRegex
Definition RCsvDS.hxx:69
void GenerateHeaders(size_t)
Definition RCsvDS.cxx:228
std::vector< std::vector< void * > > fColAddresses
Definition RCsvDS.hxx:84
unsigned int fNSlots
Definition RCsvDS.hxx:76
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RCsvDS.cxx:534
std::string AsString() final
Definition RCsvDS.cxx:98
bool Readln(std::string &line)
Definition RCsvDS.cxx:114
std::vector< std::string > fHeaders
Definition RCsvDS.hxx:80
ULong64_t fEntryRangesRequested
Definition RCsvDS.hxx:78
std::int64_t fMaxLineNumber
Definition RCsvDS.hxx:75
RCsvDS & operator=(RCsvDS &&)=delete
ULong64_t fProcessedLines
Definition RCsvDS.hxx:79
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RCsvDS.cxx:613
std::int64_t fLineNumber
Definition RCsvDS.hxx:74
void InferColTypes(std::vector< std::string > &)
Definition RCsvDS.cxx:297
std::unordered_map< std::string, ColType_t > fColTypes
Definition RCsvDS.hxx:81
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition RCsvDS.hxx:87
static const TRegexp fgDoubleRegex2
Definition RCsvDS.hxx:69
std::vector< Record_t > fRecords
Definition RCsvDS.hxx:85
RCsvDS(RCsvDS &&)=delete
ROptions fOptions
Definition RCsvDS.hxx:71
std::set< std::string > fColContainingEmpty
Definition RCsvDS.hxx:82
~RCsvDS() final
Destructor.
Definition RCsvDS.cxx:521
static const TRegexp fgFalseRegex
Definition RCsvDS.hxx:69
static const TRegexp fgDoubleRegex3
Definition RCsvDS.hxx:69
void ValidateColTypes(std::vector< std::string > &) const
Definition RCsvDS.cxx:278
static const TRegexp fgIntRegex
Definition RCsvDS.hxx:69
RCsvDS(const RCsvDS &)=delete
void RewindToData()
Definition RCsvDS.cxx:159
std::vector< std::string > ParseColumns(const std::string &)
Definition RCsvDS.cxx:351
void FillHeaders(const std::string &)
Definition RCsvDS.cxx:165
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition RCsvDS.hxx:77
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition RCsvDS.cxx:539
std::string GetLabel() final
Return a string representation of the datasource type.
Definition RCsvDS.cxx:666
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &) final
type-erased vector of pointers to pointers to column values - one per slot
Definition RCsvDS.cxx:246
static const TRegexp fgDoubleRegex1
Definition RCsvDS.hxx:69
RCsvDS & operator=(const RCsvDS &)=delete
std::vector< std::vector< std::string > > fStringEvtValues
Definition RCsvDS.hxx:88
std::vector< std::deque< bool > > fBoolEvtValues
Definition RCsvDS.hxx:91
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RCsvDS.cxx:618
std::list< ColType_t > fColTypesList
Definition RCsvDS.hxx:83
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
Regular expression class.
Definition TRegexp.h:31
TLine * line
RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options)
Factory method to create a CSV RDataFrame.
Definition RCsvDS.cxx:671
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Options that control how the CSV file is parsed.
Definition RCsvDS.hxx:38
bool fHeaders
The first line describes the columns.
Definition RCsvDS.hxx:42
bool fRightTrim
Trailing whitespaces are removed.
Definition RCsvDS.hxx:45
std::int64_t fSkipFirstNLines
Ignore the first N lines of the file.
Definition RCsvDS.hxx:47
std::vector< std::string > fColumnNames
Impose column names.
Definition RCsvDS.hxx:57
std::int64_t fSkipLastNLines
Ignore the last N lines of the file.
Definition RCsvDS.hxx:48
std::unordered_map< std::string, char > fColumnTypes
Specify custom column types, accepts an unordered map with keys being column name,...
Definition RCsvDS.hxx:60
bool fSkipBlankLines
Ignore empty lines (after trimming, if trimming is enabled)
Definition RCsvDS.hxx:46
char fDelimiter
Column delimiter character.
Definition RCsvDS.hxx:43
char fComment
Character indicating that the remainder of the line should be ignored, if different from '\0'.
Definition RCsvDS.hxx:53
bool fLeftTrim
Leading whitespaces are removed.
Definition RCsvDS.hxx:44
std::int64_t fLinesChunkSize
Number of lines to read, -1 to read all.
Definition RCsvDS.hxx:49