Logo ROOT   6.16/01
Reference Guide
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVTDS
12#define ROOT_RCSVTDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <deque>
18#include <list>
19#include <map>
20#include <vector>
21
22#include <TRegexp.h>
23
24namespace ROOT {
25
26namespace RDF {
27
28class RCsvDS final : public ROOT::RDF::RDataSource { // RDataFrame data source class for reading CSV files
29
30private:
31 // Column type tag. Possible values are d, b, l, s; a single char suffices only because the only column types handled are double, bool, Long64_t and string
32 using ColType_t = char;
33 static const std::map<ColType_t, std::string> fgColTypeMap; // NOTE(review): presumably maps each type tag to its type name (cf. GetTypeName) - confirm in RCsvDS.cxx
34
35 std::streampos fDataPos = 0; // NOTE(review): presumably the stream position at which the data rows start - confirm in RCsvDS.cxx
36 bool fReadHeaders = false; // NOTE(review): presumably mirrors the constructor's readHeaders argument - confirm
37 unsigned int fNSlots = 0U; // NOTE(review): presumably the slot count received through SetNSlots - confirm
38 std::ifstream fStream;
39 const char fDelimiter; // NOTE(review): listing rows 40-41 are missing after this one; the cross-reference index places fLinesChunkSize (const Long64_t) and fEntryRangesRequested (ULong64_t) on them
42 ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
43 std::vector<std::string> fHeaders; // NOTE(review): presumably the column names exposed by GetColumnNames - confirm
44 std::map<std::string, ColType_t> fColTypes; // NOTE(review): presumably keyed by column name (cf. GetType) - confirm
45 std::list<ColType_t> fColTypesList;
46 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot]
47 std::vector<Record_t> fRecords; // fRecords[entry][column]
48 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
49 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
50 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
51 // This must be a deque rather than a vector to avoid the std::vector<bool> specialisation, which would
52 // not work here because a pointer to an individual bool cannot be taken from it
53 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
54 // NOTE(review): listing row 55 is missing below; the cross-reference index places the static TRegexp members intRegex, doubleRegex1, doubleRegex2, trueRegex and falseRegex on it
56
57 void FillHeaders(const std::string &);
58 void FillRecord(const std::string &, Record_t &);
59 void GenerateHeaders(size_t);
60 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &); // type-erased vector of pointers to pointers to column values - one per slot
61 void InferColTypes(std::vector<std::string> &);
62 void InferType(const std::string &, unsigned int);
63 std::vector<std::string> ParseColumns(const std::string &);
64 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
65 ColType_t GetType(std::string_view colName) const; // type tag (ColType_t) for the column named colName
66
67protected:
68 std::string AsString();
69
70public:
71 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL); // constructor to create a CSV RDataSource for RDataFrame
72 void Finalise(); // convenience method called after concluding an event-loop
73 void FreeRecords();
74 ~RCsvDS(); // destructor
75 const std::vector<std::string> &GetColumnNames() const; // returns a reference to the collection of the dataset's column names
76 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges(); // return ranges of entries to distribute to tasks
77 std::string GetTypeName(std::string_view colName) const; // type of a column as a string
78 bool HasColumn(std::string_view colName) const; // checks if the dataset has a certain column
79 bool SetEntry(unsigned int slot, ULong64_t entry); // advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot
80 void SetNSlots(unsigned int nSlots); // inform RDataSource of the number of processing slots
81 std::string GetLabel(); // return a string representation of the datasource type
82};
83
84////////////////////////////////////////////////////////////////////////////////////////////////
85/// \brief Factory method to create a CSV RDataFrame.
86/// \param[in] fileName Path of the CSV file.
87/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
88/// (default `true`).
89/// \param[in] delimiter Delimiter character (default ',').
/// \param[in] linesChunkSize Number of lines per chunk (default `-1`). NOTE(review): `-1` presumably
/// means "read all lines at once" - confirm against RCsvDS.cxx.
90RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
91 Long64_t linesChunkSize = -1LL);
92
93} // ns RDF
94
95} // ns ROOT
96
97#endif
long long Long64_t
Definition: RtypesCore.h:69
unsigned long long ULong64_t
Definition: RtypesCore.h:70
RDataFrame data source class for reading CSV files.
Definition: RCsvDS.hxx:28
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:257
std::map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:44
static TRegexp falseRegex
Definition: RCsvDS.hxx:55
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:112
std::ifstream fStream
Definition: RCsvDS.hxx:38
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:380
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:48
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:195
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:226
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:147
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:46
unsigned int fNSlots
Definition: RCsvDS.hxx:37
std::string GetLabel()
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:449
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:40
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:43
static TRegexp doubleRegex2
Definition: RCsvDS.hxx:55
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:41
const std::vector< std::string > & GetColumnNames() const
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:338
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:42
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:154
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:186
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:49
static TRegexp trueRegex
Definition: RCsvDS.hxx:55
static const std::map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:33
const char fDelimiter
Definition: RCsvDS.hxx:39
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:47
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges()
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:343
bool SetEntry(unsigned int slot, ULong64_t entry)
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:401
bool HasColumn(std::string_view colName) const
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:396
std::streampos fDataPos
Definition: RCsvDS.hxx:35
static TRegexp doubleRegex1
Definition: RCsvDS.hxx:55
void SetNSlots(unsigned int nSlots)
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
Definition: RCsvDS.cxx:432
std::string GetTypeName(std::string_view colName) const
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
Definition: RCsvDS.cxx:391
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:215
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:104
std::string AsString()
Definition: RCsvDS.cxx:89
void Finalise()
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:329
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:50
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:53
static TRegexp intRegex
Definition: RCsvDS.hxx:55
void FreeRecords()
Definition: RCsvDS.cxx:293
~RCsvDS()
Destructor.
Definition: RCsvDS.cxx:324
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:45
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees, CSV files and other data formats.
Definition: RDataFrame.hxx:41
Regular expression class.
Definition: TRegexp.h:31
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL)
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:454
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
basic_string_view< char > string_view
Definition: RStringView.hxx:35