Logo ROOT  
Reference Guide
RCsvDS.hxx
Go to the documentation of this file.
1// Author: Enric Tejedor CERN 10/2017
2
3/*************************************************************************
4 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RCSVTDS
12#define ROOT_RCSVTDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16
17#include <cstdint>
18#include <deque>
19#include <list>
20#include <unordered_map>
21#include <set>
22#include <memory>
23#include <vector>
24
25#include <TRegexp.h>
26
27namespace ROOT {
28
29namespace Internal {
30class RRawFile;
31}
32
33namespace RDF {
34
35class RCsvDS final : public ROOT::RDF::RDataSource { // RDataFrame data source class for reading CSV files.
36
37private:
38 // Single-character column type codes: 'D' (double), 'O' (bool), 'L' (Long64_t), 'T' (std::string). A plain char is enough only because these four are the only column types treated.
39 using ColType_t = char;
40 static const std::unordered_map<ColType_t, std::string> fgColTypeMap; // keyed by type code; presumably maps to the type name - confirm in RCsvDS.cxx
41
42 // Regular expressions for type inference (fgIntRegex, fgDoubleRegex1..3, fgTrueRegex, fgFalseRegex)
44
45 std::uint64_t fDataPos = 0;
46 bool fReadHeaders = false;
47 unsigned int fNSlots = 0U;
48 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
49 const char fDelimiter;
52 ULong64_t fProcessedLines = 0ULL; // marks how far the consumption of the csv lines has progressed
53 std::vector<std::string> fHeaders;
54 std::unordered_map<std::string, ColType_t> fColTypes;
55 std::set<std::string> fColContainingEmpty; // columns in which an empty entry was encountered
56 std::list<ColType_t> fColTypesList;
57 std::vector<std::vector<void *>> fColAddresses; // fColAddresses[column][slot]
58 std::vector<Record_t> fRecords; // fRecords[entry][column]
59 std::vector<std::vector<double>> fDoubleEvtValues; // one per column per slot
60 std::vector<std::vector<Long64_t>> fLong64EvtValues; // one per column per slot
61 std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
62 // The inner container must be a deque rather than a vector, to avoid the std::vector<bool>
63 // specialisation: with it, a pointer to an individual boolean cannot be taken.
64 std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
65
66 void FillHeaders(const std::string &);
67 void FillRecord(const std::string &, Record_t &);
68 void GenerateHeaders(size_t);
69 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &); // type-erased vector of pointers to pointers to column values - one per slot
70 void ValidateColTypes(std::vector<std::string> &) const;
71 void InferColTypes(std::vector<std::string> &);
72 void InferType(const std::string &, unsigned int);
73 std::vector<std::string> ParseColumns(const std::string &);
74 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
75 ColType_t GetType(std::string_view colName) const;
76 void FreeRecords();
77
78protected:
79 std::string AsString();
80
81public:
82 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
83 std::unordered_map<std::string, char> &&colTypes = {}); // Constructor to create a CSV RDataSource for RDataFrame.
84 void Finalize() final; // Convenience method called after concluding an event-loop.
85 ~RCsvDS();
86 const std::vector<std::string> &GetColumnNames() const final; // Returns a reference to the collection of the dataset's column names.
87 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final; // Return ranges of entries to distribute to tasks.
88 std::string GetTypeName(std::string_view colName) const final; // Type of a column as a string.
89 bool HasColumn(std::string_view colName) const final; // Checks if the dataset has a certain column.
90 bool SetEntry(unsigned int slot, ULong64_t entry) final; // Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
91 void SetNSlots(unsigned int nSlots) final; // Inform RDataSource of the number of processing slots.
92 std::string GetLabel() final; // Return a string representation of the datasource type.
93};
94
95////////////////////////////////////////////////////////////////////////////////////////////////
96/// \brief Factory method to create a CSV RDataFrame.
97/// \param[in] fileName Path of the CSV file.
98/// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
99/// (default `true`).
100/// \param[in] delimiter Delimiter character (default ',').
101/// \param[in] linesChunkSize Number of lines to read per chunk, use -1 to read all (default -1).
102/// \param[in] colTypes Allows the user to specify custom column types: accepts an unordered map with keys being
103/// column names and values being type aliases ('O' for boolean, 'D' for double, 'L' for
104/// Long64_t, 'T' for std::string). Defaults to an empty map.
105RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
106 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
107
108} // ns RDF
109
110} // ns ROOT
111
112#endif
long long Long64_t
Definition: RtypesCore.h:80
unsigned long long ULong64_t
Definition: RtypesCore.h:81
RDataFrame data source class for reading CSV files.
Definition: RCsvDS.hxx:35
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g. `GetTypeName("x") == "double"`.
Definition: RCsvDS.cxx:495
void FillRecord(const std::string &, Record_t &)
Definition: RCsvDS.cxx:122
void Finalize() final
Convenience method called after concluding an event-loop.
Definition: RCsvDS.cxx:413
ColType_t GetType(std::string_view colName) const
Definition: RCsvDS.cxx:484
std::vector< std::vector< double > > fDoubleEvtValues
Definition: RCsvDS.hxx:59
void InferType(const std::string &, unsigned int)
Definition: RCsvDS.cxx:255
std::uint64_t fDataPos
Definition: RCsvDS.hxx:45
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e. worker threads) used by the associated RDataFrame.
Definition: RCsvDS.cxx:536
static const std::unordered_map< ColType_t, std::string > fgColTypeMap
Definition: RCsvDS.hxx:40
size_t ParseValue(const std::string &, std::vector< std::string > &, size_t)
Definition: RCsvDS.cxx:287
static const TRegexp fgTrueRegex
Definition: RCsvDS.hxx:43
void GenerateHeaders(size_t)
Definition: RCsvDS.cxx:165
std::vector< std::vector< void * > > fColAddresses
Definition: RCsvDS.hxx:57
unsigned int fNSlots
Definition: RCsvDS.hxx:47
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition: RCsvDS.cxx:421
const Long64_t fLinesChunkSize
Definition: RCsvDS.hxx:50
std::vector< std::string > fHeaders
Definition: RCsvDS.hxx:53
ULong64_t fEntryRangesRequested
Definition: RCsvDS.hxx:51
ULong64_t fProcessedLines
Definition: RCsvDS.hxx:52
std::vector< void * > GetColumnReadersImpl(std::string_view, const std::type_info &)
type-erased vector of pointers to pointers to column values - one per slot
Definition: RCsvDS.cxx:173
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition: RCsvDS.cxx:500
void InferColTypes(std::vector< std::string > &)
Definition: RCsvDS.cxx:224
std::unordered_map< std::string, ColType_t > fColTypes
Definition: RCsvDS.hxx:54
std::vector< std::vector< Long64_t > > fLong64EvtValues
Definition: RCsvDS.hxx:60
const char fDelimiter
Definition: RCsvDS.hxx:49
static const TRegexp fgDoubleRegex2
Definition: RCsvDS.hxx:43
std::vector< Record_t > fRecords
Definition: RCsvDS.hxx:58
RCsvDS(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Constructor to create a CSV RDataSource for RDataFrame.
Definition: RCsvDS.cxx:331
std::set< std::string > fColContainingEmpty
Definition: RCsvDS.hxx:55
static const TRegexp fgFalseRegex
Definition: RCsvDS.hxx:43
static const TRegexp fgDoubleRegex3
Definition: RCsvDS.hxx:43
void ValidateColTypes(std::vector< std::string > &) const
Definition: RCsvDS.cxx:205
static const TRegexp fgIntRegex
Definition: RCsvDS.hxx:43
std::vector< std::string > ParseColumns(const std::string &)
Definition: RCsvDS.cxx:276
void FillHeaders(const std::string &)
Definition: RCsvDS.cxx:113
std::unique_ptr< ROOT::Internal::RRawFile > fCsvFile
Definition: RCsvDS.hxx:48
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition: RCsvDS.cxx:426
std::string GetLabel() final
Return a string representation of the datasource type.
Definition: RCsvDS.cxx:553
static const TRegexp fgDoubleRegex1
Definition: RCsvDS.hxx:43
std::string AsString()
Definition: RCsvDS.cxx:97
std::vector< std::vector< std::string > > fStringEvtValues
Definition: RCsvDS.hxx:61
std::vector< std::deque< bool > > fBoolEvtValues
Definition: RCsvDS.hxx:64
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition: RCsvDS.cxx:505
void FreeRecords()
Definition: RCsvDS.cxx:377
std::list< ColType_t > fColTypesList
Definition: RCsvDS.hxx:56
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree, CSV and other data formats.
Definition: RDataFrame.hxx:41
Regular expression class.
Definition: TRegexp.h:31
basic_string_view< char > string_view
RDataFrame MakeCsvDataFrame(std::string_view fileName, bool readHeaders=true, char delimiter=',', Long64_t linesChunkSize=-1LL, std::unordered_map< std::string, char > &&colTypes={})
Factory method to create a CSV RDataFrame.
Definition: RCsvDS.cxx:558
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.