Logo ROOT   6.16/01
Reference Guide
RSqliteDS.hxx
Go to the documentation of this file.
1// Author: Jakob Blomer CERN 07/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2017, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11#ifndef ROOT_RSQLITEDS
12#define ROOT_RSQLITEDS
13
14#include "ROOT/RDataFrame.hxx"
15#include "ROOT/RDataSource.hxx"
16#include "ROOT/RStringView.hxx"
17
18#include <map>
19#include <memory>
20#include <mutex>
21#include <string>
22#include <vector>
23
24namespace ROOT {
25
26namespace RDF {
27
28namespace Internal {
29// Members are defined in RSqliteDS.cxx in order to not pollute this header file with sqlite3.h
30struct RSqliteDSDataSet;
31}
32
33// clang-format off
34/**
35\class ROOT::RDF::RSqliteDS
36\ingroup dataframe
37\brief RSqliteDS is an RDF data source implementation for SQL result sets from sqlite3 files.
38
39The RSqliteDS is able to feed an RDataFrame with data from a SQlite SELECT query. One can use it like
40
41 auto rdf = ROOT::RDF::MakeSqliteDataFrame("/path/to/file.sqlite", "select name from table");
42 auto h = rdf.Define("lName", "name.length()").Histo1D("lName");
43
44The data source has to provide column types for all the columns. Determining column types in SQlite is tricky
45as it is dynamically typed and in principle each row can have different column types. The following heuristics
46are used:
47
48 - If a table column is queried as is ("SELECT colname FROM table"), the default/declared column type is taken.
49 - For expressions ("SELECT 1+1 FROM table"), the type of the first row of the result set determines the column type.
50 That can result in a column being thought to be of type NULL even though subsequent rows actually have meaningful values.
51 The provided SELECT query can be used to avoid such ambiguities.
52*/
53class RSqliteDS final : public ROOT::RDF::RDataSource {
54private:
55 // clang-format off
56 /// All the types known to SQlite. Changes require changing fgTypeNames, too.
// NOTE(review): fgTypeNames below lists five names while only four enumerators are visible here;
// confirm against the original header that no enumerator is missing from this listing.
57 enum class ETypes {
59 kReal,
60 kText,
61 kBlob,
62 kNull
63 };
64 // clang-format on
65
66 /// Used to hold a single "cell" of the SELECT query's result table. Can be changed to std::variant once available.
67 struct Value_t {
68 explicit Value_t(ETypes type);
69
71 bool fIsActive; ///< Not all columns of the query are necessarily used by the RDF. Allows for skipping them.
// Typed storage for the cell; fPtr (below) points to one of these members.
// NOTE(review): which member belongs to which ETypes value is assumed from the names -- confirm in RSqliteDS.cxx.
73 double fReal;
74 std::string fText;
75 std::vector<unsigned char> fBlob;
76 void *fNull;
77 void *fPtr; ///< Points to one of the values; an address to this pointer is returned by GetColumnReadersImpl.
78 };
79
80 void SqliteError(int errcode); ///< Helper function to throw an exception if there is a fatal sqlite error, e.g. an I/O error.
81
82 std::unique_ptr<Internal::RSqliteDSDataSet> fDataSet; ///< Opaque data set; its members are defined in RSqliteDS.cxx.
83 unsigned int fNSlots; ///< Presumably the slot count handed to SetNSlots -- TODO confirm.
85 std::vector<std::string> fColumnNames; ///< Presumably the names returned by GetColumnNames -- TODO confirm.
86 std::vector<ETypes> fColumnTypes; ///< Presumably one type per entry of fColumnNames -- TODO confirm.
87 /// The data source is inherently single-threaded and returns only one row at a time. This vector holds the results.
88 std::vector<Value_t> fValues;
89
90 // clang-format off
91 /// Corresponds to the types defined in ETypes.
92 static constexpr char const *fgTypeNames[] = {
93 "Long64_t",
94 "double",
95 "std::string",
96 "std::vector<unsigned char>",
97 "void *"
98 };
99 // clang-format on
100
101public:
102 RSqliteDS(const std::string &fileName, const std::string &query); ///< Build the dataframe.
103 ~RSqliteDS(); ///< Frees the sqlite resources and closes the file.
104 void SetNSlots(unsigned int nSlots) final; ///< Almost a no-op, many slots can in fact reduce the performance due to thread synchronization.
105 const std::vector<std::string> &GetColumnNames() const final; ///< Returns the column names of the SELECT query.
106 bool HasColumn(std::string_view colName) const final; ///< A linear search through the columns for the given name.
107 std::string GetTypeName(std::string_view colName) const final; ///< Returns the C++ type for a given column name, implemented as a linear search through all the columns.
108 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final; ///< Returns a range of size 1 as long as more rows are available in the SQL result set.
109 bool SetEntry(unsigned int slot, ULong64_t entry) final; ///< Stores the result of the current active sqlite query row as a C++ value.
110 void Initialise() final; ///< Resets the SQlite query engine at the beginning of the event loop.
111 std::string GetLabel() final; ///< Return a string representation of the datasource type.
112
113protected:
114 Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final; ///< Activates the given column's result value.
115};
116
118
119} // namespace RDF
120
121} // namespace ROOT
122
123#endif
long long Long64_t
Definition: RtypesCore.h:69
unsigned long long ULong64_t
Definition: RtypesCore.h:70
int type
Definition: TGX11.cxx:120
RDataSource defines an API that RDataFrame can use to read arbitrary data formats.
std::vector< void * > Record_t
RSqliteDS is an RDF data source implementation for SQL result sets from sqlite3 files.
Definition: RSqliteDS.hxx:53
void SetNSlots(unsigned int nSlots) final
Almost a no-op, many slots can in fact reduce the performance due to thread synchronization.
Definition: RSqliteDS.cxx:618
static constexpr char const * fgTypeNames[]
Corresponds to the types defined in ETypes.
Definition: RSqliteDS.hxx:92
std::string GetLabel() final
Return a string representation of the datasource type.
Definition: RSqliteDS.cxx:565
void Initialise() final
Resets the SQlite query engine at the beginning of the event loop.
Definition: RSqliteDS.cxx:557
unsigned int fNSlots
Definition: RSqliteDS.hxx:83
std::vector< std::string > fColumnNames
Definition: RSqliteDS.hxx:85
~RSqliteDS()
Frees the sqlite resources and closes the file.
Definition: RSqliteDS.cxx:474
bool HasColumn(std::string_view colName) const final
A linear search through the columns for the given name.
Definition: RSqliteDS.cxx:550
std::vector< ETypes > fColumnTypes
Definition: RSqliteDS.hxx:86
std::string GetTypeName(std::string_view colName) const final
Returns the C++ type for a given column name, implemented as a linear search through all the columns.
Definition: RSqliteDS.cxx:536
ETypes
All the types known to SQlite. Changes require changing fgTypeNames, too.
Definition: RSqliteDS.hxx:57
Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final
Activates the given column's result value.
Definition: RSqliteDS.cxx:494
RSqliteDS(const std::string &fileName, const std::string &query)
Build the dataframe.
Definition: RSqliteDS.cxx:392
std::unique_ptr< Internal::RSqliteDSDataSet > fDataSet
Definition: RSqliteDS.hxx:82
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Returns a range of size 1 as long as more rows are available in the SQL result set.
Definition: RSqliteDS.cxx:517
const std::vector< std::string > & GetColumnNames() const final
Returns the column names of the SELECT query.
Definition: RSqliteDS.cxx:487
bool SetEntry(unsigned int slot, ULong64_t entry) final
Stores the result of the current active sqlite query row as a C++ value.
Definition: RSqliteDS.cxx:582
void SqliteError(int errcode)
Helper function to throw an exception if there is a fatal sqlite error, e.g. an I/O error.
Definition: RSqliteDS.cxx:629
std::vector< Value_t > fValues
The data source is inherently single-threaded and returns only one row at a time. This vector holds the results.
Definition: RSqliteDS.hxx:88
ROOT's RDataFrame offers a high level interface for analyses of data stored in TTrees,...
Definition: RDataFrame.hxx:41
RDataFrame MakeSqliteDataFrame(std::string_view fileName, std::string_view query)
Factory method to create a SQlite RDataFrame.
Definition: RSqliteDS.cxx:574
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
STL namespace.
Used to hold a single "cell" of the SELECT query's result table. Can be changed to std::variant once available.
Definition: RSqliteDS.hxx:67
void * fPtr
Points to one of the values; an address to this pointer is returned by GetColumnReadersImpl.
Definition: RSqliteDS.hxx:77
std::vector< unsigned char > fBlob
Definition: RSqliteDS.hxx:75
bool fIsActive
Not all columns of the query are necessarily used by the RDF. Allows for skipping them.
Definition: RSqliteDS.hxx:71