Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RArrowDS.cxx
Go to the documentation of this file.
1// Author: Giulio Eulisse CERN 2/2018
2
3/*************************************************************************
4 * Copyright (C) 1995-2018, Rene Brun and Fons Rademakers. *
5 * All rights reserved. *
6 * *
7 * For the licensing terms see $ROOTSYS/LICENSE. *
8 * For the list of contributors see $ROOTSYS/README/CREDITS. *
9 *************************************************************************/
10
11// clang-format off
12/** \class ROOT::RDF::RArrowDS
13 \ingroup dataframe
14 \brief RDataFrame data source class to interface with Apache Arrow.
15
16The RArrowDS implements a proxy RDataSource to be able to use Apache Arrow
17tables with RDataFrame.
18
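19A RDataFrame that adapts an arrow::Table class can be constructed using the factory method
20ROOT::RDF::FromArrow (ROOT::RDF::MakeArrowDataFrame is a deprecated alias), which accepts two parameters:
211. An arrow::Table smart pointer.
2. The names of the columns to use; when the list is empty, all the columns found in the table are used.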
22
23The types of the columns are derived from the types in the associated
24arrow::Schema.
25
26*/
27// clang-format on
28
29#include <ROOT/RDF/Utils.hxx>
30#include <ROOT/TSeq.hxx>
31#include <ROOT/RArrowDS.hxx>
32#include <snprintf.h>
33
34#include <algorithm>
35#include <memory>
36#include <sstream>
37#include <string>
38
39#if defined(__GNUC__)
40#pragma GCC diagnostic push
41#pragma GCC diagnostic ignored "-Wshadow"
42#pragma GCC diagnostic ignored "-Wunused-parameter"
43#endif
44#include <arrow/table.h>
45#include <arrow/stl.h>
46#if defined(__GNUC__)
47#pragma GCC diagnostic pop
48#endif
49
50namespace ROOT {
51namespace Internal {
52namespace RDF {
53
54// This is needed by Arrow 0.12.0 which dropped
55//
56// using ArrowType = ArrowType_;
57//
58// from ARROW_STL_CONVERSION
59template <typename T>
61
62#define ROOT_ARROW_STL_CONVERSION(c_type, ArrowType_) \
63 template <> \
64 struct RootConversionTraits<c_type> { \
65 using ArrowType = ::arrow::ArrowType_; \
66 };
67
68ROOT_ARROW_STL_CONVERSION(bool, BooleanType)
70ROOT_ARROW_STL_CONVERSION(int16_t, Int16Type)
71ROOT_ARROW_STL_CONVERSION(int32_t, Int32Type)
73ROOT_ARROW_STL_CONVERSION(uint8_t, UInt8Type)
74ROOT_ARROW_STL_CONVERSION(uint16_t, UInt16Type)
75ROOT_ARROW_STL_CONVERSION(uint32_t, UInt32Type)
78ROOT_ARROW_STL_CONVERSION(double, DoubleType)
79ROOT_ARROW_STL_CONVERSION(std::string, StringType)
80
81// Per slot visitor of an Array.
82class ArrayPtrVisitor : public ::arrow::ArrayVisitor {
83private:
84 /// The pointer to update.
85 void **fResult;
86 bool fCachedBool{false}; // Booleans need to be unpacked, so we use a cached entry.
87 // FIXME: I should really use a variant here
94 std::string fCachedString;
95 /// The entry in the array which should be looked up.
97
98 template <typename T>
99 void *getTypeErasedPtrFrom(arrow::ListArray const &array, int32_t entry, RVec<T> &cache)
100 {
101 using ArrowType = typename RootConversionTraits<T>::ArrowType;
102 using ArrayType = typename arrow::TypeTraits<ArrowType>::ArrayType;
103 auto values = reinterpret_cast<ArrayType *>(array.values().get());
104 auto offset = array.value_offset(entry);
105 // Here the cast to void* is a workaround while we figure out the
106 // issues we have with long long types, signed and unsigned.
107 RVec<T> tmp(reinterpret_cast<T *>((void *)values->raw_values()) + offset, array.value_length(entry));
108 std::swap(cache, tmp);
109 return (void *)(&cache);
110 }
111
112public:
113 ArrayPtrVisitor(void **result) : fResult{result}, fCurrentEntry{0} {}
114
115 void SetEntry(ULong64_t entry) { fCurrentEntry = entry; }
116
117 /// Point the result at the current entry of a 32-bit signed integer array.
118 virtual arrow::Status Visit(arrow::Int32Array const &array) final
119 {
120 *fResult = (void *)(array.raw_values() + fCurrentEntry);
121 return arrow::Status::OK();
122 }
123
124 virtual arrow::Status Visit(arrow::Int64Array const &array) final
125 {
126 *fResult = (void *)(array.raw_values() + fCurrentEntry);
127 return arrow::Status::OK();
128 }
129
130 /// Point the result at the current entry of a 32-bit unsigned integer array.
131 virtual arrow::Status Visit(arrow::UInt32Array const &array) final
132 {
133 *fResult = (void *)(array.raw_values() + fCurrentEntry);
134 return arrow::Status::OK();
135 }
136
137 virtual arrow::Status Visit(arrow::UInt64Array const &array) final
138 {
139 *fResult = (void *)(array.raw_values() + fCurrentEntry);
140 return arrow::Status::OK();
141 }
142
143 virtual arrow::Status Visit(arrow::FloatArray const &array) final
144 {
145 *fResult = (void *)(array.raw_values() + fCurrentEntry);
146 return arrow::Status::OK();
147 }
148
149 virtual arrow::Status Visit(arrow::DoubleArray const &array) final
150 {
151 *fResult = (void *)(array.raw_values() + fCurrentEntry);
152 return arrow::Status::OK();
153 }
154
155 virtual arrow::Status Visit(arrow::BooleanArray const &array) final
156 {
157 fCachedBool = array.Value(fCurrentEntry);
158 *fResult = reinterpret_cast<void *>(&fCachedBool);
159 return arrow::Status::OK();
160 }
161
162 virtual arrow::Status Visit(arrow::StringArray const &array) final
163 {
164 fCachedString = array.GetString(fCurrentEntry);
165 *fResult = reinterpret_cast<void *>(&fCachedString);
166 return arrow::Status::OK();
167 }
168
169 virtual arrow::Status Visit(arrow::ListArray const &array) final
170 {
171 switch (array.value_type()->id()) {
172 case arrow::Type::FLOAT: {
173 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecFloat);
174 return arrow::Status::OK();
175 }
176 case arrow::Type::DOUBLE: {
177 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecDouble);
178 return arrow::Status::OK();
179 }
180 case arrow::Type::UINT32: {
181 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecUInt);
182 return arrow::Status::OK();
183 }
184 case arrow::Type::UINT64: {
185 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecULong64);
186 return arrow::Status::OK();
187 }
188 case arrow::Type::INT32: {
189 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecInt);
190 return arrow::Status::OK();
191 }
192 case arrow::Type::INT64: {
193 *fResult = getTypeErasedPtrFrom(array, fCurrentEntry, fCachedRVecLong64);
194 return arrow::Status::OK();
195 }
196 default: return arrow::Status::TypeError("Type not supported");
197 }
198 }
199
200 using ::arrow::ArrayVisitor::Visit;
201};
202
203/// Helper class which keeps track for each slot where to get the entry.
205private:
206 std::vector<void *> fValuesPtrPerSlot;
207 std::vector<ULong64_t> fLastEntryPerSlot;
208 std::vector<ULong64_t> fLastChunkPerSlot;
209 std::vector<ULong64_t> fFirstEntryPerChunk;
210 std::vector<ArrayPtrVisitor> fArrayVisitorPerSlot;
211 /// Since data can be chunked in different arrays we need to construct an
212 /// index which contains the first element of each chunk, so that we can
213 /// quickly move to the correct chunk.
214 std::vector<ULong64_t> fChunkIndex;
215 arrow::ArrayVector fChunks;
216
217public:
218 TValueGetter(size_t slots, arrow::ArrayVector chunks)
219 : fValuesPtrPerSlot(slots, nullptr), fLastEntryPerSlot(slots, 0), fLastChunkPerSlot(slots, 0), fChunks{chunks}
220 {
221 fChunkIndex.reserve(fChunks.size());
222 size_t next = 0;
223 for (auto &chunk : chunks) {
224 fFirstEntryPerChunk.push_back(next);
225 next += chunk->length();
226 fChunkIndex.push_back(next);
227 }
228 for (size_t si = 0, se = fValuesPtrPerSlot.size(); si != se; ++si) {
230 }
231 }
232
233 /// This returns the ptr to the ptr to actual data.
234 std::vector<void *> SlotPtrs()
235 {
236 std::vector<void *> result;
237 for (size_t i = 0; i < fValuesPtrPerSlot.size(); ++i) {
238 result.push_back(fValuesPtrPerSlot.data() + i);
239 }
240 return result;
241 }
242
243 // Convenience method to avoid code duplication between
244 // SetEntry and InitSlot
245 void UncachedSlotLookup(unsigned int slot, ULong64_t entry)
246 {
247 // If entry is greater than the previous one,
248 // we can skip all the chunks before the last one we
249 // queried.
250 size_t ci = 0;
251 assert(slot < fLastChunkPerSlot.size());
252 if (fLastEntryPerSlot[slot] < entry) {
253 ci = fLastChunkPerSlot.at(slot);
254 }
255
256 for (size_t ce = fChunkIndex.size(); ci != ce; ++ci) {
257 if (entry < fChunkIndex[ci]) {
258 assert(slot < fLastChunkPerSlot.size());
259 fLastChunkPerSlot[slot] = ci;
260 break;
261 }
262 }
263
264 // Update the pointer to the requested entry.
265 // Notice that we need to find the entry
266 auto chunk = fChunks.at(fLastChunkPerSlot[slot]);
267 assert(slot < fArrayVisitorPerSlot.size());
268 fArrayVisitorPerSlot[slot].SetEntry(entry - fFirstEntryPerChunk[fLastChunkPerSlot[slot]]);
269 fLastEntryPerSlot[slot] = entry;
270 auto status = chunk->Accept(fArrayVisitorPerSlot.data() + slot);
271 if (!status.ok()) {
272 std::string msg = "Could not get pointer for slot ";
273 msg += std::to_string(slot) + " looking at entry " + std::to_string(entry);
274 throw std::runtime_error(msg);
275 }
276 }
277
278 /// Set the current entry to be retrieved
279 void SetEntry(unsigned int slot, ULong64_t entry)
280 {
281 // Same entry as before
282 if (fLastEntryPerSlot[slot] == entry) {
283 return;
284 }
285 UncachedSlotLookup(slot, entry);
286 }
287};
288
289} // namespace RDF
290} // namespace Internal
291
292namespace RDF {
293
294/// Helper to get the contents of a given column
295
296/// Helper to get the human readable name of type
297class RDFTypeNameGetter : public ::arrow::TypeVisitor {
298private:
299 std::vector<std::string> fTypeName;
300
301public:
302 arrow::Status Visit(const arrow::Int64Type &) override
303 {
304 fTypeName.push_back("Long64_t");
305 return arrow::Status::OK();
306 }
307 arrow::Status Visit(const arrow::Int32Type &) override
308 {
309 fTypeName.push_back("Int_t");
310 return arrow::Status::OK();
311 }
312 arrow::Status Visit(const arrow::UInt64Type &) override
313 {
314 fTypeName.push_back("ULong64_t");
315 return arrow::Status::OK();
316 }
317 arrow::Status Visit(const arrow::UInt32Type &) override
318 {
319 fTypeName.push_back("UInt_t");
320 return arrow::Status::OK();
321 }
322 arrow::Status Visit(const arrow::FloatType &) override
323 {
324 fTypeName.push_back("float");
325 return arrow::Status::OK();
326 }
327 arrow::Status Visit(const arrow::DoubleType &) override
328 {
329 fTypeName.push_back("double");
330 return arrow::Status::OK();
331 }
332 arrow::Status Visit(const arrow::StringType &) override
333 {
334 fTypeName.push_back("string");
335 return arrow::Status::OK();
336 }
337 arrow::Status Visit(const arrow::BooleanType &) override
338 {
339 fTypeName.push_back("bool");
340 return arrow::Status::OK();
341 }
342 arrow::Status Visit(const arrow::ListType &l) override
343 {
344 /// Recursively visit List types and map them to
345 /// an RVec. We accumulate the result of the recursion on
346 /// fTypeName so that we can create the actual type
347 /// when the recursion is done.
348 fTypeName.push_back("ROOT::VecOps::RVec<%s>");
349 return l.value_type()->Accept(this);
350 }
351 std::string result()
352 {
353 // This recursively builds a nested type.
354 std::string result = "%s";
355 char buffer[8192];
356 for (size_t i = 0; i < fTypeName.size(); ++i) {
357 snprintf(buffer, 8192, result.c_str(), fTypeName[i].c_str());
358 result = buffer;
359 }
360 return result;
361 }
362
363 using ::arrow::TypeVisitor::Visit;
364};
365
366/// Helper to determine if a given Column is a supported type.
367class VerifyValidColumnType : public ::arrow::TypeVisitor {
368private:
369public:
370 virtual arrow::Status Visit(const arrow::Int64Type &) override { return arrow::Status::OK(); }
371 virtual arrow::Status Visit(const arrow::UInt64Type &) override { return arrow::Status::OK(); }
372 virtual arrow::Status Visit(const arrow::Int32Type &) override { return arrow::Status::OK(); }
373 virtual arrow::Status Visit(const arrow::UInt32Type &) override { return arrow::Status::OK(); }
374 virtual arrow::Status Visit(const arrow::FloatType &) override { return arrow::Status::OK(); }
375 virtual arrow::Status Visit(const arrow::DoubleType &) override { return arrow::Status::OK(); }
376 virtual arrow::Status Visit(const arrow::StringType &) override { return arrow::Status::OK(); }
377 virtual arrow::Status Visit(const arrow::BooleanType &) override { return arrow::Status::OK(); }
378 virtual arrow::Status Visit(const arrow::ListType &) override { return arrow::Status::OK(); }
379
380 using ::arrow::TypeVisitor::Visit;
381};
382
383////////////////////////////////////////////////////////////////////////
384/// Constructor to create an Arrow RDataSource for RDataFrame.
385/// \param[in] inTable the arrow Table to observe.
386/// \param[in] inColumns the name of the columns to use
387/// In case columns is empty, we use all the columns found in the table
388RArrowDS::RArrowDS(std::shared_ptr<arrow::Table> inTable, std::vector<std::string> const &inColumns)
389 : fTable{inTable}, fColumnNames{inColumns}
390{
391 auto &columnNames = fColumnNames;
392 auto &table = fTable;
393 auto &index = fGetterIndex;
394 // We want to allow people to specify which columns they
395 // need so that we can think of upfront IO optimizations.
396 auto filterWantedColumns = [&columnNames, &table]() {
397 if (columnNames.empty()) {
398 for (auto &field : table->schema()->fields()) {
399 columnNames.push_back(field->name());
400 }
401 }
402 };
403
404 // To support both arrow 0.14.0 and 0.16.0
405 using ColumnType = decltype(fTable->column(0));
406
407 auto getRecordsFirstColumn = [&columnNames, &table]() {
408 if (columnNames.empty()) {
409 throw std::runtime_error("At least one column required");
410 }
411 const auto name = columnNames.front();
412 const auto columnIdx = table->schema()->GetFieldIndex(name);
413 return table->column(columnIdx)->length();
414 };
415
416 // All columns are supposed to have the same number of entries.
417 auto verifyColumnSize = [&table](ColumnType column, int columnIdx, int nRecords) {
418 if (column->length() != nRecords) {
419 std::string msg = "Column ";
420 msg += table->schema()->field(columnIdx)->name() + " has a different number of entries.";
421 throw std::runtime_error(msg);
422 }
423 };
424
425 /// For the moment we support only a few native types.
426 auto verifyColumnType = [&table](ColumnType column, int columnIdx) {
427 auto verifyType = std::make_unique<VerifyValidColumnType>();
428 auto result = column->type()->Accept(verifyType.get());
429 if (result.ok() == false) {
430 std::string msg = "Column ";
431 msg += table->schema()->field(columnIdx)->name() + " contains an unsupported type.";
432 throw std::runtime_error(msg);
433 }
434 };
435
436 /// This is used to create an index between the columnId
437 /// and the associated getter.
438 auto addColumnToGetterIndex = [&index](int columnId) { index.push_back(std::make_pair(columnId, index.size())); };
439
440 /// Assuming we can get called more than once, we need to
441 /// reset the getter index each time.
442 auto resetGetterIndex = [&index]() { index.clear(); };
443
444 /// This is what initialization actually does
445 filterWantedColumns();
446 resetGetterIndex();
447 auto nRecords = getRecordsFirstColumn();
448 for (auto &columnName : fColumnNames) {
449 auto columnIdx = fTable->schema()->GetFieldIndex(columnName);
450 addColumnToGetterIndex(columnIdx);
451
452 auto column = fTable->column(columnIdx);
453 verifyColumnSize(column, columnIdx, nRecords);
454 verifyColumnType(column, columnIdx);
455 }
456}
457
458////////////////////////////////////////////////////////////////////////
459/// Destructor.
461{
462}
463
464const std::vector<std::string> &RArrowDS::GetColumnNames() const
465{
466 return fColumnNames;
467}
468
469std::vector<std::pair<ULong64_t, ULong64_t>> RArrowDS::GetEntryRanges()
470{
471 auto entryRanges(std::move(fEntryRanges)); // empty fEntryRanges
472 return entryRanges;
473}
474
475std::string RArrowDS::GetTypeName(std::string_view colName) const
476{
477 auto field = fTable->schema()->GetFieldByName(std::string(colName));
478 if (!field) {
479 std::string msg = "The dataset does not have column ";
480 msg += colName;
481 throw std::runtime_error(msg);
482 }
483 RDFTypeNameGetter typeGetter;
484 auto status = field->type()->Accept(&typeGetter);
485 if (status.ok() == false) {
486 std::string msg = "RArrowDS does not support a column of type ";
487 msg += field->type()->name();
488 throw std::runtime_error(msg);
489 }
490 return typeGetter.result();
491}
492
493bool RArrowDS::HasColumn(std::string_view colName) const
494{
495 auto field = fTable->schema()->GetFieldByName(std::string(colName));
496 if (!field) {
497 return false;
498 }
499 return true;
500}
501
502bool RArrowDS::SetEntry(unsigned int slot, ULong64_t entry)
503{
504 for (auto link : fGetterIndex) {
505 auto &getter = fValueGetters[link.second];
506 getter->SetEntry(slot, entry);
507 }
508 return true;
509}
510
511void RArrowDS::InitSlot(unsigned int slot, ULong64_t entry)
512{
513 for (auto link : fGetterIndex) {
514 auto &getter = fValueGetters[link.second];
515 getter->UncachedSlotLookup(slot, entry);
516 }
517}
518
519void splitInEqualRanges(std::vector<std::pair<ULong64_t, ULong64_t>> &ranges, int nRecords, unsigned int nSlots)
520{
521 ranges.clear();
522 const auto chunkSize = nRecords / nSlots;
523 const auto remainder = 1U == nSlots ? 0 : nRecords % nSlots;
524 auto start = 0UL;
525 auto end = 0UL;
526 for (auto i : ROOT::TSeqU(nSlots)) {
527 start = end;
528 end += chunkSize;
529 ranges.emplace_back(start, end);
530 (void)i;
531 }
532 ranges.back().second += remainder;
533}
534
535int getNRecords(std::shared_ptr<arrow::Table> &table, std::vector<std::string> &columnNames)
536{
537 auto index = table->schema()->GetFieldIndex(columnNames.front());
538 return table->column(index)->length();
539};
540
541template <typename T>
542std::shared_ptr<arrow::ChunkedArray> getData(T p)
543{
544 return p->data();
545}
546
547template <>
548std::shared_ptr<arrow::ChunkedArray>
549getData<std::shared_ptr<arrow::ChunkedArray>>(std::shared_ptr<arrow::ChunkedArray> p)
550{
551 return p;
552}
553
554void RArrowDS::SetNSlots(unsigned int nSlots)
555{
556 assert(0U == fNSlots && "Setting the number of slots even if the number of slots is different from zero.");
557 fNSlots = nSlots;
558 // We dump all the previous getters structures and we rebuild it.
559 auto nColumns = fGetterIndex.size();
560
561 fValueGetters.clear();
562 for (size_t ci = 0; ci != nColumns; ++ci) {
563 auto chunkedArray = getData(fTable->column(fGetterIndex[ci].first));
564 fValueGetters.emplace_back(std::make_unique<ROOT::Internal::RDF::TValueGetter>(nSlots, chunkedArray->chunks()));
565 }
566}
567
568/// This needs to return a pointer to the pointer each value getter
569/// will point to.
570std::vector<void *> RArrowDS::GetColumnReadersImpl(std::string_view colName, const std::type_info &)
571{
572 auto &index = fGetterIndex;
573 auto findGetterIndex = [&index](unsigned int column) {
574 for (auto &entry : index) {
575 if (entry.first == column) {
576 return entry.second;
577 }
578 }
579 throw std::runtime_error("No column found at index " + std::to_string(column));
580 };
581
582 const int columnIdx = fTable->schema()->GetFieldIndex(std::string(colName));
583 const int getterIdx = findGetterIndex(columnIdx);
584 assert(getterIdx != -1);
585 assert((unsigned int)getterIdx < fValueGetters.size());
586 return fValueGetters[getterIdx]->SlotPtrs();
587}
588
590{
591 auto nRecords = getNRecords(fTable, fColumnNames);
593}
594
596{
597 return "ArrowDS";
598}
599
600/// \brief Factory method to create a Apache Arrow RDataFrame.
601///
602/// Creates a RDataFrame using an arrow::Table as input.
603/// \param[in] table an apache::arrow table to use as a source / to observe.
604/// \param[in] columnNames the name of the columns to use
605/// In case columnNames is empty, we use all the columns found in the table
606RDataFrame FromArrow(std::shared_ptr<arrow::Table> table, std::vector<std::string> const &columnNames)
607{
608 ROOT::RDataFrame tdf(std::make_unique<RArrowDS>(table, columnNames));
609 return tdf;
610}
611
612/// \brief Factory method to create a Apache Arrow RDataFrame.
613///
614/// Deprecated in favor of FromArrow().
615RDataFrame MakeArrowDataFrame(std::shared_ptr<arrow::Table> table, std::vector<std::string> const &columnNames)
616{
617 return FromArrow(table, columnNames);
618}
619
620} // namespace RDF
621
622} // namespace ROOT
#define ROOT_ARROW_STL_CONVERSION(c_type, ArrowType_)
Definition RArrowDS.cxx:62
long long Long64_t
Definition RtypesCore.h:80
unsigned long long ULong64_t
Definition RtypesCore.h:81
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h offset
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
char name[80]
Definition TGX11.cxx:110
#define snprintf
Definition civetweb.c:1540
ULong64_t fCurrentEntry
The entry in the array which should be looked up.
Definition RArrowDS.cxx:96
void ** fResult
The pointer to update.
Definition RArrowDS.cxx:85
virtual arrow::Status Visit(arrow::StringArray const &array) final
Definition RArrowDS.cxx:162
virtual arrow::Status Visit(arrow::ListArray const &array) final
Definition RArrowDS.cxx:169
virtual arrow::Status Visit(arrow::Int64Array const &array) final
Definition RArrowDS.cxx:124
virtual arrow::Status Visit(arrow::DoubleArray const &array) final
Definition RArrowDS.cxx:149
virtual arrow::Status Visit(arrow::Int32Array const &array) final
Check if we are asking the same entry as before.
Definition RArrowDS.cxx:118
virtual arrow::Status Visit(arrow::BooleanArray const &array) final
Definition RArrowDS.cxx:155
virtual arrow::Status Visit(arrow::UInt32Array const &array) final
Check if we are asking the same entry as before.
Definition RArrowDS.cxx:131
virtual arrow::Status Visit(arrow::FloatArray const &array) final
Definition RArrowDS.cxx:143
void * getTypeErasedPtrFrom(arrow::ListArray const &array, int32_t entry, RVec< T > &cache)
Definition RArrowDS.cxx:99
virtual arrow::Status Visit(arrow::UInt64Array const &array) final
Definition RArrowDS.cxx:137
Helper class which keeps track for each slot where to get the entry.
Definition RArrowDS.cxx:204
std::vector< ULong64_t > fLastChunkPerSlot
Definition RArrowDS.cxx:208
std::vector< ArrayPtrVisitor > fArrayVisitorPerSlot
Definition RArrowDS.cxx:210
std::vector< ULong64_t > fFirstEntryPerChunk
Definition RArrowDS.cxx:209
std::vector< ULong64_t > fLastEntryPerSlot
Definition RArrowDS.cxx:207
std::vector< void * > SlotPtrs()
This returns the ptr to the ptr to actual data.
Definition RArrowDS.cxx:234
TValueGetter(size_t slots, arrow::ArrayVector chunks)
Definition RArrowDS.cxx:218
void SetEntry(unsigned int slot, ULong64_t entry)
Set the current entry to be retrieved.
Definition RArrowDS.cxx:279
std::vector< ULong64_t > fChunkIndex
Since data can be chunked in different arrays we need to construct an index which contains the first ...
Definition RArrowDS.cxx:214
void UncachedSlotLookup(unsigned int slot, ULong64_t entry)
Definition RArrowDS.cxx:245
std::vector< void * > fValuesPtrPerSlot
Definition RArrowDS.cxx:206
std::vector< std::pair< ULong64_t, ULong64_t > > GetEntryRanges() final
Return ranges of entries to distribute to tasks.
Definition RArrowDS.cxx:469
RArrowDS(std::shared_ptr< arrow::Table > table, std::vector< std::string > const &columns)
Constructor to create an Arrow RDataSource for RDataFrame.
Definition RArrowDS.cxx:388
void SetNSlots(unsigned int nSlots) final
Inform RDataSource of the number of processing slots (i.e.
Definition RArrowDS.cxx:554
~RArrowDS()
Destructor.
Definition RArrowDS.cxx:460
bool HasColumn(std::string_view colName) const final
Checks if the dataset has a certain column.
Definition RArrowDS.cxx:493
void InitSlot(unsigned int slot, ULong64_t firstEntry) final
Convenience method called at the start of the data processing associated to a slot.
Definition RArrowDS.cxx:511
std::string GetLabel() final
Return a string representation of the datasource type.
Definition RArrowDS.cxx:595
void Initialize() final
Convenience method called before starting an event-loop.
Definition RArrowDS.cxx:589
std::shared_ptr< arrow::Table > fTable
Definition RArrowDS.hxx:32
std::vector< std::pair< size_t, size_t > > fGetterIndex
Definition RArrowDS.hxx:37
std::string GetTypeName(std::string_view colName) const final
Type of a column as a string, e.g.
Definition RArrowDS.cxx:475
std::vector< void * > GetColumnReadersImpl(std::string_view name, const std::type_info &type) final
This needs to return a pointer to the pointer each value getter will point to.
Definition RArrowDS.cxx:570
std::vector< std::unique_ptr< ROOT::Internal::RDF::TValueGetter > > fValueGetters
Definition RArrowDS.hxx:38
bool SetEntry(unsigned int slot, ULong64_t entry) final
Advance the "cursors" returned by GetColumnReaders to the selected entry for a particular slot.
Definition RArrowDS.cxx:502
std::vector< std::string > fColumnNames
Definition RArrowDS.hxx:34
std::vector< std::pair< ULong64_t, ULong64_t > > fEntryRanges
Definition RArrowDS.hxx:33
const std::vector< std::string > & GetColumnNames() const final
Returns a reference to the collection of the dataset's column names.
Definition RArrowDS.cxx:464
Helper to get the contents of a given column.
Definition RArrowDS.cxx:297
arrow::Status Visit(const arrow::DoubleType &) override
Definition RArrowDS.cxx:327
arrow::Status Visit(const arrow::BooleanType &) override
Definition RArrowDS.cxx:337
arrow::Status Visit(const arrow::UInt32Type &) override
Definition RArrowDS.cxx:317
arrow::Status Visit(const arrow::ListType &l) override
Definition RArrowDS.cxx:342
arrow::Status Visit(const arrow::FloatType &) override
Definition RArrowDS.cxx:322
arrow::Status Visit(const arrow::UInt64Type &) override
Definition RArrowDS.cxx:312
arrow::Status Visit(const arrow::StringType &) override
Definition RArrowDS.cxx:332
arrow::Status Visit(const arrow::Int64Type &) override
Definition RArrowDS.cxx:302
std::vector< std::string > fTypeName
Definition RArrowDS.cxx:299
arrow::Status Visit(const arrow::Int32Type &) override
Definition RArrowDS.cxx:307
Helper to determine if a given Column is a supported type.
Definition RArrowDS.cxx:367
virtual arrow::Status Visit(const arrow::BooleanType &) override
Definition RArrowDS.cxx:377
virtual arrow::Status Visit(const arrow::StringType &) override
Definition RArrowDS.cxx:376
virtual arrow::Status Visit(const arrow::Int32Type &) override
Definition RArrowDS.cxx:372
virtual arrow::Status Visit(const arrow::DoubleType &) override
Definition RArrowDS.cxx:375
virtual arrow::Status Visit(const arrow::Int64Type &) override
Definition RArrowDS.cxx:370
virtual arrow::Status Visit(const arrow::UInt32Type &) override
Definition RArrowDS.cxx:373
virtual arrow::Status Visit(const arrow::ListType &) override
Definition RArrowDS.cxx:378
virtual arrow::Status Visit(const arrow::FloatType &) override
Definition RArrowDS.cxx:374
virtual arrow::Status Visit(const arrow::UInt64Type &) override
Definition RArrowDS.cxx:371
ROOT's RDataFrame offers a modern, high-level interface for analysis of data stored in TTree ,...
A "std::vector"-like collection of values implementing handy operation to analyse them.
Definition RVec.hxx:1480
struct void * fTypeName
Definition cppyy.h:9
void splitInEqualRanges(std::vector< std::pair< ULong64_t, ULong64_t > > &ranges, int nRecords, unsigned int nSlots)
Definition RArrowDS.cxx:519
int getNRecords(std::shared_ptr< arrow::Table > &table, std::vector< std::string > &columnNames)
Definition RArrowDS.cxx:535
RDataFrame MakeArrowDataFrame(std::shared_ptr< arrow::Table > table, std::vector< std::string > const &columnNames)
Factory method to create a Apache Arrow RDataFrame.
Definition RArrowDS.cxx:615
std::shared_ptr< arrow::ChunkedArray > getData(T p)
Definition RArrowDS.cxx:542
RDataFrame FromArrow(std::shared_ptr< arrow::Table > table, std::vector< std::string > const &columnNames)
Factory method to create a Apache Arrow RDataFrame.
Definition RArrowDS.cxx:606
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
TLine l
Definition textangle.C:4