27#ifdef ROOBATCHCOMPUTE_USE_IMT
43#error "RF_ARCH should always be defined"
62 for (std::size_t i = 0; i < vars.size(); i++) {
63 arrays[i]._array = vars[i].data();
64 arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
70 for (std::size_t i = 0; i <
batches.nBatches; i++) {
96#error "It's unexpected that _QUOTEVAL_ is defined at this point!"
98#define _QUOTEVAL_(x) _QUOTE_(x)
101 std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
118 throw std::bad_function_call();
122 throw std::bad_function_call();
127#ifdef ROOBATCHCOMPUTE_USE_IMT
134#ifdef ROOBATCHCOMPUTE_USE_IMT
137 std::size_t nEvents =
output.size();
149 auto task = [&](std::size_t idx) ->
int {
153 std::vector<Batch>
arrays(vars.size());
164 std::size_t events =
batches.nEvents;
176 std::vector<std::size_t> indices(
nThreads);
177 for (
unsigned int i = 1; i <
nThreads; i++) {
213#ifdef ROOBATCHCOMPUTE_USE_IMT
219 std::size_t nEvents =
output.size();
224 std::vector<Batch>
arrays(vars.size());
229 std::size_t events =
batches.nEvents;
245 out.nNonPositiveValues++;
249 if (std::isinf(
prob)) {
250 out.nInfiniteValues++;
253 if (std::isnan(
prob)) {
258 return {std::log(
prob), 0.0};
269 std::span<const double> weights, std::span<const double>
offsetProbas)
277 for (std::size_t i = 0; i < weights.size(); ++i) {
279 if (0. == weights[i])
282 std::pair<double, double>
logOut =
getLog(probas.size() == 1 ? probas[0] : probas[i], out);
295 out.nllSum = nllSum.
Sum();
296 out.nllSumCarry = nllSum.
Carry();
301 out.nllSumCarry = 0.0;
309class ScalarBufferContainer {
311 ScalarBufferContainer() {}
312 ScalarBufferContainer(std::size_t
size)
315 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
318 double const *hostReadPtr()
const {
return &
_val; }
319 double const *deviceReadPtr()
const {
return &
_val; }
321 double *hostWritePtr() {
return &
_val; }
322 double *deviceWritePtr() {
return &
_val; }
324 void assignFromHost(std::span<const double>
input) {
_val =
input[0]; }
325 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
331class CPUBufferContainer {
335 double const *hostReadPtr()
const {
return _vec.data(); }
336 double const *deviceReadPtr()
const
338 throw std::bad_function_call();
342 double *hostWritePtr() {
return _vec.data(); }
343 double *deviceWritePtr()
345 throw std::bad_function_call();
349 void assignFromHost(std::span<const double>
input) {
_vec.assign(
input.begin(),
input.end()); }
350 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
356template <
class Container>
357class BufferImpl :
public AbsBuffer {
359 using Queue = std::queue<std::unique_ptr<Container>>;
361 BufferImpl(std::size_t
size, Queue &queue) :
_queue{queue}
364 _vec = std::make_unique<Container>(
size);
373 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
374 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
376 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
377 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
379 void assignFromHost(std::span<const double>
input)
override {
_vec->assignFromHost(
input); }
380 void assignFromDevice(std::span<const double>
input)
override {
_vec->assignFromDevice(
input); }
385 std::unique_ptr<Container>
_vec;
392struct BufferQueuesMaps {
397class BufferManager :
public AbsBufferManager {
400 BufferManager() :
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
402 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
404 return std::make_unique<ScalarBuffer>(1,
_queuesMaps->scalarBufferQueuesMap[1]);
406 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
410 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t)
override {
throw std::bad_function_call(); }
411 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t, CudaInterface::CudaStream * =
nullptr)
override
413 throw std::bad_function_call();
424 return std::make_unique<BufferManager>();
std::vector< double > _vec
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
std::unique_ptr< BufferQueuesMaps > _queuesMaps
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
This class implements the interface to execute the same task multiple times, sequentially or in parallel...
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
void Add(T x)
Single-element accumulation. Will not vectorise.
const double *__restrict _array
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library.
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a CPU spec...
std::string architectureName() const override
void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
void compute(Config const &, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using optimized functions.
const std::vector< void(*)(Batches &)> _computeFunctions
CudaInterface::CudaEvent * newCudaEvent(bool) const override
void deleteCudaEvent(CudaInterface::CudaEvent *) const override
double reduceSum(Config const &, InputArr input, size_t n) override
void deleteCudaStream(CudaInterface::CudaStream *) const override
std::unique_ptr< AbsBufferManager > createBufferManager() const override
CudaInterface::CudaStream * newCudaStream() const override
bool cudaStreamIsActive(CudaInterface::CudaStream *) const override
Architecture architecture() const override
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
The interface which should be implemented to provide optimised computation functions for implementati...
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
std::vector< void(*)(Batches &)> getFunctions()
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
std::span< double > ArgSpan
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
constexpr std::size_t bufferSize
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
static double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
static float unpackNaN(double val)
If val is NaN and this NaN has been tagged as containing a payload, unpack the float from the mantissa...