27#ifdef ROOBATCHCOMPUTE_USE_IMT
43#error "RF_ARCH should always be defined"
62 for (std::size_t i = 0; i < vars.size(); i++) {
63 arrays[i]._array = vars[i].data();
64 arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
70 for (std::size_t i = 0; i <
batches.nBatches; i++) {
96 std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
113 throw std::bad_function_call();
117 throw std::bad_function_call();
122#ifdef ROOBATCHCOMPUTE_USE_IMT
129#ifdef ROOBATCHCOMPUTE_USE_IMT
132 std::size_t nEvents =
output.size();
144 auto task = [&](std::size_t idx) ->
int {
148 std::vector<Batch>
arrays(vars.size());
159 std::size_t events =
batches.nEvents;
171 std::vector<std::size_t> indices(
nThreads);
172 for (
unsigned int i = 1; i <
nThreads; i++) {
208#ifdef ROOBATCHCOMPUTE_USE_IMT
214 std::size_t nEvents =
output.size();
219 std::vector<Batch>
arrays(vars.size());
224 std::size_t events =
batches.nEvents;
240 out.nNonPositiveValues++;
244 if (std::isinf(
prob)) {
245 out.nInfiniteValues++;
248 if (std::isnan(
prob)) {
253 return {std::log(
prob), 0.0};
264 std::span<const double> weights, std::span<const double>
offsetProbas)
272 for (std::size_t i = 0; i < weights.size(); ++i) {
274 if (0. == weights[i])
277 std::pair<double, double>
logOut =
getLog(probas.size() == 1 ? probas[0] : probas[i], out);
290 out.nllSum = nllSum.
Sum();
291 out.nllSumCarry = nllSum.
Carry();
296 out.nllSumCarry = 0.0;
304class ScalarBufferContainer {
306 ScalarBufferContainer() {}
307 ScalarBufferContainer(std::size_t
size)
310 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
313 double const *hostReadPtr()
const {
return &
_val; }
314 double const *deviceReadPtr()
const {
return &
_val; }
316 double *hostWritePtr() {
return &
_val; }
317 double *deviceWritePtr() {
return &
_val; }
319 void assignFromHost(std::span<const double>
input) {
_val =
input[0]; }
320 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
326class CPUBufferContainer {
330 double const *hostReadPtr()
const {
return _vec.data(); }
331 double const *deviceReadPtr()
const
333 throw std::bad_function_call();
337 double *hostWritePtr() {
return _vec.data(); }
338 double *deviceWritePtr()
340 throw std::bad_function_call();
344 void assignFromHost(std::span<const double>
input) {
_vec.assign(
input.begin(),
input.end()); }
345 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
351template <
class Container>
352class BufferImpl :
public AbsBuffer {
354 using Queue = std::queue<std::unique_ptr<Container>>;
356 BufferImpl(std::size_t
size, Queue &queue) :
_queue{queue}
359 _vec = std::make_unique<Container>(
size);
368 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
369 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
371 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
372 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
374 void assignFromHost(std::span<const double>
input)
override {
_vec->assignFromHost(
input); }
375 void assignFromDevice(std::span<const double>
input)
override {
_vec->assignFromDevice(
input); }
380 std::unique_ptr<Container>
_vec;
387struct BufferQueuesMaps {
392class BufferManager :
public AbsBufferManager {
395 BufferManager() :
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
397 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
399 return std::make_unique<ScalarBuffer>(1,
_queuesMaps->scalarBufferQueuesMap[1]);
401 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
405 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t)
override {
throw std::bad_function_call(); }
406 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t, CudaInterface::CudaStream * =
nullptr)
override
408 throw std::bad_function_call();
419 return std::make_unique<BufferManager>();
#define _R_QUOTEVAL_(string)
std::vector< double > _vec
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
std::unique_ptr< BufferQueuesMaps > _queuesMaps
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
This class implements the interface to execute the same task multiple times, sequentially or in paral...
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
void Add(T x)
Single-element accumulation. Will not vectorise.
const double *__restrict _array
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a CPU spec...
std::string architectureName() const override
void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
void compute(Config const &, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using optimized functions.
const std::vector< void(*)(Batches &)> _computeFunctions
CudaInterface::CudaEvent * newCudaEvent(bool) const override
void deleteCudaEvent(CudaInterface::CudaEvent *) const override
double reduceSum(Config const &, InputArr input, size_t n) override
void deleteCudaStream(CudaInterface::CudaStream *) const override
std::unique_ptr< AbsBufferManager > createBufferManager() const override
CudaInterface::CudaStream * newCudaStream() const override
bool cudaStreamIsActive(CudaInterface::CudaStream *) const override
Architecture architecture() const override
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
The interface which should be implemented to provide optimised computation functions for implementati...
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
std::vector< void(*)(Batches &)> getFunctions()
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
std::span< double > ArgSpan
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
constexpr std::size_t bufferSize
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
static double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
static float unpackNaN(double val)
If val is NaN and a this NaN has been tagged as containing a payload, unpack the float from the manti...