28#ifdef ROOBATCHCOMPUTE_USE_IMT
44#error "RF_ARCH should always be defined"
63 for (std::size_t i = 0; i < vars.size(); i++) {
64 arrays[i]._array = vars[i].data();
65 arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
71 for (std::size_t i = 0; i <
batches.nBatches; i++) {
73 arg._array += arg._isVector * nEvents;
84class RooBatchComputeClass :
public RooBatchComputeInterface {
97#error "It's unexpected that _QUOTEVAL_ is defined at this point!"
// Indirection macro: passes its argument on to _QUOTE_. Because x is not an operand of # or ## here,
// it is fully macro-expanded before _QUOTE_ sees it.
// NOTE(review): _QUOTE_ is defined outside this excerpt - presumably the stringizing macro; confirm.
99#define _QUOTEVAL_(x) _QUOTE_(x)
102 std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
119 throw std::bad_function_call();
123 throw std::bad_function_call();
128#ifdef ROOBATCHCOMPUTE_USE_IMT
132 const std::vector<void (*)(
Batches &)> _computeFunctions;
135#ifdef ROOBATCHCOMPUTE_USE_IMT
138 std::size_t nEvents =
output.size();
150 auto task = [&](std::size_t idx) ->
int {
154 std::vector<Batch>
arrays(vars.size());
165 std::size_t events =
batches.nEvents;
167 while (events > bufferSize) {
170 events -= bufferSize;
177 std::vector<std::size_t> indices(
nThreads);
178 for (
unsigned int i = 1; i <
nThreads; i++) {
193void RooBatchComputeClass::compute(Config
const &, Computer
computer, std::span<double>
output, VarSpan vars,
214#ifdef ROOBATCHCOMPUTE_USE_IMT
220 std::size_t nEvents =
output.size();
225 std::vector<Batch>
arrays(vars.size());
230 std::size_t events =
batches.nEvents;
232 while (events > bufferSize) {
243inline std::pair<double, double>
getLog(
double prob, ReduceNLLOutput &out)
246 out.nNonPositiveValues++;
250 if (std::isinf(
prob)) {
251 out.nInfiniteValues++;
254 if (std::isnan(
prob)) {
259 return {std::log(
prob), 0.0};
264double RooBatchComputeClass::reduceSum(Config
const &, InputArr
input,
size_t n)
269ReduceNLLOutput RooBatchComputeClass::reduceNLL(Config
const &, std::span<const double> probas,
270 std::span<const double> weights, std::span<const double>
offsetProbas)
278 for (std::size_t i = 0; i <
probas.size(); ++i) {
280 const double eventWeight = weights.size() > 1 ? weights[i] : weights[0];
282 if (0. == eventWeight)
285 std::pair<double, double>
logOut =
getLog(probas[i], out);
293 term *= -eventWeight;
304 out.nllSumCarry = 0.0;
312class ScalarBufferContainer {
// Default constructor with an empty body; it does not assign _val itself.
314 ScalarBufferContainer() {}
315 ScalarBufferContainer(std::size_t
size)
318 throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
// Returns a pointer-to-const to the member _val, i.e. to the one value this container holds.
321 double const *hostReadPtr()
const {
return &
_val; }
// Returns the address of _val as pointer-to-const - the same address hostReadPtr() returns,
// so this class makes no distinction between the host and device read pointers.
322 double const *deviceReadPtr()
const {
return &
_val; }
// Returns a mutable pointer to _val; writing through it changes the stored value directly.
324 double *hostWritePtr() {
return &
_val; }
// Returns a mutable pointer to _val - the same address hostWritePtr() returns.
325 double *deviceWritePtr() {
return &
_val; }
// Stores the first element of `input` in _val; any further elements are ignored.
// No size check is done here, so `input` must contain at least one element.
327 void assignFromHost(std::span<const double>
input) {
_val =
input[0]; }
// Not supported by this container: always throws std::bad_function_call.
// The span parameter is unnamed and never read.
328 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
334class CPUBufferContainer {
// Returns a pointer-to-const to the start of the storage managed by _vec (_vec.data()).
338 double const *hostReadPtr()
const {
return _vec.data(); }
339 double const *deviceReadPtr()
const
341 throw std::bad_function_call();
// Returns a mutable pointer to the start of the storage managed by _vec (_vec.data()).
345 double *hostWritePtr() {
return _vec.data(); }
346 double *deviceWritePtr()
348 throw std::bad_function_call();
// Replaces the contents of _vec with a copy of the elements in `input`
// (range assign from input.begin() to input.end()).
352 void assignFromHost(std::span<const double>
input) {
_vec.assign(
input.begin(),
input.end()); }
// Not supported by this container: always throws std::bad_function_call.
// The span parameter is unnamed and never read.
353 void assignFromDevice(std::span<const double>) {
throw std::bad_function_call(); }
359template <
class Container>
360class BufferImpl :
public AbsBuffer {
362 using Queue = std::queue<std::unique_ptr<Container>>;
364 BufferImpl(std::size_t
size, Queue &queue) :
_queue{queue}
367 _vec = std::make_unique<Container>(
size);
// AbsBuffer override: forwards to hostReadPtr() of the Container owned through _vec.
376 double const *hostReadPtr()
const override {
return _vec->hostReadPtr(); }
// AbsBuffer override: forwards to deviceReadPtr() of the Container owned through _vec,
// so whether this returns a pointer or throws depends on the Container type.
377 double const *deviceReadPtr()
const override {
return _vec->deviceReadPtr(); }
// AbsBuffer override: forwards to hostWritePtr() of the Container owned through _vec.
379 double *hostWritePtr()
override {
return _vec->hostWritePtr(); }
// AbsBuffer override: forwards to deviceWritePtr() of the Container owned through _vec,
// so whether this returns a pointer or throws depends on the Container type.
380 double *deviceWritePtr()
override {
return _vec->deviceWritePtr(); }
// AbsBuffer override: hands `input` unchanged to assignFromHost() of the Container owned through _vec.
382 void assignFromHost(std::span<const double>
input)
override {
_vec->assignFromHost(
input); }
// AbsBuffer override: hands `input` unchanged to assignFromDevice() of the Container owned through _vec.
// Both container classes visible in this file throw std::bad_function_call from that call.
383 void assignFromDevice(std::span<const double>
input)
override {
_vec->assignFromDevice(
input); }
388 std::unique_ptr<Container>
_vec;
395struct BufferQueuesMaps {
400class BufferManager :
public AbsBufferManager {
// Default constructor: initializes _queuesMaps with a newly allocated, default-constructed BufferQueuesMaps.
403 BufferManager() :
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
405 std::unique_ptr<AbsBuffer> makeScalarBuffer()
override
407 return std::make_unique<ScalarBuffer>(1,
_queuesMaps->scalarBufferQueuesMap[1]);
409 std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t
size)
override
// Override that never creates a buffer: it always throws std::bad_function_call.
// The size parameter is unnamed and never read.
413 std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t)
override {
throw std::bad_function_call(); }
414 std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t, CudaInterface::CudaStream * =
nullptr)
override
416 throw std::bad_function_call();
425std::unique_ptr<AbsBufferManager> RooBatchComputeClass::createBufferManager()
const
427 return std::make_unique<BufferManager>();
std::vector< double > _vec
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
std::unique_ptr< BufferQueuesMaps > _queuesMaps
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
These classes encapsulate the necessary data for the computations.
This class implements the interface to execute the same task multiple times, sequentially or in parallel...
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
void Add(T x)
Single-element accumulation. Will not vectorise.
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
double reduceSum(Config const &, InputArr input, size_t n) override
void deleteCudaStream(CudaInterface::CudaStream *) const override
CudaInterface::CudaStream * newCudaStream() const override
std::unique_ptr< AbsBufferManager > createBufferManager() const override
CudaInterface::CudaEvent * newCudaEvent(bool) const override
bool cudaStreamIsActive(CudaInterface::CudaStream *) const override
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
std::string architectureName() const override
void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
void compute(Config const &, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
void deleteCudaEvent(CudaInterface::CudaEvent *) const override
Architecture architecture() const override
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library...
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
std::span< double > ArgSpan
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loaded...
constexpr std::size_t bufferSize
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
void probas(TString dataset, TString fin="TMVA.root", Bool_t useTMVAStyle=kTRUE)
static double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
static float unpackNaN(double val)
If val is NaN and this NaN has been tagged as containing a payload, unpack the float from the mantissa...