49   for (
int i = 0; i < vars.size(); i++) {
 
   50      const std::span<const double> &span = vars[i];
 
   51      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
 
   52      if (!
arrays[i]._isVector) {
 
   62         arrays[i]._array = span.data();
 
  111      using namespace CudaInterface;
 
  113      std::size_t nEvents = 
output.size();
 
  121      auto scalarBuffer = 
reinterpret_cast<double *
>(
arrays + vars.size());
 
  122      auto extraArgsHost = 
reinterpret_cast<double *
>(scalarBuffer + vars.size());
 
 
  154                             std::span<const double> weights, std::span<const double> 
offsetProbas) 
override;
 
 
  185   const double t = 
sum + 
y; 
 
  188   carry = (t - 
sum) - 
y;
 
 
  198   for (
int i = 
blockDim.x / 2; i > 0; i >>= 1) {
 
 
  229      double val = nll == 1 ? -std::log(
input[i]) : 
input[i];
 
 
  263      val = weights[i] * val;
 
 
  292                                                std::span<const double> weights, std::span<const double> 
offsetProbas)
 
  295   if (probas.empty()) {
 
  306      assert(span.size() == 0 || span.data() == 
nullptr ||
 
  313      probas.size() == 1 ? probas[0] : 0.0, weights.size(), 
devOut.data());
 
 
  329class ScalarBufferContainer {
 
  331   ScalarBufferContainer() {}
 
  332   ScalarBufferContainer(std::size_t 
size)
 
  335         throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
 
  338   double const *hostReadPtr()
 const { 
return &
_val; }
 
  339   double const *deviceReadPtr()
 const { 
return &
_val; }
 
  341   double *hostWritePtr() { 
return &
_val; }
 
  342   double *deviceWritePtr() { 
return &
_val; }
 
  344   void assignFromHost(std::span<const double> 
input) { 
_val = 
input[0]; }
 
  345   void assignFromDevice(std::span<const double> 
input)
 
  354class CPUBufferContainer {
 
  358   double const *hostReadPtr()
 const { 
return _vec.data(); }
 
  359   double const *deviceReadPtr()
 const 
  361      throw std::bad_function_call();
 
  365   double *hostWritePtr() { 
return _vec.data(); }
 
  366   double *deviceWritePtr()
 
  368      throw std::bad_function_call();
 
  372   void assignFromHost(std::span<const double> 
input) { 
_vec.assign(
input.begin(), 
input.end()); }
 
  373   void assignFromDevice(std::span<const double> 
input)
 
  382class GPUBufferContainer {
 
  386   double const *hostReadPtr()
 const 
  388      throw std::bad_function_call();
 
  391   double const *deviceReadPtr()
 const { 
return _arr.data(); }
 
  393   double *hostWritePtr()
 const 
  395      throw std::bad_function_call();
 
  398   double *deviceWritePtr()
 const { 
return const_cast<double *
>(
_arr.data()); }
 
  400   void assignFromHost(std::span<const double> 
input)
 
  404   void assignFromDevice(std::span<const double> 
input)
 
  410   CudaInterface::DeviceArray<double> 
_arr;
 
  413class PinnedBufferContainer {
 
  416   std::size_t 
size()
 const { 
return _arr.size(); }
 
  418   void setCudaStream(CudaInterface::CudaStream *stream) { 
_cudaStream = stream; }
 
  420   double const *hostReadPtr()
 const 
  423      if (_lastAccess == LastAccessType::GPU_WRITE) {
 
  429      return const_cast<double *
>(
_arr.data());
 
  431   double const *deviceReadPtr()
 const 
  434      if (_lastAccess == LastAccessType::CPU_WRITE) {
 
  442   double *hostWritePtr()
 
  447   double *deviceWritePtr()
 
  453   void assignFromHost(std::span<const double> 
input) { std::copy(
input.begin(), 
input.end(), hostWritePtr()); }
 
  454   void assignFromDevice(std::span<const double> 
input)
 
  460   enum class LastAccessType {
 
  467   CudaInterface::PinnedHostArray<double> 
_arr;
 
  473template <
class Container>
 
  474class BufferImpl : 
public AbsBuffer {
 
  476   using Queue = std::queue<std::unique_ptr<Container>>;
 
  478   BufferImpl(std::size_t 
size, Queue &queue) : 
_queue{queue}
 
  481         _vec = std::make_unique<Container>(
size);
 
  490   double const *hostReadPtr()
 const override { 
return _vec->hostReadPtr(); }
 
  491   double const *deviceReadPtr()
 const override { 
return _vec->deviceReadPtr(); }
 
  493   double *hostWritePtr()
 override { 
return _vec->hostWritePtr(); }
 
  494   double *deviceWritePtr()
 override { 
return _vec->deviceWritePtr(); }
 
  496   void assignFromHost(std::span<const double> 
input)
 override { 
_vec->assignFromHost(
input); }
 
  497   void assignFromDevice(std::span<const double> 
input)
 override { 
_vec->assignFromDevice(
input); }
 
  502   std::unique_ptr<Container> 
_vec;
 
  511struct BufferQueuesMaps {
 
  518class BufferManager : 
public AbsBufferManager {
 
  521   BufferManager() : 
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
 
  523   std::unique_ptr<AbsBuffer> makeScalarBuffer()
 override 
  525      return std::make_unique<ScalarBuffer>(1, 
_queuesMaps->scalarBufferQueuesMap[1]);
 
  527   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t 
size)
 override 
  531   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t 
size)
 override 
  535   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t 
size, CudaInterface::CudaStream *stream = 
nullptr)
 override 
  538      out->vec().setCudaStream(stream);
 
  550   return std::make_unique<BufferManager>();
 
 
std::vector< double > _vec
CudaInterface::CudaStream * _cudaStream
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
CudaInterface::DeviceArray< double > _arr
std::map< std::size_t, PinnedBuffer::Queue > pinnedBufferQueuesMap
LastAccessType _lastAccess
GPUBufferContainer _gpuBuffer
std::unique_ptr< BufferQueuesMaps > _queuesMaps
std::map< std::size_t, GPUBuffer::Queue > gpuBufferQueuesMap
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t attr
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void deleteCudaEvent(CudaInterface::CudaEvent *event) const override
void cudaStreamWaitForEvent(CudaInterface::CudaStream *stream, CudaInterface::CudaEvent *event) const override
std::unique_ptr< AbsBufferManager > createBufferManager() const
void cudaEventRecord(CudaInterface::CudaEvent *event, CudaInterface::CudaStream *stream) const override
CudaInterface::CudaStream * newCudaStream() const override
bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override
void deleteCudaStream(CudaInterface::CudaStream *stream) const override
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
std::string architectureName() const override
const std::vector< void(*)(Batches &)> _computeFunctions
Architecture architecture() const override
CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const override
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
CudaInterface::CudaStream * cudaStream() const
bool isActive()
Checks if a CUDA stream is currently active.
void waitForEvent(CudaEvent &)
Makes a CUDA stream wait for a CUDA event.
The interface which should be implemented to provide optimised computation functions for implementati...
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t nProbas, double scalarProba, size_t nWeights, double *__restrict__ result)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
void copyDeviceToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the CUDA device.
void cudaEventRecord(CudaEvent &event, CudaStream &stream)
Records a CUDA event.
void copyHostToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the host to the CUDA device.
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
static uint64_t sum(uint64_t i)