49   for (
int i = 0; i < vars.size(); i++) {
 
   50      const std::span<const double> &span = vars[i];
 
   51      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
 
   52      if (!
arrays[i]._isVector) {
 
   62         arrays[i]._array = span.data();
 
  111      using namespace CudaInterface;
 
  113      std::size_t nEvents = 
output.size();
 
  121      auto scalarBuffer = 
reinterpret_cast<double *
>(
arrays + vars.size());
 
  122      auto extraArgsHost = 
reinterpret_cast<double *
>(scalarBuffer + vars.size());
 
 
  154                             std::span<const double> weights, std::span<const double> 
offsetProbas) 
override;
 
 
  185   const double t = 
sum + 
y; 
 
  188   carry = (t - 
sum) - 
y;
 
 
  198   for (
int i = 
blockDim.x / 2; i > 0; i >>= 1) {
 
 
  229      double val = nll == 1 ? -std::log(
input[i]) : 
input[i];
 
 
  263      val = weights[i] * val;
 
 
  292                                                std::span<const double> weights, std::span<const double> 
offsetProbas)
 
  295   if (probas.empty()) {
 
  306      assert(span.size() == 0 || span.data() == 
nullptr ||
 
  313      probas.size() == 1 ? probas[0] : 0.0, weights.size(), 
devOut.data());
 
 
  329class ScalarBufferContainer {
 
  331   ScalarBufferContainer() {}
 
  332   ScalarBufferContainer(std::size_t 
size)
 
  335         throw std::runtime_error(
"ScalarBufferContainer can only be of size 1");
 
  338   double const *hostReadPtr()
 const { 
return &
_val; }
 
  339   double const *deviceReadPtr()
 const { 
return &
_val; }
 
  341   double *hostWritePtr() { 
return &
_val; }
 
  342   double *deviceWritePtr() { 
return &
_val; }
 
  344   void assignFromHost(std::span<const double> 
input) { 
_val = 
input[0]; }
 
  345   void assignFromDevice(std::span<const double> 
input)
 
  354class CPUBufferContainer {
 
  358   double const *hostReadPtr()
 const { 
return _vec.data(); }
 
  359   double const *deviceReadPtr()
 const 
  361      throw std::bad_function_call();
 
  365   double *hostWritePtr() { 
return _vec.data(); }
 
  366   double *deviceWritePtr()
 
  368      throw std::bad_function_call();
 
  372   void assignFromHost(std::span<const double> 
input) { 
_vec.assign(
input.begin(), 
input.end()); }
 
  373   void assignFromDevice(std::span<const double> 
input)
 
  382class GPUBufferContainer {
 
  386   double const *hostReadPtr()
 const 
  388      throw std::bad_function_call();
 
  391   double const *deviceReadPtr()
 const { 
return _arr.data(); }
 
  393   double *hostWritePtr()
 const 
  395      throw std::bad_function_call();
 
  398   double *deviceWritePtr()
 const { 
return const_cast<double *
>(
_arr.data()); }
 
  400   void assignFromHost(std::span<const double> 
input)
 
  404   void assignFromDevice(std::span<const double> 
input)
 
  410   CudaInterface::DeviceArray<double> 
_arr;
 
  413class PinnedBufferContainer {
 
  416   std::size_t 
size()
 const { 
return _arr.size(); }
 
  418   void setCudaStream(CudaInterface::CudaStream *stream) { 
_cudaStream = stream; }
 
  420   double const *hostReadPtr()
 const 
  423      if (_lastAccess == LastAccessType::GPU_WRITE) {
 
  429      return const_cast<double *
>(
_arr.data());
 
  431   double const *deviceReadPtr()
 const 
  434      if (_lastAccess == LastAccessType::CPU_WRITE) {
 
  442   double *hostWritePtr()
 
  447   double *deviceWritePtr()
 
  453   void assignFromHost(std::span<const double> 
input) { std::copy(
input.begin(), 
input.end(), hostWritePtr()); }
 
  454   void assignFromDevice(std::span<const double> 
input)
 
  460   enum class LastAccessType {
 
  467   CudaInterface::PinnedHostArray<double> 
_arr;
 
  473template <
class Container>
 
  474class BufferImpl : 
public AbsBuffer {
 
  476   using Queue = std::queue<std::unique_ptr<Container>>;
 
  478   BufferImpl(std::size_t 
size, Queue &queue) : 
_queue{queue}
 
  481         _vec = std::make_unique<Container>(
size);
 
  490   double const *hostReadPtr()
 const override { 
return _vec->hostReadPtr(); }
 
  491   double const *deviceReadPtr()
 const override { 
return _vec->deviceReadPtr(); }
 
  493   double *hostWritePtr()
 override { 
return _vec->hostWritePtr(); }
 
  494   double *deviceWritePtr()
 override { 
return _vec->deviceWritePtr(); }
 
  496   void assignFromHost(std::span<const double> 
input)
 override { 
_vec->assignFromHost(
input); }
 
  497   void assignFromDevice(std::span<const double> 
input)
 override { 
_vec->assignFromDevice(
input); }
 
  502   std::unique_ptr<Container> 
_vec;
 
  511struct BufferQueuesMaps {
 
  518class BufferManager : 
public AbsBufferManager {
 
  521   BufferManager() : 
_queuesMaps{std::make_unique<BufferQueuesMaps>()} {}
 
  523   std::unique_ptr<AbsBuffer> makeScalarBuffer()
 override 
  525      return std::make_unique<ScalarBuffer>(1, 
_queuesMaps->scalarBufferQueuesMap[1]);
 
  527   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t 
size)
 override 
  531   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t 
size)
 override 
  535   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t 
size, CudaInterface::CudaStream *stream = 
nullptr)
 override 
  538      out->vec().setCudaStream(stream);
 
  550   return std::make_unique<BufferManager>();
 
 
std::vector< double > _vec
 
CudaInterface::CudaStream * _cudaStream
 
std::map< std::size_t, CPUBuffer::Queue > cpuBufferQueuesMap
 
std::map< std::size_t, ScalarBuffer::Queue > scalarBufferQueuesMap
 
CudaInterface::DeviceArray< double > _arr
 
std::map< std::size_t, PinnedBuffer::Queue > pinnedBufferQueuesMap
 
LastAccessType _lastAccess
 
GPUBufferContainer _gpuBuffer
 
std::unique_ptr< BufferQueuesMaps > _queuesMaps
 
std::map< std::size_t, GPUBuffer::Queue > gpuBufferQueuesMap
 
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
 
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
 
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t attr
 
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
 
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
 
void deleteCudaEvent(CudaInterface::CudaEvent *event) const override
 
void cudaStreamWaitForEvent(CudaInterface::CudaStream *stream, CudaInterface::CudaEvent *event) const override
 
std::unique_ptr< AbsBufferManager > createBufferManager() const
 
void cudaEventRecord(CudaInterface::CudaEvent *event, CudaInterface::CudaStream *stream) const override
 
CudaInterface::CudaStream * newCudaStream() const override
 
bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override
 
void deleteCudaStream(CudaInterface::CudaStream *stream) const override
 
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
 
std::string architectureName() const override
 
const std::vector< void(*)(Batches &)> _computeFunctions
 
Architecture architecture() const override
 
CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const override
 
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
 
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
 
CudaInterface::CudaStream * cudaStream() const
 
bool isActive()
Checks if a CUDA stream is currently active.
 
void waitForEvent(CudaEvent &)
Makes a CUDA stream wait for a CUDA event.
 
The interface which should be implemented to provide optimised computation functions for implementati...
 
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
 
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
 
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
 
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t nProbas, double scalarProba, size_t nWeights, double *__restrict__ result)
 
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
 
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
 
void copyDeviceToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the CUDA device.
 
void cudaEventRecord(CudaEvent &event, CudaStream &stream)
Records a CUDA event.
 
void copyHostToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the host to the CUDA device.
 
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
 
Namespace for dispatching RooFit computations to various backends.
 
R__EXTERN RooBatchComputeInterface * dispatchCUDA
 
std::span< double > ArgSpan
 
const double *__restrict InputArr
 
std::span< const std::span< const double > > VarSpan
 
static uint64_t sum(uint64_t i)