void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   batches.nEvents = nEvents;
   batches.nBatches = nBatches;
   batches.nExtra = nExtraArgs;
   batches.output = output;
}
void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
{
   for (std::size_t i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
      if (!arrays[i]._isVector) {
         // A scalar value is not on the GPU yet: stage it in the host buffer
         // and point the batch at the corresponding device-side slot.
         buffer[i] = span[0];
         arrays[i]._array = bufferDevice + i;
      } else {
         // Vector inputs already reside on the GPU, so the span's data
         // pointer can be used directly.
         arrays[i]._array = span.data();
      }
   }
}
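// Illustrative sketch, not part of the original file: how fillArrays above
// dispatches a per-event vector versus a scalar parameter. The function name
// exampleDispatch and the variables events/mean are hypothetical.
inline void exampleDispatch()
{
   std::size_t nEvents = 4;
   std::vector<double> events{1.0, 2.0, 3.0, 4.0}; // span.size() >= nEvents -> _isVector,
                                                   // the device pointer is used directly
   double mean = 1.5;                              // span.size() == 1 -> staged in buffer[i],
                                                   // read via bufferDevice + i after the upload
   std::vector<std::span<const double>> vars{std::span<const double>{events},
                                             std::span<const double>{&mean, 1}};
   (void)nEvents;
   (void)vars;
}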
int getGridSize(std::size_t n)
{
   constexpr int maxGridSize = 84;
   return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
}
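// Illustrative worked example, not part of the original file, assuming the
// library-wide constant blockSize == 512:
//   getGridSize(1000)   -> ceil(1000 / 512.0)   = 2 blocks
//   getGridSize(100000) -> ceil(100000 / 512.0) = 196, clamped to 84 blocks
// Events beyond gridSize * blockSize are picked up by the grid-stride loops
// (i += nThreadsTotal) in the kernels below.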
/// Compute multiple values using cuda kernels.
void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span<double> output, VarSpan vars,
             ArgSpan extraArgs) override
{
   using namespace CudaInterface;

   std::size_t nEvents = output.size();

   // Pack the Batches struct, the per-variable Batch array, the staging
   // buffer for scalars, and the extra arguments into one contiguous
   // allocation, so a single host-to-device copy suffices.
   const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                               extraArgs.size() * sizeof(double);

   std::vector<char> hostMem(memSize);
   auto batches = reinterpret_cast<Batches *>(hostMem.data());
   auto arrays = reinterpret_cast<Batch *>(batches + 1);
   auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
   auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

   DeviceArray<char> deviceMem(memSize);
   auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
   auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
   auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
   auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());

   fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
   fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
   batches->args = arraysDevice;

   if (!extraArgs.empty()) {
      std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
      batches->extra = extraArgsDevice;
   }

   copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

   const int gridSize = getGridSize(nEvents);
   _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);

   // The kernel might have modified the mutable extra arguments, so copy
   // them back to the host.
   if (!extraArgs.empty()) {
      copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
   }
}
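// Illustrative sketch, not part of the original file: the layout of the single
// packed allocation built by compute() above, with V = vars.size() and
// E = extraArgs.size().
//
//   [ Batches ][ Batch x V ][ double x V (scalar staging) ][ double x E (extra args) ]
//   ^ batches  ^ arrays     ^ scalarBuffer                 ^ extraArgsHost
//
// deviceMem mirrors the same offsets, so after the single copyHostToDevice()
// every host pointer has a device counterpart at the same relative position.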
   ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                             std::span<const double> weights, std::span<const double> offsetProbas) override;
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   // Kahan compensated addition: subtract the running compensation (plus any
   // carry coming from another partial sum) from the new term.
   const double y = a - (carry + otherCarry);
   const double t = sum + y; // low-order digits of y are lost in this addition

   // (t - sum) recovers the high-order part of y; subtracting y leaves the
   // negated lost low-order part, which becomes the new compensation.
   carry = (t - sum) - y;
   sum = t;
}
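// Illustrative sketch, not part of the original file: the same compensated
// update applied sequentially on the host. The helper kahanHostSum is
// hypothetical; with otherCarry == 0 it reduces to classic Kahan summation.
inline double kahanHostSum(std::span<const double> xs)
{
   double sum = 0.0;
   double carry = 0.0;
   for (double x : xs) {
      const double y = x - carry;
      const double t = sum + y;
      carry = (t - sum) - y; // lost low-order part of y, negated
      sum = t;
   }
   return sum;
}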
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // Tree reduction in shared memory: halve the stride each iteration,
   // merging both the partial sums and their carries.
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      __syncthreads(); // wait until all threads of the block have merged their pair
   }

   // Thread 0 publishes the block's sum; the carries are stored in the second
   // half of the result array, offset by gridDim.x.
   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
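// Illustrative trace, not part of the original file: the reduction above for a
// hypothetical block of 8 threads (the library launches blockSize threads).
//   i = 4: thread t folds slot t+4 into slot t, for t = 0..3
//   i = 2: thread t folds slot t+2 into slot t, for t = 0..1
//   i = 1: thread 0 folds slot 1 into slot 0
// After log2(blockDim.x) rounds, slot 0 holds the block's compensated sum and
// shared[carry_index] its carry.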
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory stores the partial sums, the second
   // half the corresponding carries (compensations).
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   // Grid-stride loop: each thread accumulates every nThreadsTotal-th element.
   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      double val = nll ? -std::log(input[i]) : input[i];
      kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads of the block have stored their partial results.
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}
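// Illustrative note, not part of the original file: since the kernel indexes
// shared[] up to blockDim.x + threadIdx.x, a launch must reserve dynamic
// shared memory for both halves, e.g.
//   constexpr int shMemSize = 2 * blockSize * sizeof(double);
//   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, out, false);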
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // First half of the shared memory for partial sums, second half for carries.
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      double val = -std::log(probas[i]);
      if (offsetProbas)
         val += std::log(offsetProbas[i]);
      if (weights)
         val = weights[i] * val;
      kahanSumUpdate(sum, carry, val, 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads of the block have stored their partial results.
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}
/// Return the sum of an input array.
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
{
   const int gridSize = getGridSize(n);
   cudaStream_t stream = *cfg.cudaStream();
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   constexpr int shMemSize = 2 * blockSize * sizeof(double);
   // First pass: one compensated partial sum per block. Second pass: a single
   // block folds the partial sums together with their carries.
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
   double tmp = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
   return tmp;
}
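// Illustrative sketch, not part of the original file: checking reduceSum
// against a plain host accumulation. The names iface and devInput are
// hypothetical; devInput is assumed to already reside on the GPU.
//
//   std::vector<double> host(n, 0.25);
//   double gpu = iface->reduceSum(cfg, devInput, n);
//   double ref = std::accumulate(host.begin(), host.end(), 0.0);
//   // gpu should match ref to near machine precision, even for large n,
//   // thanks to the Kahan-compensated two-pass reduction.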
ReduceNLLOutput RooBatchComputeClass::reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                                                std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;
   if (probas.empty()) {
      return out;
   }
   const int gridSize = getGridSize(probas.size());
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   cudaStream_t stream = *cfg.cudaStream();
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

   // A single weight acts as a constant factor and is applied after the sum.
   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());

   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmpSum = 0.0;
   double tmpCarry = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmpSum, 1, cfg.cudaStream());
   CudaInterface::copyDeviceToHost(devOut.data() + gridSize, &tmpCarry, 1, cfg.cudaStream());

   if (weights.size() == 1) {
      tmpSum *= weights[0];
      tmpCarry *= weights[0];
   }

   out.nllSum = tmpSum;
   out.nllSumCarry = tmpCarry;
   return out;
}
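// Illustrative sketch, not part of the original file: the quantity reduceNLL
// computes, as a sequential host reference. The helper refNLL is hypothetical.
// Per event: val_i = w_i * (-log(p_i) + log(pOff_i)), with the offset and
// weight terms optional.
inline double refNLL(std::span<const double> p, std::span<const double> w, std::span<const double> pOff)
{
   double nll = 0.0;
   for (std::size_t i = 0; i < p.size(); ++i) {
      double val = -std::log(p[i]);
      if (!pOff.empty())
         val += std::log(pOff[i]);
      if (!w.empty())
         val *= w.size() == 1 ? w[0] : w[i];
      nll += val;
   }
   return nll;
}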
class ScalarBufferContainer {
public:
   ScalarBufferContainer() {}
   ScalarBufferContainer(std::size_t size)
   {
      if (size != 1)
         throw std::runtime_error("ScalarBufferContainer can only be of size 1");
   }

   double const *hostReadPtr() const { return &_val; }
   double const *deviceReadPtr() const { return &_val; }

   double *hostWritePtr() { return &_val; }
   double *deviceWritePtr() { return &_val; }

   void assignFromHost(std::span<const double> input) { _val = input[0]; }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), &_val, input.size(), nullptr);
   }

private:
   double _val;
};
class CPUBufferContainer {
public:
   CPUBufferContainer(std::size_t size) : _vec(size) {}

   double const *hostReadPtr() const { return _vec.data(); }
   double const *deviceReadPtr() const
   {
      throw std::bad_function_call(); // a CPU buffer has no device-side view
   }

   double *hostWritePtr() { return _vec.data(); }
   double *deviceWritePtr()
   {
      throw std::bad_function_call(); // a CPU buffer has no device-side view
   }

   void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), hostWritePtr(), input.size(), nullptr);
   }

private:
   std::vector<double> _vec;
};
class GPUBufferContainer {
public:
   GPUBufferContainer(std::size_t size) : _arr(size) {}

   double const *hostReadPtr() const
   {
      throw std::bad_function_call(); // a GPU buffer has no host-side view
   }
   double const *deviceReadPtr() const { return _arr.data(); }

   double *hostWritePtr() const
   {
      throw std::bad_function_call(); // a GPU buffer has no host-side view
   }
   double *deviceWritePtr() const { return const_cast<double *>(_arr.data()); }

   void assignFromHost(std::span<const double> input)
   {
      CudaInterface::copyHostToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
   }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
   }

private:
   CudaInterface::DeviceArray<double> _arr;
};
class PinnedBufferContainer {
public:
   PinnedBufferContainer(std::size_t size) : _arr{size}, _gpuBuffer{size} {}
   std::size_t size() const { return _arr.size(); }

   void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }

   double const *hostReadPtr() const
   {
      if (_lastAccess == LastAccessType::GPU_WRITE) {
         // The GPU copy is newer: synchronize it back into the pinned host array.
         CudaInterface::copyDeviceToHost(_gpuBuffer.deviceReadPtr(), const_cast<double *>(_arr.data()), size(),
                                         _cudaStream);
      }
      _lastAccess = LastAccessType::CPU_READ;
      return const_cast<double *>(_arr.data());
   }
   double const *deviceReadPtr() const
   {
      if (_lastAccess == LastAccessType::CPU_WRITE) {
         // The host copy is newer: upload it to the GPU buffer first.
         CudaInterface::copyHostToDevice(_arr.data(), _gpuBuffer.deviceWritePtr(), size(), _cudaStream);
      }
      _lastAccess = LastAccessType::GPU_READ;
      return _gpuBuffer.deviceReadPtr();
   }

   double *hostWritePtr()
   {
      _lastAccess = LastAccessType::CPU_WRITE;
      return _arr.data();
   }
   double *deviceWritePtr()
   {
      _lastAccess = LastAccessType::GPU_WRITE;
      return _gpuBuffer.deviceWritePtr();
   }

   void assignFromHost(std::span<const double> input) { std::copy(input.begin(), input.end(), hostWritePtr()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), _cudaStream);
   }

private:
   enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };

   CudaInterface::PinnedHostArray<double> _arr;
   GPUBufferContainer _gpuBuffer;
   CudaInterface::CudaStream *_cudaStream = nullptr;
   mutable LastAccessType _lastAccess = LastAccessType::CPU_READ;
};
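// Illustrative sketch, not part of the original file: the lazy synchronization
// implemented above. The variable buf is hypothetical.
//
//   PinnedBufferContainer buf{1024};
//   buf.hostWritePtr()[0] = 3.14; // _lastAccess = CPU_WRITE, no transfer yet
//   buf.deviceReadPtr();          // sees CPU_WRITE -> uploads to the GPU buffer,
//                                 // then _lastAccess = GPU_READ
//   buf.hostReadPtr();            // no GPU_WRITE since the upload, so the pinned
//                                 // host array is returned without any copy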
template <class Container>
class BufferImpl : public AbsBuffer {
public:
   using Queue = std::queue<std::unique_ptr<Container>>;

   BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
   {
      if (_queue.empty()) {
         _vec = std::make_unique<Container>(size);
      } else {
         // Recycle a previously released container of the same size.
         _vec = std::move(_queue.front());
         _queue.pop();
      }
   }

   ~BufferImpl() override { _queue.emplace(std::move(_vec)); }

   double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
   double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }

   double *hostWritePtr() override { return _vec->hostWritePtr(); }
   double *deviceWritePtr() override { return _vec->deviceWritePtr(); }

   void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
   void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }

   Container &vec() { return *_vec; }

private:
   std::unique_ptr<Container> _vec;
   Queue &_queue;
};

using ScalarBuffer = BufferImpl<ScalarBufferContainer>;
using CPUBuffer = BufferImpl<CPUBufferContainer>;
using GPUBuffer = BufferImpl<GPUBufferContainer>;
using PinnedBuffer = BufferImpl<PinnedBufferContainer>;
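// Illustrative sketch, not part of the original file: container recycling
// through the queue. The variables queue, a and b are hypothetical.
//
//   CPUBuffer::Queue queue;
//   {
//      CPUBuffer a{256, queue}; // queue empty -> a fresh container is allocated
//   }                           // ~BufferImpl returns the container to the queue
//   CPUBuffer b{256, queue};    // reuses the container released by a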
struct BufferQueuesMaps {
   std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;
   std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;
   std::map<std::size_t, GPUBuffer::Queue> gpuBufferQueuesMap;
   std::map<std::size_t, PinnedBuffer::Queue> pinnedBufferQueuesMap;
};

class BufferManager : public AbsBufferManager {
public:
   BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}

   std::unique_ptr<AbsBuffer> makeScalarBuffer() override
   {
      return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
   }
   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override
   {
      return std::make_unique<CPUBuffer>(size, _queuesMaps->cpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) override
   {
      return std::make_unique<GPUBuffer>(size, _queuesMaps->gpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) override
   {
      auto out = std::make_unique<PinnedBuffer>(size, _queuesMaps->pinnedBufferQueuesMap[size]);
      out->vec().setCudaStream(stream);
      return out;
   }

private:
   std::unique_ptr<BufferQueuesMaps> _queuesMaps;
};

std::unique_ptr<AbsBufferManager> createBufferManager()
{
   return std::make_unique<BufferManager>();
}
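// Illustrative sketch, not part of the original file: typical use through the
// abstract interface. The variables mgr, buf and values are hypothetical.
//
//   auto mgr = createBufferManager();
//   auto buf = mgr->makeCpuBuffer(1024);
//   std::vector<double> values(1024, 1.0);
//   buf->assignFromHost(values);
//   double first = buf->hostReadPtr()[0]; // 1.0
//   // Destroying buf returns its container to the size-1024 queue, ready to
//   // be reused by the next makeCpuBuffer(1024) call.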