void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   // ...
   batches.nExtra = nExtraArgs;
   // ...
}
void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
{
   for (int i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
      if (!arrays[i]._isVector) {
         // ...
         arrays[i]._array = bufferDevice + i;
      } else {
         // ...
         arrays[i]._array = span.data();
      }
   }
}
int getGridSize(std::size_t n)
{
   // ...
   constexpr int maxGridSize = 84;
   return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
}
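For intuition, the clamp above caps the launch at maxGridSize blocks no matter how large n gets. Below is a minimal stand-alone sketch of the same arithmetic; the value of blockSize is an assumption chosen purely for illustration (the real constant is defined elsewhere in this file).

// Illustration of the grid-size formula; blockSize = 512 is an assumed value.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main()
{
   constexpr int blockSize = 512; // assumption for this sketch only
   constexpr int maxGridSize = 84;
   constexpr std::size_t ns[] = {100, 50'000, 10'000'000};
   for (std::size_t n : ns) {
      const int gridSize = std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
      std::printf("n = %zu -> gridSize = %d\n", n, gridSize); // prints 1, 84, 84 for these inputs
   }
}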
// Compute multiple values using CUDA kernels:
// compute(RooBatchCompute::Config const &cfg, Computer computer, std::span<double> output, VarSpan vars, ArgSpan extraArgs)
{
   using namespace CudaInterface;

   std::size_t nEvents = output.size();

   const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                               extraArgs.size() * sizeof(double);

   std::vector<char> hostMem(memSize);
   auto batches = reinterpret_cast<Batches *>(hostMem.data());
   auto arrays = reinterpret_cast<Batch *>(batches + 1);
   auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
   auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

   DeviceArray<char> deviceMem(memSize);
   auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
   auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
   auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
   auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());

   fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
   fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
   batches->args = arraysDevice;

   if (!extraArgs.empty()) {
      std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
      batches->extra = extraArgsDevice;
   }

   copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

   const int gridSize = getGridSize(nEvents);
   // ...
   if (!extraArgs.empty()) {
      copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
   }
   // ...
}
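The single hostMem/deviceMem allocation above packs the Batches header, the per-variable Batch descriptors, the scalar staging buffer, and the extra arguments back to back, so one copyHostToDevice call transfers the whole parameter block. The following stand-alone sketch shows the same packing idea; PackedHeader and PackedArg are hypothetical stand-ins, not the real Batches/Batch types.

// Illustrative only: simplified stand-in structs packed into one contiguous allocation.
#include <cstddef>
#include <vector>

struct PackedHeader { std::size_t nEvents, nArgs; };
struct PackedArg    { const double *ptr; bool isVector; };

int main()
{
   const std::size_t nArgs = 3;
   const std::size_t memSize = sizeof(PackedHeader) + nArgs * sizeof(PackedArg) + nArgs * sizeof(double);

   std::vector<char> hostMem(memSize);
   auto *header  = reinterpret_cast<PackedHeader *>(hostMem.data());
   auto *args    = reinterpret_cast<PackedArg *>(header + 1);
   auto *scalars = reinterpret_cast<double *>(args + nArgs);

   header->nEvents = 1000;
   header->nArgs = nArgs;
   for (std::size_t i = 0; i < nArgs; ++i) {
      scalars[i] = double(i);          // scalar parameters staged contiguously
      args[i] = {scalars + i, false};  // descriptors point into the same block
   }
   // In the real code, hostMem is then copied to an identically laid out device
   // block with a single copyHostToDevice(...) call.
}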
   // ...
   ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                             std::span<const double> weights, std::span<const double> offsetProbas) override;
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   const double y = a - (carry + otherCarry);
   const double t = sum + y;
   // ...
   carry = (t - sum) - y;
   // ...
}
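For reference, here is a minimal host-side (plain C++) sketch of the same compensated-summation idea, without the second carry term that the device code uses when merging per-block partial results; it is an illustration, not the device code itself.

// Plain Kahan summation: "carry" holds the low-order bits a naive sum would drop.
#include <cstdio>
#include <vector>

int main()
{
   std::vector<double> values(10'000'000, 0.1);
   double sum = 0.0;
   double carry = 0.0;
   for (double a : values) {
      const double y = a - carry;   // re-inject the previously lost low-order part
      const double t = sum + y;     // low-order digits of y are lost here...
      carry = (t - sum) - y;        // ...and recovered into carry for the next step
      sum = t;
   }
   std::printf("compensated sum = %.15g\n", sum); // accurate to the last bits, unlike a naive loop
}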
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // ...
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      // ...
   }
   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;
   // ...
   extern __shared__ double shared[];
   // ...
   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // ...
      double val = nll == 1 ? -std::log(input[i]) : input[i];
      // ...
   }
   // ...
   shared[carry_index] = carry;
   // ...
}
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;
   // ...
   extern __shared__ double shared[];
   // ...
   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // ...
      double val = -std::log(probas[i]);
      // ...
      val += std::log(offsetProbas[i]);
      // ...
      val = weights[i] * val;
      // ...
   }
   // ...
   shared[carry_index] = carry;
   // ...
}
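Reading off the per-event statements above (the guards that skip the offset and weight terms when those inputs are absent fall in the elided lines), each thread accumulates Kahan-compensated terms of the following form, assuming weights and offset probabilities are both provided:

\[
\mathrm{val}_i \;=\; w_i \bigl(-\log p_i + \log p_i^{\text{offset}}\bigr),
\qquad
\mathrm{nllSum} \;=\; \sum_i \mathrm{val}_i .
\]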
// Return the sum of an input array:
// reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n)
   const int gridSize = getGridSize(n);
   // ...
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
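The two launches above implement a standard two-pass reduction: the first pass writes one partial sum per block (with its Kahan carry stored gridSize elements further, as in kahanSumReduction above), and the second pass runs a single block that folds those partials and carries into the final result. A host-side sketch of the same control flow, with ordinary functions standing in for the kernels, follows.

// Stand-alone sketch of the two-pass reduction idea with plain functions instead of kernels.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

// "First pass": each pretend block sums a contiguous chunk of the input.
static std::vector<double> partialSums(const std::vector<double> &input, int nBlocks)
{
   std::vector<double> partials(nBlocks, 0.0);
   const std::size_t chunk = (input.size() + nBlocks - 1) / nBlocks;
   for (int b = 0; b < nBlocks; ++b) {
      const std::size_t begin = std::min(input.size(), b * chunk);
      const std::size_t end = std::min(input.size(), begin + chunk);
      partials[b] = std::accumulate(input.begin() + begin, input.begin() + end, 0.0);
   }
   return partials;
}

int main()
{
   std::vector<double> input(1'000'000, 1.0);
   const int nBlocks = 84; // plays the role of gridSize
   auto partials = partialSums(input, nBlocks);                            // pass 1
   double total = std::accumulate(partials.begin(), partials.end(), 0.0);  // pass 2
   std::printf("total = %.1f\n", total);
}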
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                          std::span<const double> weights, std::span<const double> offsetProbas)
{
   // ...
   if (probas.empty()) {
      // ...
   }
   const int gridSize = getGridSize(probas.size());
   // ...
   for (auto span : {probas, weights, offsetProbas}) {
      cudaPointerAttributes attr;
      assert(span.size() == 0 || span.data() == nullptr ||
             (cudaPointerGetAttributes(&attr, span.data()) == cudaSuccess && attr.type == cudaMemoryTypeDevice));
   }
   // ...
   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());
   // ...
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
   // ...
   double tmpCarry = 0.0;
   // ...
   if (weights.size() == 1) {
      tmpSum *= weights[0];
      tmpCarry *= weights[0];
   }
   // ...
   out.nllSumCarry = tmpCarry;
   // ...
}
class ScalarBufferContainer {
public:
   ScalarBufferContainer() {}
   ScalarBufferContainer(std::size_t size)
   {
      if (size != 1)
         throw std::runtime_error("ScalarBufferContainer can only be of size 1");
   }

   double const *hostReadPtr() const { return &_val; }
   double const *deviceReadPtr() const { return &_val; }

   double *hostWritePtr() { return &_val; }
   double *deviceWritePtr() { return &_val; }

   void assignFromHost(std::span<const double> input) { _val = input[0]; }
   void assignFromDevice(std::span<const double> input)
   {
      // ...
   }
   // ...
};
class CPUBufferContainer {
public:
   // ...
   double const *hostReadPtr() const { return _vec.data(); }
   double const *deviceReadPtr() const
   {
      throw std::bad_function_call();
   }

   double *hostWritePtr() { return _vec.data(); }
   double *deviceWritePtr()
   {
      throw std::bad_function_call();
   }

   void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
   void assignFromDevice(std::span<const double> input)
   {
      // ...
   }
   // ...
};
class GPUBufferContainer {
public:
   // ...
   double const *hostReadPtr() const
   {
      throw std::bad_function_call();
   }
   double const *deviceReadPtr() const { return _arr.data(); }

   double *hostWritePtr() const
   {
      throw std::bad_function_call();
   }
   double *deviceWritePtr() const { return const_cast<double *>(_arr.data()); }

   void assignFromHost(std::span<const double> input)
   {
      // ...
   }
   void assignFromDevice(std::span<const double> input)
   {
      // ...
   }

private:
   CudaInterface::DeviceArray<double> _arr;
};
class PinnedBufferContainer {
public:
   // ...
   std::size_t size() const { return _arr.size(); }

   void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }

   double const *hostReadPtr() const
   {
      // ...
      if (_lastAccess == LastAccessType::GPU_WRITE) {
         // ...
      }
      // ...
      return const_cast<double *>(_arr.data());
   }
   double const *deviceReadPtr() const
   {
      // ...
      if (_lastAccess == LastAccessType::CPU_WRITE) {
         // ...
      }
      // ...
   }

   double *hostWritePtr()
   {
      // ...
   }
   double *deviceWritePtr()
   {
      // ...
   }

   void assignFromHost(std::span<const double> input) { std::copy(input.begin(), input.end(), hostWritePtr()); }
   void assignFromDevice(std::span<const double> input)
   {
      // ...
   }

private:
   enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };

   CudaInterface::PinnedHostArray<double> _arr;
   // ...
};
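The _lastAccess bookkeeping above is what lets the pinned container defer host/device transfers until a read actually happens on the other side; the bodies of the write accessors are elided in this listing, so the following is only a simplified sketch of that lazy-sync idea, with a hypothetical syncToHost() helper standing in for the real CUDA copy.

// Simplified dirty-flag synchronization; MirroredBuffer and syncToHost are hypothetical stand-ins.
#include <cstddef>
#include <vector>

class MirroredBuffer {
public:
   explicit MirroredBuffer(std::size_t n) : _host(n), _device(n) {}

   const double *hostReadPtr()
   {
      if (_last == Last::DeviceWrite)
         syncToHost(); // only copy when the device side is newer
      _last = Last::HostRead;
      return _host.data();
   }
   double *deviceWritePtr()
   {
      _last = Last::DeviceWrite; // mark the device copy as the authoritative one
      return _device.data();
   }

private:
   enum class Last { HostRead, HostWrite, DeviceRead, DeviceWrite };

   void syncToHost() { _host = _device; } // stands in for a copyDeviceToHost(...) call

   std::vector<double> _host;
   std::vector<double> _device; // stands in for device memory
   Last _last = Last::HostWrite;
};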
template <class Container>
class BufferImpl : public AbsBuffer {
public:
   using Queue = std::queue<std::unique_ptr<Container>>;

   BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
   {
      // ...
      _vec = std::make_unique<Container>(size);
      // ...
   }

   ~BufferImpl() override { _queue.emplace(std::move(_vec)); }

   double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
   double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }

   double *hostWritePtr() override { return _vec->hostWritePtr(); }
   double *deviceWritePtr() override { return _vec->deviceWritePtr(); }

   void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
   void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }

   Container &vec() { return *_vec; }

private:
   std::unique_ptr<Container> _vec;
   // ...
};

using ScalarBuffer = BufferImpl<ScalarBufferContainer>;
using CPUBuffer = BufferImpl<CPUBufferContainer>;
using GPUBuffer = BufferImpl<GPUBufferContainer>;
using PinnedBuffer = BufferImpl<PinnedBufferContainer>;
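BufferImpl ties a container to a recycling queue: the destructor pushes the container back onto the queue, so repeated allocations of the same size can reuse storage instead of paying for a fresh allocation (or cudaMalloc) each time. The sketch below shows that pool pattern in isolation, with a plain std::vector<double> as the container; the reuse branch in the constructor is an assumption about what the elided constructor lines do, not a copy of them.

// Minimal recycle-on-destruction pool; PooledBuffer is a stand-in, not the real BufferImpl.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <queue>
#include <vector>

using Container = std::vector<double>;
using Queue = std::queue<std::unique_ptr<Container>>;

class PooledBuffer {
public:
   PooledBuffer(std::size_t size, Queue &queue) : _queue{queue}
   {
      if (_queue.empty()) {
         _vec = std::make_unique<Container>(size); // no spare container: allocate a new one
      } else {
         _vec = std::move(_queue.front());         // reuse a previously returned container
         _queue.pop();
         _vec->resize(size);
      }
   }
   ~PooledBuffer() { _queue.emplace(std::move(_vec)); } // hand the storage back to the pool

   double *data() { return _vec->data(); }

private:
   std::unique_ptr<Container> _vec;
   Queue &_queue;
};

int main()
{
   Queue queue;
   { PooledBuffer a{1024, queue}; a.data()[0] = 1.0; } // first use allocates
   { PooledBuffer b{1024, queue}; b.data()[0] = 2.0; } // second use recycles the same storage
   std::printf("containers parked in the pool: %zu\n", queue.size()); // prints 1
}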
struct BufferQueuesMaps {
   // ...
};

class BufferManager : public AbsBufferManager {
public:
   BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}

   std::unique_ptr<AbsBuffer> makeScalarBuffer() override
   {
      return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
   }
   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override
   {
      // ...
   }
   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) override
   {
      // ...
   }
   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) override
   {
      // ...
      out->vec().setCudaStream(stream);
      // ...
   }
   // ...
};

// createBufferManager():
   return std::make_unique<BufferManager>();
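A hedged usage sketch follows; it assumes the createBufferManager() factory whose return statement is shown above is available together with the AbsBuffer/AbsBufferManager interfaces reproduced earlier, and that assignFromHost stages host data on the device as its name suggests.

// Hypothetical usage of the buffer manager; depends on the RooBatchCompute headers.
#include <cstddef>
#include <span>

void stageOnDevice(std::span<const double> hostValues)
{
   auto bufferManager = createBufferManager();        // returns std::make_unique<BufferManager>()

   auto gpuBuf = bufferManager->makeGpuBuffer(hostValues.size());
   gpuBuf->assignFromHost(hostValues);                // copy host data into device storage
   const double *devPtr = gpuBuf->deviceReadPtr();    // device pointer usable by kernels
   (void)devPtr;

   // Scalar parameters get the size-1 specialization with its own recycling queue.
   auto scalarBuf = bufferManager->makeScalarBuffer();
   *scalarBuf->hostWritePtr() = 3.14;
}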