30#error "RF_ARCH should always be defined"
// fillArrays: build one Batch descriptor per input variable.
// Scalar variables (span of size 1) are routed through the i-th slot of the
// per-variable scalar buffer, so the Batch points at the DEVICE copy of that
// slot (isVector == false, presumably meaning the single value is broadcast —
// confirm against Batch::set); full-length variables point straight at the
// span's own data (isVector == true).
// NOTE(review): this chunk looks like an incomplete extraction — the write of
// span[0] into `buffer`, the `else` keyword and the closing braces are not
// visible here (inner lines 55/57-58 missing); confirm against the full
// source before editing.
50void fillArrays(
Batch *arrays,
const VarVector &vars,
double *buffer,
double *bufferDevice)
52 for (
int i = 0; i < vars.size(); i++) {
53 const std::span<const double> &span = vars[i];
// Scalar case: point the Batch at slot i of the device-side scalar buffer.
54 if (span.size() == 1) {
56 arrays[i].
set(bufferDevice + i,
false);
// Vector case: the Batch reads the span's data directly.
59 arrays[i].
set(span.data(),
true);
// getGridSize: number of thread blocks needed to cover n events, computed as
// ceil(n / blockSize) and clamped to maxGridSize.
// NOTE(review): the cap of 84 blocks is presumably tuned to a target GPU's
// multiprocessor count — confirm before changing. `blockSize` is a constant
// defined elsewhere in this file.
64int getGridSize(std::size_t
n)
77 constexpr int maxGridSize = 84;
78 return std::min(
int(std::ceil(
double(
n) /
blockSize)), maxGridSize);
// architectureName (fragment): stringify the RF_ARCH macro via _QUOTE_ and
// lower-case the result in place. The lambda parameter is `unsigned char` so
// std::tolower is never handed a negative plain-char value (which would be
// undefined behaviour).
101 std::string out =
_QUOTE_(RF_ARCH);
102 std::transform(out.begin(), out.end(), out.begin(), [](
unsigned char c) { return std::tolower(c); });
// compute() (fragment): stage all kernel arguments in ONE contiguous host
// buffer and mirror it into one device allocation, so a single host-to-device
// copy transfers everything. Layout (identical on host and device):
//   [Batches header][Batch x vars.size()][double scalarBuffer x vars.size()]
//   [double extraArgs x extraArgs.size()]
119 const std::size_t memSize =
sizeof(
Batches) + vars.size() *
sizeof(
Batch) + vars.size() *
sizeof(
double) +
120 extraArgs.size() *
sizeof(
double);
122 std::vector<char> hostMem(memSize);
// Carve the host staging buffer into its sections via pointer arithmetic.
123 auto batches =
reinterpret_cast<Batches *
>(hostMem.data());
124 auto arrays =
reinterpret_cast<Batch *
>(batches + 1);
125 auto scalarBuffer =
reinterpret_cast<double *
>(arrays + vars.size());
126 auto extraArgsHost =
reinterpret_cast<double *
>(scalarBuffer + vars.size());
// Same carving over the device allocation: these DEVICE addresses are what
// gets written into the host-filled structs below, because the structs are
// only dereferenced on the GPU after the copy.
129 auto batchesDevice =
reinterpret_cast<Batches *
>(deviceMem.
data());
130 auto arraysDevice =
reinterpret_cast<Batch *
>(batchesDevice + 1);
131 auto scalarBufferDevice =
reinterpret_cast<double *
>(arraysDevice + vars.size());
132 auto extraArgsDevice =
reinterpret_cast<double *
>(scalarBufferDevice + vars.size());
134 fillBatches(*batches,
output, nEvents, vars.size(), extraArgs.size());
135 fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice);
136 batches->_arrays = arraysDevice;
// Extra arguments are copied into the staging buffer and published via their
// device address only when present.
138 if (!extraArgs.empty()) {
139 std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
140 batches->_extraArgs = extraArgsDevice;
// One bulk host-to-device transfer of the whole staging buffer on the
// caller-provided CUDA stream.
143 copyHostToDevice(hostMem.data(), deviceMem.
data(), hostMem.size(), cfg.cudaStream());
145 const int gridSize = getGridSize(nEvents);
// Kernels may update extraArgs in place on the device; copy them back after
// the launch. NOTE(review): the kernel launch itself (original lines
// ~146-150) is not visible in this extraction — confirm against full source.
151 if (!extraArgs.empty()) {
152 copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
158 std::span<const double> weights, std::span<const double> offsetProbas)
override;
// kahanSumUpdate (fragment): one step of compensated (Kahan) summation that
// also folds in a second accumulator's carry term.
//   y     = increment corrected by both accumulated rounding errors
//   t     = candidate new sum
//   carry = low-order bits lost when forming t (algebraically 0; in floating
//           point it recovers the rounding error)
// NOTE(review): the store of t back into `sum` (original lines 166-167) is
// not visible in this extraction — it must exist for the update to be
// complete; confirm against the full source.
164 const double y =
a - (carry + otherCarry);
165 const double t =
sum +
y;
168 carry = (t -
sum) -
y;
// kahanSumReduction (fragment): shared-memory tree reduction. Each pass
// halves the active stride i; thread t folds slot t+i (sum and carry) into
// slot t via kahanSumUpdate. Sums live in shared[0 .. blockDim.x), carries in
// shared[blockDim.x .. 2*blockDim.x) — see carry_index in the kernels below.
// NOTE(review): a __syncthreads() between reduction passes (original lines
// 181-185) is not visible in this extraction — one is required to avoid a
// shared-memory race; confirm it is present in the full source.
178 for (
int i = blockDim.x / 2; i > 0; i >>= 1) {
179 if (threadIdx.x < i && (threadIdx.x + i) <
n) {
180 kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
// Thread 0 publishes the block's result: sum at result[blockIdx.x], carry at
// result[blockIdx.x + gridDim.x] (all carries stored after all block sums).
186 if (threadIdx.x == 0) {
187 result[blockIdx.x] = shared[0];
188 result[blockIdx.x + gridDim.x] = shared[carry_index];
// kahanSum: grid-stride compensated summation of input[0..n).
// When nll is true, each element is mapped to -log(input[i]) before summing
// (negative log-likelihood terms). `carries` optionally supplies per-slot
// carry terms from a previous pass (the second-stage launches elsewhere in
// this file pass devOut + gridSize); presumably it may be nullptr — confirm
// against the non-visible part of the loop body.
// Requires dynamic shared memory of 2 * blockDim.x doubles: per-thread sums
// in the first half, carries in the second half.
192__global__
void kahanSum(
const double *__restrict__
input,
const double *__restrict__ carries,
size_t n,
193 double *__restrict__
result,
bool nll)
195 int thIdx = threadIdx.x;
// Flat global index; blockSize is the file-level constant block width.
196 int gthIdx = thIdx + blockIdx.x *
blockSize;
197 int carry_index = threadIdx.x + blockDim.x;
198 const int nThreadsTotal =
blockSize * gridDim.x;
201 extern __shared__
double shared[];
// Grid-stride loop: each thread accumulates a private Kahan partial.
206 for (
int i = gthIdx; i <
n; i += nThreadsTotal) {
209 double val = nll == 1 ? -std::log(
input[i]) :
input[i];
// Stash this thread's carry next to its sum for the block-level reduction.
// NOTE(review): the matching shared[thIdx] = sum store and the call into
// kahanSumReduction are not visible in this extraction.
214 shared[carry_index] = carry;
// nllSumKernel: grid-stride negative-log-likelihood accumulation.
// Per event: val = -log(probas[i]), offset by +log(offsetProbas[i]) and
// scaled by weights[i]; partials are combined with Kahan summation in dynamic
// shared memory (2 * blockDim.x doubles) as in kahanSum above.
// `weights` and `offsetProbas` may be nullptr (see the launch in reduceNLL);
// NOTE(review): the nullptr guards around the offset/weight lines (original
// lines 240-243) are not fully visible in this extraction — confirm.
222__global__
void nllSumKernel(
const double *__restrict__ probas,
const double *__restrict__ weights,
223 const double *__restrict__ offsetProbas,
size_t n,
double *__restrict__
result)
225 int thIdx = threadIdx.x;
// Flat global index; blockSize is the file-level constant block width.
226 int gthIdx = thIdx + blockIdx.x *
blockSize;
227 int carry_index = threadIdx.x + blockDim.x;
228 const int nThreadsTotal =
blockSize * gridDim.x;
231 extern __shared__
double shared[];
// Grid-stride loop: each thread accumulates a private Kahan partial.
236 for (
int i = gthIdx; i <
n; i += nThreadsTotal) {
239 double val = -std::log(probas[i]);
241 val += std::log(offsetProbas[i]);
243 val = weights[i] * val;
// Carry goes to the upper half of shared memory for the block reduction.
// NOTE(review): the matching sum store and reduction call are not visible.
248 shared[carry_index] = carry;
// reduceSum (fragment): two-stage Kahan reduction of input[0..n) on the
// caller's CUDA stream.
// Stage 1: gridSize blocks each write a partial sum to devOut[blockIdx] and
// its carry to devOut[blockIdx + gridSize] (no input carries -> nullptr).
// Stage 2: a single block re-reduces those partials (sums at devOut, carries
// at devOut + gridSize) back into devOut. Final nll argument 0 = plain sum.
258 const int gridSize = getGridSize(
n);
259 cudaStream_t stream = *cfg.cudaStream();
262 kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(
input,
nullptr,
n, devOut.
data(), 0);
263 kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.
data(), devOut.
data() + gridSize, gridSize, devOut.
data(), 0);
// reduceNLL (fragment): weighted negative-log-likelihood sum on the GPU.
// A size-1 weights span denotes a constant event weight: the kernel is then
// launched with nullptr weights and the single weight is applied once to the
// final sum and carry below — cheaper than per-event multiplication and
// numerically equivalent. An empty offsetProbas span disables the offset.
270 std::span<const double> weights, std::span<const double> offsetProbas)
273 const int gridSize = getGridSize(probas.size());
275 cudaStream_t stream = *cfg.cudaStream();
278 nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
279 probas.data(), weights.size() == 1 ? nullptr : weights.data(),
280 offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.
data());
// Second stage: one block folds the gridSize partial sums (carries stored at
// devOut + gridSize) into a single sum/carry pair at the front of devOut.
282 kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.
data(), devOut.
data() + gridSize, gridSize, devOut.
data(), 0);
285 double tmpCarry = 0.0;
// Constant-weight case: scale the reduced sum and its carry exactly once.
// NOTE(review): the device-to-host readback of tmpSum/tmpCarry (original
// lines 286-288) is not visible in this extraction.
289 if (weights.size() == 1) {
290 tmpSum *= weights[0];
291 tmpCarry *= weights[0];
295 out.nllSumCarry = tmpCarry;
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void set(InputArr array, bool isVector)
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
std::string architectureName() const override
const std::vector< void(*)(BatchesHandle)> _computeFunctions
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars, ArgVector &extraArgs) override
Compute multiple values using cuda kernels.
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
Architecture architecture() const override
The interface which should be implemented to provide optimised computation functions for implementati...
A templated class for managing an array of data using a specified memory type.
Data_t * data()
Get a pointer to the start of the array.
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n, double *__restrict__ result, bool nll)
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
__device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
std::vector< void(*)(BatchesHandle)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights, const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::vector< std::span< const double > > VarVector
const double *__restrict InputArr
std::vector< double > ArgVector
double *__restrict RestrictArr
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
static uint64_t sum(uint64_t i)