RooBatchCompute.cu
/*
 * Project: RooFit
 * Authors:
 *   Emmanouil Michalainas, CERN, September 2020
 *
 * Copyright (c) 2021, CERN
 *
 * Redistribution and use in source and binary forms,
 * with or without modification, are permitted according to the terms
 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
 */

/**
\file RooBatchCompute.cu
\class RbcClass
\ingroup roofit_dev_docs_batchcompute

This file contains the code for CUDA computations using the RooBatchCompute library.
**/

#include "RooBatchCompute.h"
#include "Batches.h"
#include "CudaInterface.h"

#include <algorithm>
#include <cassert>
#include <functional>
#include <map>
#include <queue>
#include <vector>

namespace RooBatchCompute {
namespace CUDA {

constexpr int blockSize = 512;

namespace {

void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   batches.nEvents = nEvents;
   batches.nBatches = nBatches;
   batches.nExtra = nExtraArgs;
   batches.output = output;
}

void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
{
   for (std::size_t i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
      if (!arrays[i]._isVector) {
         // In the scalar case, the value is not on the GPU yet, so we have to
         // copy the value to the GPU buffer.
         buffer[i] = span[0];
         arrays[i]._array = bufferDevice + i;
      } else {
         // In the vector input case, the values are already on the GPU, so we
         // can fill the buffer with a dummy value and set the input span
         // directly.
         buffer[i] = 0.0;
         arrays[i]._array = span.data();
      }
   }
}
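
// Illustrative note (added): in a typical call, an observable span holds one value per
// event and is already device-resident, so it is used directly, while a fit parameter is
// a span of size 1 whose value is staged through `buffer` and read by the kernels from
// `bufferDevice`.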

int getGridSize(std::size_t n)
{
   // The grid size should be not larger than the order of the number of streaming
   // multiprocessors (SMs) in an Nvidia GPU. The number 84 was chosen because
   // the developers were using an Nvidia RTX A4500, which has 56 SMs. This was
   // multiplied by a factor of 1.5, as recommended on Stack Overflow.
   //
   // But when there are not enough elements to load the GPU, the number should
   // be lower: that's why there is the std::ceil().
   //
   // Note: for grid sizes larger than 512, the Kahan summation kernels give
   // wrong results. This problem is not understood, but also not really worth
   // investigating further, as that number is unreasonably large anyway.
   constexpr int maxGridSize = 84;
   return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
}
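
// Worked example (added): with blockSize = 512, n = 10'000 events gives
// ceil(10000 / 512) = 20 blocks, well below maxGridSize, while n = 1'000'000 would
// request 1954 blocks and is clamped to 84.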

} // namespace

std::vector<void (*)(Batches &)> getFunctions();

/// This class overrides some RooBatchComputeInterface functions, for the
/// purpose of providing a CUDA-specific implementation of the library.
class RooBatchComputeClass : public RooBatchComputeInterface {
public:
   RooBatchComputeClass() : _computeFunctions(getFunctions())
   {
      dispatchCUDA = this; // Set the dispatch pointer to this instance of the library upon loading
   }

   Architecture architecture() const override { return Architecture::CUDA; }
   std::string architectureName() const override { return "cuda"; }

   /** Compute multiple values using CUDA kernels.
   This method creates a Batches object and passes it to the correct compute function.
   The compute function is launched as a CUDA kernel.
   \param computer An enum specifying the compute function to be used.
   \param output The array where the computation results are stored.
   \param vars A std::span of spans with the values of the variables involved in the computation.
   \param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
   void compute(RooBatchCompute::Config const &cfg, Computer computer, std::span<double> output, VarSpan vars,
                ArgSpan extraArgs) override
   {
      using namespace CudaInterface;

      std::size_t nEvents = output.size();

      const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                                  extraArgs.size() * sizeof(double);

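      // Added note: host and device each hold one contiguous allocation with the layout
      // [Batches header | one Batch per variable | one staging double per variable for
      // scalar inputs | extraArgs values]; the casts below carve out these sections.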
      std::vector<char> hostMem(memSize);
      auto batches = reinterpret_cast<Batches *>(hostMem.data());
      auto arrays = reinterpret_cast<Batch *>(batches + 1);
      auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
      auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

      DeviceArray<char> deviceMem(memSize);
      auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
      auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
      auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
      auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());

      fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
      fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
      batches->args = arraysDevice;

      if (!extraArgs.empty()) {
         std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
         batches->extra = extraArgsDevice;
      }

      copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

      const int gridSize = getGridSize(nEvents);
      _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);

      // The compute might have modified the mutable extra args, so we need to
      // copy them back. This can be optimized if necessary in the future by
      // flagging if the extra args were actually changed.
      if (!extraArgs.empty()) {
         copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
      }
   }
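
   // Hypothetical usage sketch (added; names `cfg`, `comp`, `devOutPtr`, `vars` are
   // placeholders): a caller with device-resident inputs dispatches a kernel via the
   // interface pointer, e.g.
   //    std::span<double> out{devOutPtr, nEvents};
   //    dispatchCUDA->compute(cfg, comp, out, vars, {});
   // where `comp` is the RooBatchCompute::Computer value selecting the compute function
   // and `cfg` wraps the CUDA stream to launch on.
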
   /// Return the sum of an input array
   double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override;
   ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                             std::span<const double> weights, std::span<const double> offsetProbas) override;

   std::unique_ptr<AbsBufferManager> createBufferManager() const;

   CudaInterface::CudaEvent *newCudaEvent(bool forTiming) const override
   {
      return new CudaInterface::CudaEvent{forTiming};
   }
   CudaInterface::CudaStream *newCudaStream() const override { return new CudaInterface::CudaStream{}; }
   void deleteCudaEvent(CudaInterface::CudaEvent *event) const override { delete event; }
   void deleteCudaStream(CudaInterface::CudaStream *stream) const override { delete stream; }

   void cudaEventRecord(CudaInterface::CudaEvent *event, CudaInterface::CudaStream *stream) const override
   {
      CudaInterface::cudaEventRecord(*event, *stream);
   }
   void cudaStreamWaitForEvent(CudaInterface::CudaStream *stream, CudaInterface::CudaEvent *event) const override
   {
      stream->waitForEvent(*event);
   }
   bool cudaStreamIsActive(CudaInterface::CudaStream *stream) const override { return stream->isActive(); }

private:
   const std::vector<void (*)(Batches &)> _computeFunctions;

}; // End class RooBatchComputeClass

inline __device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   // carry is zero the first time around; afterwards it holds the (negative) low-order
   // part that was lost in the previous summation step.
   const double y = a - (carry + otherCarry);
   const double t = sum + y; // Alas, sum is big, y small, so low-order digits of y are lost.

   // (t - sum) cancels the high-order part of y; subtracting y recovers the negative low
   // part of y. Next time around, the lost low part will be added to y in a fresh attempt.
   carry = (t - sum) - y;

   // Algebraically, carry should always be zero. Beware overly-aggressive optimizing compilers!
   sum = t;
}
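
// Numerical illustration (added): with sum = 1e16 and a = 1.0, t = sum + y rounds back
// to 1e16, so the 1.0 would be lost; the update stores carry = (t - sum) - y = -1.0,
// and the lost amount is re-injected into y on the next call.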

// This is the same implementation as ROOT::Math::KahanSum::operator+=(KahanSum), but on the GPU.
inline __device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // Stride in the first iteration = half of the block dim. Then half of that, and so on.
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      // Wait until all threads of the block have finished their work before the next stride.
      __syncthreads();
   }

   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
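
// Added note: for a full block of 512 threads the loop strides are 256, 128, ..., 1,
// i.e. nine passes that fold the per-thread partial sums and carries down to shared[0]
// and shared[blockDim.x], which thread 0 then writes to the per-block result slots.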

__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory is for storing the summation and the second
   // half for the carry or compensation.
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // Note: it does not make sense to use the nll option and provide at the
      // same time external carries.
      double val = nll == 1 ? -std::log(input[i]) : input[i];
      kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}

__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory is for storing the summation and the second
   // half for the carry or compensation.
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      double val = -std::log(probas[i]);
      if (offsetProbas)
         val += std::log(offsetProbas[i]);
      if (weights)
         val = weights[i] * val;
      kahanSumUpdate(sum, carry, val, 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}

double RooBatchComputeClass::reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n)
{
   if (n == 0)
      return 0.0;
   const int gridSize = getGridSize(n);
   cudaStream_t stream = *cfg.cudaStream();
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   constexpr int shMemSize = 2 * blockSize * sizeof(double);
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
   double tmp = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
   return tmp;
}
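
// Added note: the reduction is two-pass. The first kahanSum launch writes gridSize
// partial sums to devOut[0..gridSize) and their carries to devOut[gridSize..2*gridSize);
// the second launch uses a single block to fold those partials into devOut[0]. The
// shared-memory request is 2 * 512 * sizeof(double) = 8192 bytes per block: one half
// for sums, one half for carries.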

ReduceNLLOutput RooBatchComputeClass::reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                                                std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;
   if (probas.empty()) {
      return out;
   }
   const int gridSize = getGridSize(probas.size());
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   cudaStream_t stream = *cfg.cudaStream();
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

#ifndef NDEBUG
   for (auto span : {probas, weights, offsetProbas}) {
      cudaPointerAttributes attr;
      assert(span.size() == 0 || span.data() == nullptr ||
             (cudaPointerGetAttributes(&attr, span.data()) == cudaSuccess && attr.type == cudaMemoryTypeDevice));
   }
#endif

   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());

   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmpSum = 0.0;
   double tmpCarry = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmpSum, 1, cfg.cudaStream());
   CudaInterface::copyDeviceToHost(devOut.data() + 1, &tmpCarry, 1, cfg.cudaStream());

   if (weights.size() == 1) {
      tmpSum *= weights[0];
      tmpCarry *= weights[0];
   }

   out.nllSum = tmpSum;
   out.nllSumCarry = tmpCarry;
   return out;
}

namespace {

class ScalarBufferContainer {
public:
   ScalarBufferContainer() {}
   ScalarBufferContainer(std::size_t size)
   {
      if (size != 1)
         throw std::runtime_error("ScalarBufferContainer can only be of size 1");
   }

   double const *hostReadPtr() const { return &_val; }
   double const *deviceReadPtr() const { return &_val; }

   double *hostWritePtr() { return &_val; }
   double *deviceWritePtr() { return &_val; }

   void assignFromHost(std::span<const double> input) { _val = input[0]; }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), &_val, input.size(), nullptr);
   }

private:
   double _val;
};

class CPUBufferContainer {
public:
   CPUBufferContainer(std::size_t size) : _vec(size) {}

   double const *hostReadPtr() const { return _vec.data(); }
   double const *deviceReadPtr() const
   {
      throw std::bad_function_call();
      return nullptr;
   }

   double *hostWritePtr() { return _vec.data(); }
   double *deviceWritePtr()
   {
      throw std::bad_function_call();
      return nullptr;
   }

   void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), _vec.data(), input.size(), nullptr);
   }

private:
   std::vector<double> _vec;
};

class GPUBufferContainer {
public:
   GPUBufferContainer(std::size_t size) : _arr(size) {}

   double const *hostReadPtr() const
   {
      throw std::bad_function_call();
      return nullptr;
   }
   double const *deviceReadPtr() const { return _arr.data(); }

   double *hostWritePtr() const
   {
      throw std::bad_function_call();
      return nullptr;
   }
   double *deviceWritePtr() const { return const_cast<double *>(_arr.data()); }

   void assignFromHost(std::span<const double> input)
   {
      CudaInterface::copyHostToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
   }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), nullptr);
   }

private:
   CudaInterface::DeviceArray<double> _arr;
};

class PinnedBufferContainer {
public:
   PinnedBufferContainer(std::size_t size) : _arr{size}, _gpuBuffer{size} {}
   std::size_t size() const { return _arr.size(); }

   void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }

   double const *hostReadPtr() const
   {
      if (_lastAccess == LastAccessType::GPU_WRITE) {
         CudaInterface::copyDeviceToHost(_gpuBuffer.deviceReadPtr(), const_cast<double *>(_arr.data()), size(),
                                         _cudaStream);
      }

      _lastAccess = LastAccessType::CPU_READ;
      return const_cast<double *>(_arr.data());
   }
   double const *deviceReadPtr() const
   {
      if (_lastAccess == LastAccessType::CPU_WRITE) {
         CudaInterface::copyHostToDevice(_arr.data(), _gpuBuffer.deviceWritePtr(), size(), _cudaStream);
      }

      _lastAccess = LastAccessType::GPU_READ;
      return _gpuBuffer.deviceReadPtr();
   }

   double *hostWritePtr()
   {
      _lastAccess = LastAccessType::CPU_WRITE;
      return _arr.data();
   }
   double *deviceWritePtr()
   {
      _lastAccess = LastAccessType::GPU_WRITE;
      return _gpuBuffer.deviceWritePtr();
   }

   void assignFromHost(std::span<const double> input) { std::copy(input.begin(), input.end(), hostWritePtr()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), _cudaStream);
   }

private:
   enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };

   CudaInterface::PinnedHostArray<double> _arr;
   GPUBufferContainer _gpuBuffer;
   CudaInterface::CudaStream *_cudaStream = nullptr;
   mutable LastAccessType _lastAccess = LastAccessType::CPU_READ;
};
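
// Behavioural note (added): the pinned buffer synchronises lazily based on _lastAccess.
// A hostWritePtr() followed by deviceReadPtr() triggers one host-to-device copy on
// _cudaStream; a GPU write followed by hostReadPtr() copies the data back into the
// pinned host array. Reads trigger no copy unless the other side was written last.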

template <class Container>
class BufferImpl : public AbsBuffer {
public:
   using Queue = std::queue<std::unique_ptr<Container>>;

   BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
   {
      if (_queue.empty()) {
         _vec = std::make_unique<Container>(size);
      } else {
         _vec = std::move(_queue.front());
         _queue.pop();
      }
   }

   ~BufferImpl() override { _queue.emplace(std::move(_vec)); }

   double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
   double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }

   double *hostWritePtr() override { return _vec->hostWritePtr(); }
   double *deviceWritePtr() override { return _vec->deviceWritePtr(); }

   void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
   void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }

   Container &vec() { return *_vec; }

private:
   std::unique_ptr<Container> _vec;
   Queue &_queue;
};
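
// Added note: BufferImpl recycles containers. The constructor pops a previously used
// container from the per-size queue when one is available, and the destructor pushes
// the container back, so repeated requests for equal-sized buffers avoid re-allocating
// device or pinned host memory.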

using ScalarBuffer = BufferImpl<ScalarBufferContainer>;
using CPUBuffer = BufferImpl<CPUBufferContainer>;
using GPUBuffer = BufferImpl<GPUBufferContainer>;
using PinnedBuffer = BufferImpl<PinnedBufferContainer>;

struct BufferQueuesMaps {
   std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;
   std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;
   std::map<std::size_t, GPUBuffer::Queue> gpuBufferQueuesMap;
   std::map<std::size_t, PinnedBuffer::Queue> pinnedBufferQueuesMap;
};

class BufferManager : public AbsBufferManager {

public:
   BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}

   std::unique_ptr<AbsBuffer> makeScalarBuffer() override
   {
      return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
   }
   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override
   {
      return std::make_unique<CPUBuffer>(size, _queuesMaps->cpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) override
   {
      return std::make_unique<GPUBuffer>(size, _queuesMaps->gpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) override
   {
      auto out = std::make_unique<PinnedBuffer>(size, _queuesMaps->pinnedBufferQueuesMap[size]);
      out->vec().setCudaStream(stream);
      return out;
   }

private:
   std::unique_ptr<BufferQueuesMaps> _queuesMaps;
};

} // namespace

std::unique_ptr<AbsBufferManager> RooBatchComputeClass::createBufferManager() const
{
   return std::make_unique<BufferManager>();
}

/// Static object to trigger the constructor which overwrites the dispatch pointer.
static RooBatchComputeClass computeObj;

} // End namespace CUDA
} // End namespace RooBatchCompute