RooBatchCompute.cu
/*
 * Project: RooFit
 * Authors:
 *   Emmanouil Michalainas, CERN, September 2020
 *
 * Copyright (c) 2021, CERN
 *
 * Redistribution and use in source and binary forms,
 * with or without modification, are permitted according to the terms
 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
 */

/**
\file RooBatchCompute.cu
\class RbcClass
\ingroup Roobatchcompute

This file contains the code for CUDA computations using the RooBatchCompute library.
**/

#include "RooBatchCompute.h"
#include "Batches.h"

#include <ROOT/RConfig.hxx>
#include <TError.h>

#include <algorithm>

#ifndef RF_ARCH
#error "RF_ARCH should always be defined"
#endif

namespace CudaInterface = RooFit::Detail::CudaInterface;

namespace RooBatchCompute {
namespace RF_ARCH {

constexpr int blockSize = 512;

namespace {

void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   batches._nEvents = nEvents;
   batches._nBatches = nBatches;
   batches._nExtraArgs = nExtraArgs;
   batches._output = output;
}

void fillArrays(Batch *arrays, const VarVector &vars, double *buffer, double *bufferDevice)
{
   for (int i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      if (span.size() == 1) {
         buffer[i] = span[0];
         arrays[i].set(bufferDevice + i, false);
      } else {
         buffer[i] = 0.0;
         arrays[i].set(span.data(), true);
      }
   }
}
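
// Illustration of the packing above (hypothetical values, not part of the
// original source): a VarVector with one per-event array and one scalar
// parameter,
//
//    vars = { span of 1000 x-values, span of size 1 holding a scalar mean };
//    fillArrays(arrays, vars, buffer, bufferDevice);
//    // arrays[0] -> vars[0].data(), flagged as a vector batch
//    // arrays[1] -> bufferDevice + 1, the staged copy of the scalar
//
// Scalars are staged in the host-side `buffer` and only become readable at
// `bufferDevice + i` after compute() below copies the whole host block to the
// device; vector spans are assumed to already reference device-resident data.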

int getGridSize(std::size_t n)
{
   // The grid size should not be larger than the order of the number of
   // streaming multiprocessors (SMs) in an Nvidia GPU. The number 84 was
   // chosen because the developers were using an Nvidia RTX A4500, which has
   // 56 SMs. This was multiplied by a factor of 1.5, as recommended by
   // stackoverflow.
   //
   // But when there are not enough elements to load the GPU, the number should
   // be lower: that's why there is the std::ceil().
   //
   // Note: for grid sizes larger than 512, the Kahan summation kernels give
   // wrong results. This problem is not understood, but also not really worth
   // investigating further, as that number is unreasonably large anyway.
   constexpr int maxGridSize = 84;
   return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
}
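
// Worked example of the arithmetic above: with blockSize = 512, one million
// events request ceil(1000000 / 512) = 1954 blocks, which is clamped to
// maxGridSize = 84; a batch of only 100 events yields ceil(100 / 512) = 1
// block, so the GPU is never asked for more blocks than there is work.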

} // namespace

/// Returns a std::vector of pointers to the compute functions in this file.
std::vector<void (*)(BatchesHandle)> getFunctions();

/// This class overrides some RooBatchComputeInterface functions, for the
/// purpose of providing a CUDA-specific implementation of the library.
class RooBatchComputeClass : public RooBatchComputeInterface {
private:
   const std::vector<void (*)(BatchesHandle)> _computeFunctions;

public:
   RooBatchComputeClass() : _computeFunctions(getFunctions())
   {
      dispatchCUDA = this; // Set the dispatch pointer to this instance of the library upon loading
   }

   Architecture architecture() const override { return Architecture::RF_ARCH; };
   std::string architectureName() const override
   {
      // transform to lower case to match the original architecture name passed to the compiler
      std::string out = _QUOTE_(RF_ARCH);
      std::transform(out.begin(), out.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
      return out;
   };

   /** Compute multiple values using CUDA kernels.
   This method creates a Batches object and passes it to the correct compute function.
   The compute function is launched as a CUDA kernel.
   \param computer An enum specifying the compute function to be used.
   \param output The array where the computation results are stored.
   \param nEvents The number of events to be processed.
   \param vars A std::vector containing pointers to the variables involved in the computation.
   \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/
   void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents,
                const VarVector &vars, ArgVector &extraArgs) override
   {
      using namespace RooFit::Detail::CudaInterface;

      const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                                  extraArgs.size() * sizeof(double);

      std::vector<char> hostMem(memSize);
      auto batches = reinterpret_cast<Batches *>(hostMem.data());
      auto arrays = reinterpret_cast<Batch *>(batches + 1);
      auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
      auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

      DeviceArray<char> deviceMem(memSize);
      auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
      auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
      auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
      auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());
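
      // Layout of the single host allocation, mirrored one-to-one by the
      // device allocation (the terms of memSize above, in order):
      //
      //   | Batches | Batch[vars.size()] | double[vars.size()] | double[extraArgs.size()] |
      //     header    per-variable spans   scalar staging area   extra arguments
      //
      // Packing everything contiguously lets the single copyHostToDevice()
      // call below transfer the header, the span table, the staged scalars
      // and the extra arguments in one transaction.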

      fillBatches(*batches, output, nEvents, vars.size(), extraArgs.size());
      fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice);
      batches->_arrays = arraysDevice;

      if (!extraArgs.empty()) {
         std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
         batches->_extraArgs = extraArgsDevice;
      }

      copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

      const int gridSize = getGridSize(nEvents);
      _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);

      // The compute might have modified the mutable extra args, so we need to
      // copy them back. This can be optimized if necessary in the future by
      // flagging if the extra args were actually changed.
      if (!extraArgs.empty()) {
         copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
      }
   }
   /// Return the sum of an input array.
   double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override;
   ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                             std::span<const double> weights, std::span<const double> offsetProbas) override;
}; // End class RooBatchComputeClass
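
// A minimal usage sketch (hypothetical caller code, not part of this file):
// once the library is loaded, the constructor above has pointed dispatchCUDA
// at the static object below, so RooFit code calls through the interface:
//
//    RooBatchCompute::dispatchCUDA->compute(cfg, computer, output, nEvents, vars, extraArgs);
//    double sum = RooBatchCompute::dispatchCUDA->reduceSum(cfg, output, nEvents);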

inline __device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   // carry is zero the first time around. The summation then proceeds with the
   // carry variable being NEGATIVE.
   const double y = a - (carry + otherCarry);
   const double t = sum + y; // Alas, sum is big, y small, so low-order digits of y are lost.

   // (t - sum) cancels the high-order part of y; subtracting y recovers the NEGATIVE low part of y.
   carry = (t - sum) - y;

   // Algebraically, carry should always be zero. Beware overly-aggressive optimizing compilers!
   // Next time around, the lost low part will be added to y in a fresh attempt.
   sum = t;
}
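
// Numeric illustration of the compensation (exact in IEEE double arithmetic):
// with sum = 1e16 and a = 0.5, y = 0.5 and t = sum + y rounds back to 1e16,
// losing the 0.5. Then carry = (t - sum) - y = 0.0 - 0.5 = -0.5 records the
// loss, and the next call computes y = a - (carry + otherCarry), re-adding
// the lost 0.5 so that it is not dropped from the final result.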

// This is the same implementation as ROOT::Math::KahanSum::operator+=(KahanSum), but on the GPU.
inline __device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // Stride in the first iteration = half of the block dim. Then half of that half...
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      // Wait until all threads of the block have finished their work.
      __syncthreads();
   }

   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
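
// With blockDim.x = 512 the loop above runs strides 256, 128, ..., 1: in each
// pass, thread k < stride folds shared[k + stride] (and its carry) into
// shared[k], halving the active range until shared[0] holds the block total
// and shared[carry_index] its compensation, both written out by thread 0.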

__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory stores the sums; the second half stores the carries (compensations).
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // Note: it does not make sense to use the nll option and provide
      // external carries at the same time.
      double val = nll ? -std::log(input[i]) : input[i];
      kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements.
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}

__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t n, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of the shared memory stores the sums; the second half stores the carries (compensations).
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      double val = -std::log(probas[i]);
      if (offsetProbas)
         val += std::log(offsetProbas[i]);
      if (weights)
         val = weights[i] * val;
      kahanSumUpdate(sum, carry, val, 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until all threads in each block have loaded their elements.
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}

double RooBatchComputeClass::reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n)
{
   const int gridSize = getGridSize(n);
   cudaStream_t stream = *cfg.cudaStream();
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   constexpr int shMemSize = 2 * blockSize * sizeof(double);
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);
   double tmp = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
   return tmp;
}
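
// reduceSum is a standard two-pass reduction: the first kahanSum launch
// writes one partial (sum, carry) pair per block into devOut, and the second
// single-block launch folds those partials, together with their carries at
// devOut + gridSize, into devOut[0]. The shared-memory size matches the
// kernels above: 2 * blockSize doubles = 2 * 512 * 8 bytes = 8 KiB per block.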

ReduceNLLOutput RooBatchComputeClass::reduceNLL(RooBatchCompute::Config const &cfg, std::span<const double> probas,
                                                std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;
   const int gridSize = getGridSize(probas.size());
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   cudaStream_t stream = *cfg.cudaStream();
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.size() == 1 ? nullptr : weights.data(),
      offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(), devOut.data());

   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmpSum = 0.0;
   double tmpCarry = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmpSum, 1, cfg.cudaStream());
   CudaInterface::copyDeviceToHost(devOut.data() + 1, &tmpCarry, 1, cfg.cudaStream());

   if (weights.size() == 1) {
      tmpSum *= weights[0];
      tmpCarry *= weights[0];
   }

   out.nllSum = tmpSum;
   out.nllSumCarry = tmpCarry;
   return out;
}
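
// Note on the constant-weight branch above: with a single weight w the target
// is w * sum_i(-log(p_i)), so the kernel can skip the weights entirely and the
// reduced sum and carry are scaled by w afterwards, which is mathematically
// equivalent up to rounding and cheaper than weighting every term.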

/// Static object to trigger the constructor which overwrites the dispatch pointer.
static RooBatchComputeClass computeObj;

} // End namespace RF_ARCH
} // End namespace RooBatchCompute