Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
CudaKernels.cuh
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Author:
4 * Jonas Rembser, CERN 2023
5 *
6 * Copyright (c) 2023, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13#ifndef RooFit_Detail_CudaKernels_cuh
14#define RooFit_Detail_CudaKernels_cuh
15
16namespace RooFit {
17namespace Detail {
18
19/*
20 * Namespace for templated CUDA kernels.
21 * \ingroup RooFitCuda
22 */
23namespace CudaKernels {
24
25/// The type for array size parameters.
26using Size_t = int;
27
28/// Dedicated namespace for reduction kernels.
29namespace Reducers {
30
31/**
32 * Performs a multi-block sum reduction on the input array `arr`. The input
33 * array can either be scalar, or a flattened vector array with inner dimension
34 * `ElemDim_n`.
35 *
36 * The output array is of shape `[ gridDim.x, ElemDim_n ]` (row-major flattened, meaning
37 * when iterating over the bins, the elements are contiguous).
38 *
39 * @tparam ElemDim_n Inner dimension of the flattened input array. Set to
40 * `1` when summing scalar values.
41 * @tparam BlockSize_n Needs to be identical to the number of thread blocks.
42 * Has to be known statically for the size of the shared memory.
43 * @tparam Elem_t Data type of the input array elements.
44 * @tparam Sum_t Data type of the output and shared memory array elements.
45 * @param[in] nElems Number of elements in the input array.
46 * @param[in] arr Input array containing data to be summed.
47 * @param[out] output Output array containing partial sums for each block.
48 * (Each block's sum is stored in 'output[ElemDim_n * blockIdx.x]').
49 */
50template <int BlockSize_n, int ElemDim_n, class Elem_t, class Sum_t>
51__global__ void SumVectors(Size_t nElems, const Elem_t *__restrict__ arr, Sum_t *__restrict__ output)
52{
53 int thIdx = threadIdx.x;
54 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
55 int arraySize = nElems * ElemDim_n;
56 const int nThreadsTotal = BlockSize_n * gridDim.x;
57 // Each thread calculates its partial sum and writes the result to memory
58 // that is shared by all threads in the block.
59 Sum_t sum[ElemDim_n];
60 // Don't forget the zero-initialization!
61 for (int i = 0; i < ElemDim_n; i += 1) {
62 sum[i] = 0;
63 }
64
65 for (int i = gthIdx; i < arraySize; i += nThreadsTotal) {
66 sum[i % ElemDim_n] += arr[i];
67 }
68 __shared__ Sum_t shArr[ElemDim_n * BlockSize_n];
69 for (int i = 0; i < ElemDim_n; i += 1) {
70 shArr[i * BlockSize_n + thIdx] = sum[i];
71 }
72 __syncthreads();
73 // Sum within the thread blocks
74 for (int size = BlockSize_n / 2; size > 0; size /= 2) { // uniform
75 if (thIdx < size) {
76 for (int i = 0; i < ElemDim_n; i += 1) {
77 shArr[i * BlockSize_n + thIdx] += shArr[i * BlockSize_n + thIdx + size];
78 }
79 }
80 __syncthreads();
81 }
82 if (thIdx == 0)
83 for (int i = 0; i < ElemDim_n; i += 1) {
84 output[ElemDim_n * blockIdx.x + i] = shArr[i * BlockSize_n];
85 }
86}
87
88/**
89 * Computes bin-wise sum and count of elements from the 'arr' array into separate output arrays
90 * based on indices provided in 'x1' and 'x2' arrays, using a 2D grid-stride loop approach.
91 *
92 * The output arrays are of shape `[ gridDim.x, Bins1_n, Bins2_n ]` (row-major
93 * flattened, meaning when iterating over the bins, the elements are
94 * contiguous).
95 *
96 * @tparam Bins1_n Number of bins in the first dimension.
97 * @tparam Bins2_n Number of bins in the second dimension.
98 * @tparam BlockSize_n Needs to be identical to the number of thread blocks.
99 * Has to be known statically for the size of the shared memory.
100 * @tparam Idx_t Data type of index arrays `x1` and `x2`.
101 * @tparam Elem_t Data type of the input array `arr` elements.
102 * @tparam Sum_t Data type for bin-wise sum output.
103 * @tparam Counts_t Data type for bin-wise count output.
104 * @param[in] arraySize Size of the input arrays `x1`, `x2`, and `arr`.
105 * @param[in] x1 Input array containing bin indices for the first dimension.
106 * @param[in] x2 Input array containing bin indices for the second dimension.
107 * @param[in] arr Input array containing elements to be summed.
108 * @param[out] outputSum Output array for storing bin-wise sum of elements.
109 * @param[out] outputCounts Output array for storing bin-wise count of elements.
110 */
111template <int BlockSize_n, int Bins1_n, int Bins2_n, class Idx_t, class Elem_t, class Sum_t, class Counts_t>
112__global__ void
113SumBinwise2D(int arraySize, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2, const Elem_t *__restrict__ arr,
114 Sum_t *__restrict__ outputSum, Counts_t *__restrict__ outputCounts)
115{
116 int thIdx = threadIdx.x;
117 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
118 const int nThreadsTotal = BlockSize_n * gridDim.x;
119 constexpr int nBins = Bins1_n * Bins2_n;
120 float sum[nBins];
121 int cnt[nBins];
122
123 // We need to do zero-initialization.
124 for (int i = 0; i < nBins; i += 1) {
125 sum[i] = 0;
126 cnt[i] = 0;
127 }
128
129 for (int i = gthIdx; i < arraySize; i += nThreadsTotal) {
130 int idx = Bins2_n * x1[i] + x2[i];
131 sum[idx] += arr[i];
132 cnt[idx] += 1;
133 }
134 __shared__ float shArrSum[nBins * BlockSize_n];
135 __shared__ int shArrCnt[nBins * BlockSize_n];
136
137 for (int i = 0; i < nBins; i += 1) {
138 shArrSum[i * BlockSize_n + thIdx] = sum[i];
139 shArrCnt[i * BlockSize_n + thIdx] = cnt[i];
140 }
141
142 __syncthreads();
143 for (int size = BlockSize_n / 2; size > 0; size /= 2) { // uniform
144 if (thIdx < size) {
145 for (int i = 0; i < nBins; i += 1) {
146 shArrSum[i * BlockSize_n + thIdx] += shArrSum[i * BlockSize_n + thIdx + size];
147 shArrCnt[i * BlockSize_n + thIdx] += shArrCnt[i * BlockSize_n + thIdx + size];
148 }
149 }
150 __syncthreads();
151 }
152 if (thIdx == 0)
153 for (int i = 0; i < nBins; i += 1) {
154 outputSum[nBins * blockIdx.x + i] = shArrSum[i * BlockSize_n];
155 outputCounts[nBins * blockIdx.x + i] = shArrCnt[i * BlockSize_n];
156 }
157}
158
159/**
160 * Computes the covariance, variance of 'x', and variance of 'y' for a 2D data set.
161
162 * The output array is of shape `[ gridDim.x, 3 ]` (row-major flattened,
163 * meaning when iterating over the elements of the covariance matrix, the
164 * elements are contiguous). Only three output elements are needed to store the
165 * symmetric 2-by-2 covariance matrix.
166 *
167 * @tparam BlockSize_n Needs to be identical to the number of thread blocks.
168 * Has to be known statically for the size of the shared memory.
169 * @param[in] arraySize Size of the input arrays 'x' and 'y'.
170 * @param[in] x Input array containing 'x' data.
171 * @param[in] y Input array containing 'y' data.
172 * @param[in] xMean Mean of the 'x' data.
173 * @param[in] yMean Mean of the 'y' data.
174 * @param[out] output Output array to store computed covariance and variances.
175 * (Three values are stored per block: variance of 'x', covariance, variance of 'y'.)
176 */
177template <int BlockSize_n>
178__global__ void Covariance2D(Size_t arraySize, float const *__restrict__ x, const float *__restrict__ y, double xMean,
179 double yMean, double *__restrict__ output)
180{
181 int thIdx = threadIdx.x;
182 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
183 const int nThreadsTotal = BlockSize_n * gridDim.x;
184 double sumX2 = 0;
185 double sumCov = 0;
186 double sumY2 = 0;
187 for (int i = gthIdx; i < arraySize; i += nThreadsTotal) {
188 const double argx = double(x[i]) - xMean;
189 const double argy = double(y[i]) - yMean;
190 sumX2 += argx * argx;
191 sumCov += argx * argy;
192 sumY2 += argy * argy;
193 }
194 __shared__ double shArrCov[BlockSize_n];
195 __shared__ double shArrX2[BlockSize_n];
196 __shared__ double shArrY2[BlockSize_n];
197 shArrX2[thIdx] = sumX2;
198 shArrCov[thIdx] = sumCov;
199 shArrY2[thIdx] = sumY2;
200 __syncthreads();
201 for (int size = BlockSize_n / 2; size > 0; size /= 2) { // uniform
202 if (thIdx < size) {
203 shArrX2[thIdx] += shArrX2[thIdx + size];
204 shArrCov[thIdx] += shArrCov[thIdx + size];
205 shArrY2[thIdx] += shArrY2[thIdx + size];
206 }
207 __syncthreads();
208 }
209 if (thIdx == 0) {
210 output[3 * blockIdx.x + 0] = shArrX2[0] / arraySize;
211 output[3 * blockIdx.x + 1] = shArrCov[0] / arraySize;
212 output[3 * blockIdx.x + 2] = shArrY2[0] / arraySize;
213 }
214}
215
216} // namespace Reducers
217
218/**
219 * Divides elements of the `num` array in-place by corresponding elements of
220 * the `den` array.
221 *
222 * @tparam Num_t Data type of the 'num' array elements.
223 * @tparam Den_t Data type of the 'den' array elements.
224 * @param[in] arraySize Size of the input arrays.
225 * @param[in,out] num Numerator array containing values to be divided.
226 * @param[in] den Denominator array containing divisor values.
227 */
228template <class Num_t, class Den_t>
229__global__ void DivideBy(Size_t arraySize, Den_t *__restrict__ num, Num_t const *__restrict__ den)
230{
231 int thIdx = threadIdx.x;
232 int gthIdx = thIdx + blockIdx.x * blockDim.x;
233 const int nThreadsTotal = blockDim.x * gridDim.x;
234
235 for (int i = gthIdx; i < arraySize; i += nThreadsTotal) {
236 num[i] = num[i] / den[i];
237 }
238}
239
240/**
241 * Fills an output array 'output' with values from a row-wise flattened
242 * two-dimensional lookup table `lut` based on input arrays `x1` and `x2`:
243 *
244 * `output[ i ] = lut[ n2 * x1[i] + x2[i] ]`.
245 *
246 * Each thread processes a portion of the input arrays using grid-stride looping.
247 *
248 * @tparam Idx_t Data type of the indices.
249 * @tparam Elem_t Data type of the lookup table and output array elements.
250 * @param[in] arraySize Size of the input and output arrays.
251 * @param[in] n2 Size of the second dimension of the lookup table.
252 * @param[in] x1 Input array containing indices for the first dimension of the lookup table.
253 * @param[in] x2 Input array containing indices for the second dimension of the lookup table.
254 * @param[in] lut Lookup table containing values.
255 * @param[out] output Output array to be filled with values from the lookup table.
256 */
257template <class Idx_t, class Elem_t>
258__global__ void Lookup2D(Size_t arraySize, Idx_t n2, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2,
259 Elem_t const *__restrict__ lut, Elem_t *__restrict__ output)
260{
261 int thIdx = threadIdx.x;
262 int gthIdx = thIdx + blockIdx.x * blockDim.x;
263 const int nThreadsTotal = blockDim.x * gridDim.x;
264
265 for (int i = gthIdx; i < arraySize; i += nThreadsTotal) {
266 output[i] = lut[n2 * x1[i] + x2[i]];
267 }
268}
269
270} // namespace CudaKernels
271
272} // namespace Detail
273} // namespace RooFit
274
275#endif
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char x2
Option_t Option_t TPoint TPoint const char x1
Double_t y[n]
Definition legend1.C:17
Double_t x[n]
Definition legend1.C:17
__global__ void Covariance2D(Size_t arraySize, float const *__restrict__ x, const float *__restrict__ y, double xMean, double yMean, double *__restrict__ output)
Computes the covariance, variance of 'x', and variance of 'y' for a 2D data set.
__global__ void SumVectors(Size_t nElems, const Elem_t *__restrict__ arr, Sum_t *__restrict__ output)
Performs a multi-block sum reduction on the input array arr.
__global__ void SumBinwise2D(int arraySize, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2, const Elem_t *__restrict__ arr, Sum_t *__restrict__ outputSum, Counts_t *__restrict__ outputCounts)
Computes bin-wise sum and count of elements from the 'arr' array into separate output arrays based on...
__global__ void DivideBy(Size_t arraySize, Den_t *__restrict__ num, Num_t const *__restrict__ den)
Divides elements of the num array in-place by corresponding elements of the den array.
__global__ void Lookup2D(Size_t arraySize, Idx_t n2, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2, Elem_t const *__restrict__ lut, Elem_t *__restrict__ output)
Fills an output array 'output' with values from a row-wise flattened two-dimensional lookup table lut...
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition JSONIO.h:26
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345
static void output()