13#ifndef RooFit_Detail_CudaKernels_cuh
14#define RooFit_Detail_CudaKernels_cuh
23namespace CudaKernels {
50template <
int BlockSize_n,
int ElemDim_n,
class Elem_t,
class Sum_t>
53 int thIdx = threadIdx.x;
54 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
55 int arraySize = nElems * ElemDim_n;
56 const int nThreadsTotal = BlockSize_n * gridDim.x;
61 for (
int i = 0; i < ElemDim_n; i += 1) {
65 for (
int i = gthIdx; i < arraySize; i += nThreadsTotal) {
66 sum[i % ElemDim_n] += arr[i];
68 __shared__ Sum_t shArr[ElemDim_n * BlockSize_n];
69 for (
int i = 0; i < ElemDim_n; i += 1) {
70 shArr[i * BlockSize_n + thIdx] =
sum[i];
76 for (
int i = 0; i < ElemDim_n; i += 1) {
77 shArr[i * BlockSize_n + thIdx] += shArr[i * BlockSize_n + thIdx +
size];
83 for (
int i = 0; i < ElemDim_n; i += 1) {
84 output[ElemDim_n * blockIdx.x + i] = shArr[i * BlockSize_n];
111template <
int BlockSize_n,
int Bins1_n,
int Bins2_n,
class Idx_t,
class Elem_t,
class Sum_t,
class Counts_t>
113SumBinwise2D(
int arraySize, Idx_t
const *__restrict__
x1, Idx_t
const *__restrict__
x2,
const Elem_t *__restrict__ arr,
114 Sum_t *__restrict__ outputSum, Counts_t *__restrict__ outputCounts)
116 int thIdx = threadIdx.x;
117 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
118 const int nThreadsTotal = BlockSize_n * gridDim.x;
119 constexpr int nBins = Bins1_n * Bins2_n;
124 for (
int i = 0; i < nBins; i += 1) {
129 for (
int i = gthIdx; i < arraySize; i += nThreadsTotal) {
130 int idx = Bins2_n *
x1[i] +
x2[i];
134 __shared__
float shArrSum[nBins * BlockSize_n];
135 __shared__
int shArrCnt[nBins * BlockSize_n];
137 for (
int i = 0; i < nBins; i += 1) {
138 shArrSum[i * BlockSize_n + thIdx] =
sum[i];
139 shArrCnt[i * BlockSize_n + thIdx] = cnt[i];
145 for (
int i = 0; i < nBins; i += 1) {
146 shArrSum[i * BlockSize_n + thIdx] += shArrSum[i * BlockSize_n + thIdx +
size];
147 shArrCnt[i * BlockSize_n + thIdx] += shArrCnt[i * BlockSize_n + thIdx +
size];
153 for (
int i = 0; i < nBins; i += 1) {
154 outputSum[nBins * blockIdx.x + i] = shArrSum[i * BlockSize_n];
155 outputCounts[nBins * blockIdx.x + i] = shArrCnt[i * BlockSize_n];
177template <
int BlockSize_n>
178__global__
void Covariance2D(
Size_t arraySize,
float const *__restrict__
x,
const float *__restrict__
y,
double xMean,
179 double yMean,
double *__restrict__
output)
181 int thIdx = threadIdx.x;
182 int gthIdx = thIdx + blockIdx.x * BlockSize_n;
183 const int nThreadsTotal = BlockSize_n * gridDim.x;
187 for (
int i = gthIdx; i < arraySize; i += nThreadsTotal) {
188 const double argx =
double(
x[i]) - xMean;
189 const double argy =
double(
y[i]) - yMean;
190 sumX2 += argx * argx;
191 sumCov += argx * argy;
192 sumY2 += argy * argy;
194 __shared__
double shArrCov[BlockSize_n];
195 __shared__
double shArrX2[BlockSize_n];
196 __shared__
double shArrY2[BlockSize_n];
197 shArrX2[thIdx] = sumX2;
198 shArrCov[thIdx] = sumCov;
199 shArrY2[thIdx] = sumY2;
203 shArrX2[thIdx] += shArrX2[thIdx +
size];
204 shArrCov[thIdx] += shArrCov[thIdx +
size];
205 shArrY2[thIdx] += shArrY2[thIdx +
size];
210 output[3 * blockIdx.x + 0] = shArrX2[0] / arraySize;
211 output[3 * blockIdx.x + 1] = shArrCov[0] / arraySize;
212 output[3 * blockIdx.x + 2] = shArrY2[0] / arraySize;
228template <
class Num_t,
class Den_t>
229__global__
void DivideBy(
Size_t arraySize, Den_t *__restrict__ num, Num_t
const *__restrict__ den)
231 int thIdx = threadIdx.x;
232 int gthIdx = thIdx + blockIdx.x * blockDim.x;
233 const int nThreadsTotal = blockDim.x * gridDim.x;
235 for (
int i = gthIdx; i < arraySize; i += nThreadsTotal) {
236 num[i] = num[i] / den[i];
257template <
class Idx_t,
class Elem_t>
258__global__
void Lookup2D(
Size_t arraySize, Idx_t n2, Idx_t
const *__restrict__
x1, Idx_t
const *__restrict__
x2,
259 Elem_t
const *__restrict__ lut, Elem_t *__restrict__
output)
261 int thIdx = threadIdx.x;
262 int gthIdx = thIdx + blockIdx.x * blockDim.x;
263 const int nThreadsTotal = blockDim.x * gridDim.x;
265 for (
int i = gthIdx; i < arraySize; i += nThreadsTotal) {
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char x2
Option_t Option_t TPoint TPoint const char x1
__global__ void Covariance2D(Size_t arraySize, float const *__restrict__ x, const float *__restrict__ y, double xMean, double yMean, double *__restrict__ output)
Computes the covariance, variance of 'x', and variance of 'y' for a 2D data set.
__global__ void SumVectors(Size_t nElems, const Elem_t *__restrict__ arr, Sum_t *__restrict__ output)
Performs a multi-block sum reduction on the input array arr.
__global__ void SumBinwise2D(int arraySize, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2, const Elem_t *__restrict__ arr, Sum_t *__restrict__ outputSum, Counts_t *__restrict__ outputCounts)
Computes bin-wise sum and count of elements from the 'arr' array into separate output arrays based on...
__global__ void DivideBy(Size_t arraySize, Den_t *__restrict__ num, Num_t const *__restrict__ den)
Divides elements of the num array in-place by corresponding elements of the den array.
__global__ void Lookup2D(Size_t arraySize, Idx_t n2, Idx_t const *__restrict__ x1, Idx_t const *__restrict__ x2, Elem_t const *__restrict__ lut, Elem_t *__restrict__ output)
Fills an output array 'output' with values from a row-wise flattened two-dimensional lookup table lut...
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
static uint64_t sum(uint64_t i)