17 struct ArgusBGComputer {
18 template<
class Tm,
class Tm0,
class Tc,
class Tp>
19 void run(
size_t batchSize,
double * __restrict
output, Tm M, Tm0 M0, Tc C, Tp P )
const
21 for (
size_t i=0; i<batchSize; i++) {
22 const double t = M[i]/M0[i];
23 const double u = 1 - t*t;
26 for (
size_t i=0; i<batchSize; i++) {
27 if (M[i] >= M0[i])
output[i] = 0.0;
37 constexpr size_t block = 128;
38 const int nCoef = coef.size();
39 const int degree = nCoef-1;
40 double X[block], _1_X[block], powX[block], pow_1_X[block];
41 double *Binomial =
new double[nCoef+5];
45 for (
int i=1; i<=degree; i++) {
46 Binomial[i] = Binomial[i-1]*(degree-i+1)/i;
49 for (
size_t i=0; i<batchSize; i+=block) {
50 const size_t stop = (i+block > batchSize) ? batchSize-i : block;
53 for (
size_t j=0; j<stop; j++) {
54 powX[j] = pow_1_X[j] = 1.0;
61 for (
int k=2; k<=degree; k+=2)
62 for (
size_t j=0; j<stop; j++)
63 pow_1_X[j] *= _1_X[j]*_1_X[j];
66 for (
size_t j=0; j<stop; j++)
67 pow_1_X[j] *= _1_X[j];
70 for (
size_t j=0; j<stop; j++)
73 for (
int k=0; k<nCoef; k++) {
74 for (
size_t j=0; j<stop; j++) {
75 output[i+j] += coef[k]*Binomial[k]*powX[j]*pow_1_X[j];
79 pow_1_X[j] *= _1_X[j];
88 struct BifurGaussComputer {
89 template<
class Tx,
class Tm,
class Tsl,
class Tsr>
90 void run(
size_t batchSize,
double * __restrict
output, Tx X, Tm M, Tsl SL, Tsr SR)
const
92 for (
size_t i=0; i<batchSize; i++) {
93 const double arg = X[i]-M[i];
94 output[i] = arg / ((arg < 0.0)*SL[i] + (arg >= 0.0)*SR[i]);
97 for (
size_t i=0; i<batchSize; i++) {
98 if (X[i]-M[i]>1
e-30 || X[i]-M[i]<-1
e-30) {
110 struct BukinComputer {
111 template<
class Tx,
class TXp,
class TSigp,
class Txi,
class Trho1,
class Trho2>
112 void run(
size_t batchSize,
double * __restrict
output, Tx X, TXp XP, TSigp SP, Txi XI, Trho1
R1, Trho2
R2)
const
114 const double r3 =
log(2.0);
115 const double r6 =
exp(-6.0);
116 const double r7 = 2*
sqrt(2*
log(2.0));
118 for (
size_t i=0; i<batchSize; i++) {
119 const double r1 = XI[i]*
fast_isqrt(XI[i]*XI[i]+1);
120 const double r4 = 1/
fast_isqrt(XI[i]*XI[i]+1);
121 const double hp = 1 / (SP[i]*r7);
122 const double x1 = XP[i] + 0.5*SP[i]*r7*(r1-1);
123 const double x2 = XP[i] + 0.5*SP[i]*r7*(r1+1);
126 if (XI[i]>r6 || XI[i]<-r6) r5 = XI[i]/
fast_log(r4+XI[i]);
128 double factor=1,
y=X[i]-
x1, Yp=XP[i]-
x1, yi=r4-XI[i], rho=
R1[i];
137 output[i] = rho*
y*
y/Yp/Yp -r3 + factor*4*r3*
y*hp*r5*r4/yi/yi;
138 if (X[i]>=
x1 && X[i]<
x2) {
142 if (X[i]>=
x1 && X[i]<
x2 && XI[i]<r6 && XI[i]>-r6) {
143 output[i] = -4*r3*(X[i]-XP[i])*(X[i]-XP[i])*hp*hp;
146 for (
size_t i=0; i<batchSize; i++) {
154 struct BreitWignerComputer {
155 template<
class Tx,
class Tmean,
class Tw
idth>
156 void run(
size_t batchSize,
double * __restrict
output, Tx X, Tmean M, Twidth W)
const
158 for (
size_t i=0; i<batchSize; i++) {
159 const double arg = X[i]-M[i];
160 output[i] = 1 / (arg*arg + 0.25*W[i]*W[i]);
167 struct CBShapeComputer {
168 template<
class Tm,
class Tm0,
class Tsigma,
class Talpha,
class Tn>
169 void run(
size_t batchSize,
double * __restrict
output, Tm M, Tm0 M0, Tsigma S, Talpha A, Tn
N)
const
171 for (
size_t i=0; i<batchSize; i++) {
172 const double t = (M[i]-M0[i]) / S[i];
173 if ( (A[i]>0 && t>=-A[i]) || (A[i]<0 && -t>=A[i]) ) {
176 output[i] =
N[i] / (
N[i] -A[i]*A[i] -A[i]*t);
179 output[i] -= 0.5*A[i]*A[i];
183 for (
size_t i=0; i<batchSize; i++) {
193 constexpr size_t block = 128;
194 const size_t nCoef = coef.size();
195 double prev[block][2], X[block];
197 for (
size_t i=0; i<batchSize; i+=block) {
198 size_t stop = (i+block >= batchSize) ? batchSize-i : block;
202 for (
size_t j=0; j<stop; j++) {
203 prev[j][0] =
output[i+j] = 1.0;
207 for (
size_t k=0; k<nCoef; k++) {
208 for (
size_t j=0; j<stop; j++) {
209 output[i+j] += prev[j][1]*coef[k];
212 const double next = 2*X[j]*prev[j][1] -prev[j][0];
213 prev[j][0] = prev[j][1];
222 struct ChiSquareComputer {
223 template<
class T_x,
class T_ndof>
224 void run(
size_t batchSize,
double * __restrict
output, T_x X, T_ndof
N)
const
227 for (
size_t i=0; i<batchSize; i++) {
229 output[i] = 1/std::tgamma(
N[i]/2.0);
235 const double gamma = 1/std::tgamma(
N[2019]/2.0);
236 for (
size_t i=0; i<batchSize; i++) {
241 constexpr double ln2 = 0.693147180559945309417232121458;
242 const double lnx0 = std::log(X[0]);
243 for (
size_t i=0; i<batchSize; i++) {
245 if ( X.isBatch() ) lnx =
fast_log(X[i]);
248 double arg = (
N[i]-2)*lnx -X[i] -
N[i]*ln2;
256 struct DstD0BGComputer {
257 template<
class Tdm,
class Tdm0,
class TC,
class TA,
class TB>
258 void run(
size_t batchSize,
double * __restrict
output, Tdm DM, Tdm0 DM0, TC C, TA A, TB B)
const
260 for (
size_t i=0; i<batchSize; i++) {
261 const double ratio = DM[i] / DM0[i];
262 const double arg1 = (DM0[i]-DM[i]) / C[i];
263 const double arg2 = A[i]*
fast_log(ratio);
267 for (
size_t i=0; i<batchSize; i++) {
275 struct ExponentialComputer {
276 template<
class Tx,
class Tc>
277 void run(
size_t n,
double* __restrict
output, Tx
x, Tc
c)
const
279 for (
size_t i = 0; i <
n; ++i) {
287 struct GammaComputer {
288 template<
class Tx,
class Tgamma,
class Tbeta,
class Tmu>
289 void run (
size_t batchSize,
double * __restrict
output, Tx X, Tgamma
G, Tbeta B, Tmu M)
const
291 constexpr double NaN = std::numeric_limits<double>::quiet_NaN();
292 for (
size_t i=0; i<batchSize; i++) {
293 if (X[i]<M[i] ||
G[i] <= 0.0 || B[i] <= 0.0) {
297 output[i] = ((
G[i]==1.0) ? 1. : 0.)/B[i];
305 for (
size_t i=0; i<batchSize; i++) {
307 output[i] = -std::lgamma(
G[i]);
312 double gamma = -std::lgamma(
G[2019]);
313 for (
size_t i=0; i<batchSize; i++) {
320 for (
size_t i=0; i<batchSize; i++) {
322 const double invBeta = 1/B[i];
323 double arg = (X[i]-M[i])*invBeta;
339 struct GaussianComputer {
340 template<
class Tx,
class TMean,
class TSig>
341 void run(
size_t n,
double* __restrict
output, Tx
x, TMean mean, TSig
sigma)
const
343 for (std::size_t i=0; i<
n; ++i) {
344 const double arg =
x[i]-mean[i];
345 const double halfBySigmaSq = -0.5 / (
sigma[i]*
sigma[i]);
353 struct JohnsonComputer {
358 template<
class TMass,
class TMu,
class TLambda,
class TGamma,
class TDelta>
359 void run(
size_t n,
double* __restrict
output, TMass mass, TMu mu, TLambda lambda, TGamma gamma, TDelta delta)
const
363 for (
size_t i=0; i<
n; ++i) {
364 const double arg = (mass[i]-mu[i]) / lambda[i];
368 const double asinh_arg = asinh(arg);
370 const double expo =
gamma[i] + delta[i]*asinh_arg;
371 const double result = delta[i]*
fast_exp(-0.5*expo*expo)*
fast_isqrt(1. +arg*arg) / (sqrt_twoPi*lambda[i]);
373 const double passThrough = mass[i] >= massThreshold;
374 output[i] = result*passThrough;
377 const double massThreshold;
382 struct LandauComputer {
388 template<
class Tx,
class Tmean,
class Tsigma>
389 void run(
size_t batchSize,
double* __restrict
output, Tx X, Tmean M, Tsigma S)
const
391 const double p1[5] = {0.4259894875,-0.1249762550, 0.03984243700, -0.006298287635, 0.001511162253};
392 const double q1[5] = {1.0 ,-0.3388260629, 0.09594393323, -0.01608042283, 0.003778942063};
394 const double p2[5] = {0.1788541609, 0.1173957403, 0.01488850518, -0.001394989411, 0.0001283617211};
395 const double q2[5] = {1.0 , 0.7428795082, 0.3153932961, 0.06694219548, 0.008790609714};
397 const double p3[5] = {0.1788544503, 0.09359161662,0.006325387654, 0.00006611667319,-0.000002031049101};
398 const double q3[5] = {1.0 , 0.6097809921, 0.2560616665, 0.04746722384, 0.006957301675};
400 const double p4[5] = {0.9874054407, 118.6723273, 849.2794360, -743.7792444, 427.0262186};
401 const double q4[5] = {1.0 , 106.8615961, 337.6496214, 2016.712389, 1597.063511};
403 const double p5[5] = {1.003675074, 167.5702434, 4789.711289, 21217.86767, -22324.94910};
404 const double q5[5] = {1.0 , 156.9424537, 3745.310488, 9834.698876, 66924.28357};
406 const double p6[5] = {1.000827619, 664.9143136, 62972.92665, 475554.6998, -5743609.109};
407 const double q6[5] = {1.0 , 651.4101098, 56974.73333, 165917.4725, -2815759.939};
409 const double a1[3] = {0.04166666667,-0.01996527778, 0.02709538966};
410 const double a2[2] = {-1.845568670,-4.284640743};
412 const double NaN = std::nan(
"");
413 const size_t block=256;
416 for (
size_t i=0; i<batchSize; i+=block) {
417 const size_t stop = (i+block < batchSize) ? block : batchSize-i ;
419 for (
size_t j=0; j<stop; j++) {
420 v[j] = (X[i+j]-M[i+j]) / S[i+j];
421 output[i+j] = (p2[0]+(p2[1]+(p2[2]+(p2[3]+p2[4]*
v[j])*
v[j])*
v[j])*
v[j]) /
422 (q2[0]+(q2[1]+(q2[2]+(q2[3]+q2[4]*
v[j])*
v[j])*
v[j])*
v[j]);
425 for (
size_t j=0; j<stop; j++) {
426 const bool mask =
S[i+j] > 0;
430 if (!mask)
v[j] =
NaN;
435 for (
size_t j=0; j<stop; j++) {
439 output[i+j] = (p3[0]+(p3[1]+(p3[2]+(p3[3]+p3[4]*
v[j])*
v[j])*
v[j])*
v[j]) /
440 (q3[0]+(q3[1]+(q3[2]+(q3[3]+q3[4]*
v[j])*
v[j])*
v[j])*
v[j]);
441 }
else if (
v[j] < 12) {
443 output[i+j] = u*u*(p4[0]+(p4[1]+(p4[2]+(p4[3]+p4[4]*u)*u)*u)*u) /
444 (q4[0]+(q4[1]+(q4[2]+(q4[3]+q4[4]*u)*u)*u)*u);
445 }
else if (
v[j] < 50) {
447 output[i+j] = u*u*(p5[0]+(p5[1]+(p5[2]+(p5[3]+p5[4]*u)*u)*u)*u) /
448 (q5[0]+(q5[1]+(q5[2]+(q5[3]+q5[4]*u)*u)*u)*u);
449 }
else if (
v[j] < 300) {
451 output[i+j] = u*u*(p6[0]+(p6[1]+(p6[2]+(p6[3]+p6[4]*u)*u)*u)*u) /
452 (q6[0]+(q6[1]+(q6[2]+(q6[3]+q6[4]*u)*u)*u)*u);
454 u = 1 / (
v[j] -
v[j]*std::log(
v[j])/(
v[j]+1) );
455 output[i+j] = u*u*(1 +(a2[0] +a2[1]*u)*u );
457 }
else if (
v[j] < -1) {
459 u = std::exp(-
v[j]-1);
460 output[i+j] = std::exp(-u)*std::sqrt(u)*
461 (p1[0]+(p1[1]+(p1[2]+(p1[3]+p1[4]*
v[j])*
v[j])*
v[j])*
v[j])/
462 (q1[0]+(q1[1]+(q1[2]+(q1[3]+q1[4]*
v[j])*
v[j])*
v[j])*
v[j]);
464 u = std::exp(
v[j]+1.0);
465 if (u < 1
e-10)
output[i+j] = 0.0;
469 output[i+j] = 0.3989422803*(ue/us)*(1+(a1[0]+(a1[1]+a1[2]*u)*u)*u);
480 struct LognormalComputer {
481 template<
class Tx,
class Tm0,
class Tk>
482 void run(
size_t batchSize,
double* __restrict
output, Tx X, Tm0 M0, Tk K)
const
484 const double rootOf2pi = 2.506628274631000502415765284811;
485 for (
size_t i=0; i<batchSize; i++) {
486 double lnxOverM0 =
fast_log(X[i]/M0[i]);
488 if (lnk<0) lnk = -lnk;
489 double arg = lnxOverM0/lnk;
498 struct NovosibirskComputer {
507 template<
class Tx,
class Tw
idth,
class Tpeak,
class Ttail>
508 void run(
size_t batchSize,
double * __restrict
output, Tx X, Tpeak P, Twidth W, Ttail T)
const
510 constexpr double xi = 2.3548200450309494;
511 for (
size_t i=0; i<batchSize; i++) {
512 double argasinh = 0.5*xi*
T[i];
513 double argln = argasinh + 1/
fast_isqrt(argasinh*argasinh +1);
516 double argln2 = 1 -(X[i]-P[i])*T[i]/W[i];
520 output[i] -= 2.0/xi/xi*asinh*asinh;
524 for (
size_t i=0; i<batchSize; i++) {
532 struct PoissonComputer {
533 template<
class Tx,
class TMean>
534 void run(
const size_t n,
double* __restrict
output, Tx
x, TMean mean)
const
536 for (
size_t i = 0; i <
n; ++i) {
537 const double x_i = noRounding ?
x[i] :
floor(
x[i]);
544 for (
size_t i = 0; i <
n; ++i) {
545 const double x_i = noRounding ?
x[i] :
floor(
x[i]);
546 const double logMean =
fast_log(mean[i]);
547 const double logPoisson = x_i * logMean - mean[i] -
output[i];
553 else if (x_i == 0.) {
556 if (protectNegative && mean[i] < 0.)
560 const bool protectNegative;
561 const bool noRounding;
566 void startComputationPolynomial(
size_t batchSize,
double* __restrict
output,
const double* __restrict
const X,
int lowestOrder, std::vector<BracketAdapterWithMask>& coefList )
568 const int nCoef = coefList.size();
569 if (nCoef==0 && lowestOrder==0) {
570 for (
size_t i=0; i<batchSize; i++) {
574 else if (nCoef==0 && lowestOrder>0) {
575 for (
size_t i=0; i<batchSize; i++) {
579 for (
size_t i=0; i<batchSize; i++) {
580 output[i] = coefList[nCoef-1][i];
583 if (nCoef == 0)
return;
590 for (
int k=nCoef-3; k>=0; k-=2) {
591 for (
size_t i=0; i<batchSize; i++) {
592 double coef1 = coefList[k+1][i];
593 double coef2 = coefList[ k ][i];
599 for (
size_t i=0; i<batchSize; i++) {
604 if (lowestOrder == 0)
return;
605 for (
int k=2; k<=lowestOrder; k+=2) {
606 for (
size_t i=0; i<batchSize; i++) {
610 const bool isOdd = lowestOrder%2;
611 for (
size_t i=0; i<batchSize; i++) {
612 if (isOdd)
output[i] *= X[i];
619 struct VoigtianComputer {
620 template<
class Tx,
class Tmean,
class Tw
idth,
class Tsigma>
621 void run(
size_t batchSize,
double * __restrict
output, Tx X, Tmean M, Twidth W, Tsigma S)
const
623 const double invSqrt2 = 0.707106781186547524400844362105;
624 for (
size_t i=0; i<batchSize; i++) {
625 const double arg = (X[i]-M[i])*(X[i]-M[i]);
626 if (S[i]==0.0 && W[i]==0.0) {
628 }
else if (S[i]==0.0) {
629 output[i] = 1/(arg+0.25*W[i]*W[i]);
630 }
else if (W[i]==0.0) {
637 for (
size_t i=0; i<batchSize; i++) {
638 if (S[i]!=0.0 && W[i]!=0.0) {
640 const double factor = W[i]>0.0 ? 0.5 : -0.5;
641 std::complex<Double_t> z(
output[i]*(X[i]-M[i]) , factor*
output[i]*W[i] );
660 class RooBatchComputeClass :
public RooBatchComputeInterface {
663 struct AnalysisInfo {
664 size_t batchSize=SIZE_MAX;
665 bool canDoHighPerf=
true;
673 if (parameters[0].size()<=1) ret.canDoHighPerf=
false;
674 else ret.batchSize = std::min(ret.batchSize, parameters[0].size());
675 for (
size_t i=1; i<parameters.size(); i++)
676 if (parameters[i].size()>1)
678 ret.canDoHighPerf=
false;
679 ret.batchSize = std::min(ret.batchSize, parameters[i].size());
686 template <
class Computer_t,
typename Arg_t,
typename... Args_t>
689 AnalysisInfo info = analyseInputSpans({
first, rest...});
692 if (info.canDoHighPerf) computer.run(info.batchSize,
output.data(),
first, BracketAdapter<double>(rest[0])...);
693 else computer.run(info.batchSize,
output.data(), BracketAdapterWithMask(
first), BracketAdapterWithMask(rest)...);
699 RooBatchComputeClass() {
704 return startComputation(caller, evalData, ArgusBGComputer{},
m, m0,
c, p);
706 void computeBernstein(
size_t batchSize,
double * __restrict
output,
const double * __restrict
const xData,
double xmin,
double xmax, std::vector<double> coef)
override {
710 return startComputation(caller, evalData, BifurGaussComputer{},
x, mean, sigmaL, sigmaR);
713 return startComputation(caller, evalData, BukinComputer{},
x, Xp, sigp, xi, rho1, rho2);
716 return startComputation(caller, evalData, BreitWignerComputer{},
x, mean,
width);
719 return startComputation(caller, evalData, CBShapeComputer{},
m, m0,
sigma, alpha,
n);
721 void computeChebychev(
size_t batchSize,
double * __restrict
output,
const double * __restrict
const xData,
double xmin,
double xmax, std::vector<double> coef)
override {
725 return startComputation(caller, evalData, ChiSquareComputer{},
x, ndof);
728 return startComputation(caller, evalData, DstD0BGComputer{}, dm, dm0,
C, A, B);
731 return startComputation(caller, evalData, ExponentialComputer{},
x,
c);
734 return startComputation(caller, evalData, GammaComputer{},
x,
gamma,
beta, mu);
737 return startComputation(caller, evalData, GaussianComputer{},
x, mean,
sigma);
740 return startComputation(caller, evalData, JohnsonComputer{massThreshold}, mass, mu, lambda,
gamma, delta);
743 return startComputation(caller, evalData, LandauComputer{},
x, mean,
sigma);
746 return startComputation(caller, evalData, LognormalComputer{},
x, m0, k);
749 return startComputation(caller, evalData, NovosibirskComputer{},
x, peak,
width, tail);
752 return startComputation(caller, evalData, PoissonComputer{protectNegative, noRounding},
x, mean);
754 void computePolynomial(
size_t batchSize,
double* __restrict
output,
const double* __restrict
const X,
int lowestOrder, std::vector<BracketAdapterWithMask>& coefList)
override {
758 return startComputation(caller, evalData, VoigtianComputer{},
x, mean,
width,
sigma);
static const double x2[5]
static const double x1[5]
include TDocParser_001 C image html pict1_TDocParser_001 png width
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
static std::complex< double > faddeeva(std::complex< double > z)
evaluate Faddeeva function for complex argument
A simple container to hold a batch of data values.
double beta(double x, double y)
Calculates the beta function.
void startComputationPolynomial(size_t batchSize, double *__restrict output, const double *__restrict const X, int lowestOrder, std::vector< BracketAdapterWithMask > &coefList)
void startComputationChebychev(size_t batchSize, double *__restrict output, const double *__restrict const xData, double xmin, double xmax, std::vector< double > coef)
void startComputationBernstein(size_t batchSize, double *__restrict output, const double *__restrict const xData, double xmin, double xmax, std::vector< double > coef)
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
double fast_log(double x)
double fast_exp(double x)
double fast_isqrt(double x)
R__EXTERN RooBatchComputeInterface * dispatch
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
RooArgSet S(const RooAbsArg &v1)
constexpr Double_t C()
Velocity of light in .
Double_t LnGamma(Double_t z)
Computation of ln[gamma(z)] for all z.
constexpr Double_t TwoPi()
#define R1(v, w, x, y, z, i)
#define R2(v, w, x, y, z, i)
static void output(int code)