RooBatchCompute.cxx
1// RooBatchCompute library created September 2020 by Emmanouil Michalainas
2#include "RooBatchCompute.h"
3#include "RooMath.h"
4
5#include <complex>
6
7namespace RooBatchCompute {
8 /**
9 * \brief Contains the part of the RooBatchCompute library that needs to be compiled separately for every CPU architecture.
10 *
11 * RF_ARCH is a macro defined by CMake that takes a different value for each copy of the library, namely
12 * GENERIC, SSE4, AVX, AVX2, AVX512 or CUDA. This ensures that name clashes are avoided.
13 * \see RooBatchComputeInterface, RooBatchComputeClass, RooBatchCompute::dispatch
14 */
15 namespace RF_ARCH {
16
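 ///Computes the (generalised) ARGUS background shape: for m < m0 the result is
 ///m * (1 - (m/m0)^2)^p * exp( c * (1 - (m/m0)^2) ), and 0 otherwise.
 ///The exponent is accumulated in the first loop and exponentiated in the second.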
17 struct ArgusBGComputer {
18 template<class Tm, class Tm0, class Tc, class Tp>
19 void run(size_t batchSize, double * __restrict output, Tm M, Tm0 M0, Tc C, Tp P ) const
20 {
21 for (size_t i=0; i<batchSize; i++) {
22 const double t = M[i]/M0[i];
23 const double u = 1 - t*t;
24 output[i] = C[i]*u + P[i]*fast_log(u);
25 }
26 for (size_t i=0; i<batchSize; i++) {
27 if (M[i] >= M0[i]) output[i] = 0.0;
28 else output[i] = M[i]*fast_exp(output[i]);
29 }
30 }
31 };
32
33//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
34
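 ///Evaluates a Bernstein polynomial of degree n = coef.size()-1 with x rescaled to [0,1]:
 ///B(x) = sum_k coef[k] * C(n,k) * x^k * (1-x)^(n-k).
 ///powX accumulates x^k, while pow_1_X starts at (1-x)^n and loses one factor of (1-x) per iteration.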
35 void startComputationBernstein(size_t batchSize, double * __restrict output, const double * __restrict const xData, double xmin, double xmax, std::vector<double> coef)
36 {
37 constexpr size_t block = 128;
38 const int nCoef = coef.size();
39 const int degree = nCoef-1;
40 double X[block], _1_X[block], powX[block], pow_1_X[block];
41 double *Binomial = new double[nCoef+5];
42 //Binomial stores values c(degree,i) for i in [0..degree]
43
44 Binomial[0] = 1.0;
45 for (int i=1; i<=degree; i++) {
46 Binomial[i] = Binomial[i-1]*(degree-i+1)/i;
47 }
48
49 for (size_t i=0; i<batchSize; i+=block) {
50 const size_t stop = (i+block > batchSize) ? batchSize-i : block;
51
52 //initialization
53 for (size_t j=0; j<stop; j++) {
54 powX[j] = pow_1_X[j] = 1.0;
55 X[j] = (xData[i+j]-xmin) / (xmax-xmin);
56 _1_X[j] = 1-X[j];
57 output[i+j] = 0.0;
58 }
59
60 //raising 1-x to the power of degree
61 for (int k=2; k<=degree; k+=2)
62 for (size_t j=0; j<stop; j++)
63 pow_1_X[j] *= _1_X[j]*_1_X[j];
64
65 if (degree%2 == 1)
66 for (size_t j=0; j<stop; j++)
67 pow_1_X[j] *= _1_X[j];
68
69 //inverting 1-x ---> 1/(1-x)
70 for (size_t j=0; j<stop; j++)
71 _1_X[j] = 1/_1_X[j];
72
73 for (int k=0; k<nCoef; k++) {
74 for (size_t j=0; j<stop; j++) {
75 output[i+j] += coef[k]*Binomial[k]*powX[j]*pow_1_X[j];
76
77 //calculating next power for x and 1-x
78 powX[j] *= X[j];
79 pow_1_X[j] *= _1_X[j];
80 }
81 }
82 }
83 delete[] Binomial;
84 }
85
86//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
87
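 ///Computes a bifurcated Gaussian, exp( -0.5*((x-mean)/sigma)^2 ), where sigma is sigmaL
 ///for x < mean and sigmaR for x >= mean; values with |x-mean| <= 1e-30 are set to 1.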
88 struct BifurGaussComputer {
89 template<class Tx, class Tm, class Tsl, class Tsr>
90 void run(size_t batchSize, double * __restrict output, Tx X, Tm M, Tsl SL, Tsr SR) const
91 {
92 for (size_t i=0; i<batchSize; i++) {
93 const double arg = X[i]-M[i];
94 output[i] = arg / ((arg < 0.0)*SL[i] + (arg >= 0.0)*SR[i]);
95 }
96
97 for (size_t i=0; i<batchSize; i++) {
98 if (X[i]-M[i]>1e-30 || X[i]-M[i]<-1e-30) {
99 output[i] = fast_exp(-0.5*output[i]*output[i]);
100 }
101 else {
102 output[i] = 1.0;
103 }
104 }
105 }
106 };
107
108//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
109
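 ///Computes the Bukin shape with peak position Xp, width sigp, asymmetry xi and tail
 ///parameters rho1/rho2: the logarithm of the shape is assembled piecewise for the left
 ///tail, the core region [x1, x2) and the right tail, and exponentiated in a second loop.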
110 struct BukinComputer {
111 template<class Tx, class TXp, class TSigp, class Txi, class Trho1, class Trho2>
112 void run(size_t batchSize, double * __restrict output, Tx X, TXp XP, TSigp SP, Txi XI, Trho1 R1, Trho2 R2) const
113 {
114 const double r3 = log(2.0);
115 const double r6 = exp(-6.0);
116 const double r7 = 2*sqrt(2*log(2.0));
117
118 for (size_t i=0; i<batchSize; i++) {
119 const double r1 = XI[i]*fast_isqrt(XI[i]*XI[i]+1);
120 const double r4 = 1/fast_isqrt(XI[i]*XI[i]+1);
121 const double hp = 1 / (SP[i]*r7);
122 const double x1 = XP[i] + 0.5*SP[i]*r7*(r1-1);
123 const double x2 = XP[i] + 0.5*SP[i]*r7*(r1+1);
124
125 double r5 = 1.0;
126 if (XI[i]>r6 || XI[i]<-r6) r5 = XI[i]/fast_log(r4+XI[i]);
127
128 double factor=1, y=X[i]-x1, Yp=XP[i]-x1, yi=r4-XI[i], rho=R1[i];
129 if (X[i]>=x2) {
130 factor = -1;
131 y = X[i]-x2;
132 Yp = XP[i]-x2;
133 yi = r4+XI[i];
134 rho = R2[i];
135 }
136
137 output[i] = rho*y*y/Yp/Yp -r3 + factor*4*r3*y*hp*r5*r4/yi/yi;
138 if (X[i]>=x1 && X[i]<x2) {
139 output[i] = fast_log(1 + 4*XI[i]*r4*(X[i]-XP[i])*hp) / fast_log(1 +2*XI[i]*( XI[i]-r4 ));
140 output[i] *= -output[i]*r3;
141 }
142 if (X[i]>=x1 && X[i]<x2 && XI[i]<r6 && XI[i]>-r6) {
143 output[i] = -4*r3*(X[i]-XP[i])*(X[i]-XP[i])*hp*hp;
144 }
145 }
146 for (size_t i=0; i<batchSize; i++) {
147 output[i] = fast_exp(output[i]);
148 }
149 }
150 };
151
152/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
153
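 ///Computes a non-relativistic Breit-Wigner (Cauchy) shape, 1 / ( (x-mean)^2 + width^2/4 ),
 ///up to normalisation.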
154 struct BreitWignerComputer {
155 template<class Tx, class Tmean, class Twidth>
156 void run(size_t batchSize, double * __restrict output, Tx X, Tmean M, Twidth W) const
157 {
158 for (size_t i=0; i<batchSize; i++) {
159 const double arg = X[i]-M[i];
160 output[i] = 1 / (arg*arg + 0.25*W[i]*W[i]);
161 }
162 }
163 };
164
165//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
166
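 ///Computes the Crystal Ball shape: a Gaussian core exp(-0.5*t^2) with t = (m-m0)/sigma and
 ///a power-law tail on the side selected by the sign of alpha. Both branches are evaluated
 ///in log space and exponentiated in the second loop.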
167 struct CBShapeComputer {
168 template<class Tm, class Tm0, class Tsigma, class Talpha, class Tn>
169 void run( size_t batchSize, double * __restrict output, Tm M, Tm0 M0, Tsigma S, Talpha A, Tn N) const
170 {
171 for (size_t i=0; i<batchSize; i++) {
172 const double t = (M[i]-M0[i]) / S[i];
173 if ( (A[i]>0 && t>=-A[i]) || (A[i]<0 && -t>=A[i]) ) {
174 output[i] = -0.5*t*t;
175 } else {
176 output[i] = N[i] / (N[i] -A[i]*A[i] -A[i]*t);
177 output[i] = fast_log(output[i]);
178 output[i] *= N[i];
179 output[i] -= 0.5*A[i]*A[i];
180 }
181 }
182
183 for (size_t i=0; i<batchSize; i++) {
184 output[i] = fast_exp(output[i]);
185 }
186 }
187 };
188
189//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
190
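 ///Evaluates 1 + sum_k coef[k] * T_{k+1}(x'), where x' is x mapped onto [-1,1] and the
 ///Chebyshev polynomials of the first kind are generated with the recurrence
 ///T_{n+1}(x') = 2*x'*T_n(x') - T_{n-1}(x').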
191 void startComputationChebychev(size_t batchSize, double * __restrict output, const double * __restrict const xData, double xmin, double xmax, std::vector<double> coef)
192 {
193 constexpr size_t block = 128;
194 const size_t nCoef = coef.size();
195 double prev[block][2], X[block];
196
197 for (size_t i=0; i<batchSize; i+=block) {
198 size_t stop = (i+block >= batchSize) ? batchSize-i : block;
199
200 // set a0-->prev[j][0] and a1-->prev[j][1]
201 // and x transformed to the range [-1..1]-->X[j]
202 for (size_t j=0; j<stop; j++) {
203 prev[j][0] = output[i+j] = 1.0;
204 prev[j][1] = X[j] = (xData[i+j] -0.5*(xmax + xmin)) / (0.5*(xmax - xmin));
205 }
206
207 for (size_t k=0; k<nCoef; k++) {
208 for (size_t j=0; j<stop; j++) {
209 output[i+j] += prev[j][1]*coef[k];
210
211 //compute next order
212 const double next = 2*X[j]*prev[j][1] -prev[j][0];
213 prev[j][0] = prev[j][1];
214 prev[j][1] = next;
215 }
216 }
217 }
218 }
219
220//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
221
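 ///Computes the chi-square pdf, x^(n/2 - 1) * exp(-x/2) / ( 2^(n/2) * Gamma(n/2) ),
 ///hoisting the Gamma evaluation out of the loop when the number of degrees of freedom is a scalar.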
222 struct ChiSquareComputer {
223 template<class T_x, class T_ndof>
224 void run(size_t batchSize, double * __restrict output, T_x X, T_ndof N) const
225 {
226 if ( N.isBatch() ) {
227 for (size_t i=0; i<batchSize; i++) {
228 if (X[i] > 0) {
229 output[i] = 1/std::tgamma(N[i]/2.0);
230 }
231 }
232 }
233 else {
234 // N is just a scalar so bracket adapter ignores index.
235 const double gamma = 1/std::tgamma(N[2019]/2.0);
236 for (size_t i=0; i<batchSize; i++) {
237 output[i] = gamma;
238 }
239 }
240
241 constexpr double ln2 = 0.693147180559945309417232121458;
242 const double lnx0 = std::log(X[0]);
243 for (size_t i=0; i<batchSize; i++) {
244 double lnx;
245 if ( X.isBatch() ) lnx = fast_log(X[i]);
246 else lnx = lnx0;
247
248 double arg = (N[i]-2)*lnx -X[i] -N[i]*ln2;
249 output[i] *= fast_exp(0.5*arg);
250 }
251 }
252 };
253
254//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
255
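 ///Computes the D*-D0 mass-difference background shape,
 ///( 1 - exp(-(dm-dm0)/c) ) * (dm/dm0)^a + b*(dm/dm0 - 1), clamped to zero in the second loop.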
256 struct DstD0BGComputer {
257 template<class Tdm, class Tdm0, class TC, class TA, class TB>
258 void run(size_t batchSize, double * __restrict output, Tdm DM, Tdm0 DM0, TC C, TA A, TB B) const
259 {
260 for (size_t i=0; i<batchSize; i++) {
261 const double ratio = DM[i] / DM0[i];
262 const double arg1 = (DM0[i]-DM[i]) / C[i];
263 const double arg2 = A[i]*fast_log(ratio);
264 output[i] = (1 -fast_exp(arg1)) * fast_exp(arg2) +B[i]*(ratio-1);
265 }
266
267 for (size_t i=0; i<batchSize; i++) {
268 if (output[i]<0) output[i] = 0;
269 }
270 }
271 };
272
273//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
274
275 struct ExponentialComputer {
276 template<class Tx, class Tc>
277 void run(size_t n, double* __restrict output, Tx x, Tc c) const
278 {
279 for (size_t i = 0; i < n; ++i) {
280 output[i] = fast_exp(x[i]*c[i]);
281 }
282 }
283 };
284
285//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
286
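 ///Computes the Gamma pdf, (x-mu)^(gamma-1) * exp(-(x-mu)/beta) / ( beta^gamma * Gamma(gamma) ).
 ///The edge cases x < mu, x == mu and non-positive gamma or beta are handled first; the
 ///logarithm of the pdf is accumulated and exponentiated at the end.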
287 struct GammaComputer {
288 template<class Tx, class Tgamma, class Tbeta, class Tmu>
289 void run (size_t batchSize, double * __restrict output, Tx X, Tgamma G, Tbeta B, Tmu M) const
290 {
291 constexpr double NaN = std::numeric_limits<double>::quiet_NaN();
292 for (size_t i=0; i<batchSize; i++) {
293 if (X[i]<M[i] || G[i] <= 0.0 || B[i] <= 0.0) {
294 output[i] = NaN;
295 }
296 if (X[i] == M[i]) {
297 output[i] = ((G[i]==1.0) ? 1. : 0.)/B[i];
298 }
299 else {
300 output[i] = 0.0;
301 }
302 }
303
304 if (G.isBatch()) {
305 for (size_t i=0; i<batchSize; i++) {
306 if (output[i] == 0.0) {
307 output[i] = -std::lgamma(G[i]);
308 }
309 }
310 }
311 else {
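 // G is a scalar here, so the bracket adapter ignores the index (same trick as in ChiSquareComputer).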
312 double gamma = -std::lgamma(G[2019]);
313 for (size_t i=0; i<batchSize; i++) {
314 if (output[i] == 0.0) {
315 output[i] = gamma;
316 }
317 }
318 }
319
320 for (size_t i=0; i<batchSize; i++) {
321 if (X[i] != M[i]) {
322 const double invBeta = 1/B[i];
323 double arg = (X[i]-M[i])*invBeta;
324 output[i] -= arg;
325 arg = fast_log(arg);
326 output[i] += arg*(G[i]-1);
327 output[i] = fast_exp(output[i]);
328 output[i] *= invBeta;
329 }
330 }
331 }
332 };
333
334//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
335 ///Actual computations for the batch evaluation of the Gaussian.
336 ///May vectorise over x, mean, sigma, depending on the types of the inputs.
337 ///\note The output and input spans are assumed to be non-overlapping. If they
338 ///overlap, results will likely be garbage.
339 struct GaussianComputer {
340 template<class Tx, class TMean, class TSig>
341 void run(size_t n, double* __restrict output, Tx x, TMean mean, TSig sigma) const
342 {
343 for (std::size_t i=0; i<n; ++i) {
344 const double arg = x[i]-mean[i];
345 const double halfBySigmaSq = -0.5 / (sigma[i]*sigma[i]);
346 output[i] = fast_exp(arg*arg*halfBySigmaSq);
347 }
348 }
349 };
350
351//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
352
353 struct JohnsonComputer {
354 ///Actual computations for the batch evaluation of the Johnson.
355 ///May vectorise over observables depending on types of inputs.
356 ///\note The output and input spans are assumed to be non-overlapping. If they
357 ///overlap, results will likely be garbage.
358 template<class TMass, class TMu, class TLambda, class TGamma, class TDelta>
359 void run(size_t n, double* __restrict output, TMass mass, TMu mu, TLambda lambda, TGamma gamma, TDelta delta) const
360 {
361 const double sqrt_twoPi = sqrt(TMath::TwoPi());
362
363 for (size_t i=0; i<n; ++i) {
364 const double arg = (mass[i]-mu[i]) / lambda[i];
365 #ifdef R__HAS_VDT
366 const double asinh_arg = fast_log(arg + 1/fast_isqrt(arg*arg+1));
367 #else
368 const double asinh_arg = asinh(arg);
369 #endif
370 const double expo = gamma[i] + delta[i]*asinh_arg;
371 const double result = delta[i]*fast_exp(-0.5*expo*expo)*fast_isqrt(1. +arg*arg) / (sqrt_twoPi*lambda[i]);
372
373 const double passThrough = mass[i] >= massThreshold;
374 output[i] = result*passThrough;
375 }
376 }
377 const double massThreshold;
378 };
379
380//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
381
382 struct LandauComputer {
383 /* Actual computation of Landau(x,mean,sigma) in a vectorization-friendly way
384 * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx)
385 * and rewritten to take advantage of the most common case,
386 * which is -1 < (x-mean)/sigma < 1. The remaining cases are handled in a scalar way
387 */
388 template<class Tx, class Tmean, class Tsigma>
389 void run(size_t batchSize, double* __restrict output, Tx X, Tmean M, Tsigma S) const
390 {
391 const double p1[5] = {0.4259894875,-0.1249762550, 0.03984243700, -0.006298287635, 0.001511162253};
392 const double q1[5] = {1.0 ,-0.3388260629, 0.09594393323, -0.01608042283, 0.003778942063};
393
394 const double p2[5] = {0.1788541609, 0.1173957403, 0.01488850518, -0.001394989411, 0.0001283617211};
395 const double q2[5] = {1.0 , 0.7428795082, 0.3153932961, 0.06694219548, 0.008790609714};
396
397 const double p3[5] = {0.1788544503, 0.09359161662,0.006325387654, 0.00006611667319,-0.000002031049101};
398 const double q3[5] = {1.0 , 0.6097809921, 0.2560616665, 0.04746722384, 0.006957301675};
399
400 const double p4[5] = {0.9874054407, 118.6723273, 849.2794360, -743.7792444, 427.0262186};
401 const double q4[5] = {1.0 , 106.8615961, 337.6496214, 2016.712389, 1597.063511};
402
403 const double p5[5] = {1.003675074, 167.5702434, 4789.711289, 21217.86767, -22324.94910};
404 const double q5[5] = {1.0 , 156.9424537, 3745.310488, 9834.698876, 66924.28357};
405
406 const double p6[5] = {1.000827619, 664.9143136, 62972.92665, 475554.6998, -5743609.109};
407 const double q6[5] = {1.0 , 651.4101098, 56974.73333, 165917.4725, -2815759.939};
408
409 const double a1[3] = {0.04166666667,-0.01996527778, 0.02709538966};
410 const double a2[2] = {-1.845568670,-4.284640743};
411
412 const double NaN = std::nan("");
413 const size_t block=256;
414 double v[block];
415
416 for (size_t i=0; i<batchSize; i+=block) { //CHECK_VECTORISE
417 const size_t stop = (i+block < batchSize) ? block : batchSize-i ;
418
419 for (size_t j=0; j<stop; j++) { //CHECK_VECTORISE
420 v[j] = (X[i+j]-M[i+j]) / S[i+j];
421 output[i+j] = (p2[0]+(p2[1]+(p2[2]+(p2[3]+p2[4]*v[j])*v[j])*v[j])*v[j]) /
422 (q2[0]+(q2[1]+(q2[2]+(q2[3]+q2[4]*v[j])*v[j])*v[j])*v[j]);
423 }
424
425 for (size_t j=0; j<stop; j++) { //CHECK_VECTORISE
426 const bool mask = S[i+j] > 0;
427 /* Comparisons with NaN yield false, so the next
428 * loop won't affect the output for cases where sigma <= 0
429 */
430 if (!mask) v[j] = NaN;
431 output[i+j] *= mask;
432 }
433
434 double u, ue, us;
435 for (size_t j=0; j<stop; j++) { //CHECK_VECTORISE
436 // The branches are arranged so that the most common case, -1 <= v[j] < 1, needs no further work
437 if (v[j] >= 1) {
438 if (v[j] < 5) {
439 output[i+j] = (p3[0]+(p3[1]+(p3[2]+(p3[3]+p3[4]*v[j])*v[j])*v[j])*v[j]) /
440 (q3[0]+(q3[1]+(q3[2]+(q3[3]+q3[4]*v[j])*v[j])*v[j])*v[j]);
441 } else if (v[j] < 12) {
442 u = 1/v[j];
443 output[i+j] = u*u*(p4[0]+(p4[1]+(p4[2]+(p4[3]+p4[4]*u)*u)*u)*u) /
444 (q4[0]+(q4[1]+(q4[2]+(q4[3]+q4[4]*u)*u)*u)*u);
445 } else if (v[j] < 50) {
446 u = 1/v[j];
447 output[i+j] = u*u*(p5[0]+(p5[1]+(p5[2]+(p5[3]+p5[4]*u)*u)*u)*u) /
448 (q5[0]+(q5[1]+(q5[2]+(q5[3]+q5[4]*u)*u)*u)*u);
449 } else if (v[j] < 300) {
450 u = 1/v[j];
451 output[i+j] = u*u*(p6[0]+(p6[1]+(p6[2]+(p6[3]+p6[4]*u)*u)*u)*u) /
452 (q6[0]+(q6[1]+(q6[2]+(q6[3]+q6[4]*u)*u)*u)*u);
453 } else {
454 u = 1 / (v[j] -v[j]*std::log(v[j])/(v[j]+1) );
455 output[i+j] = u*u*(1 +(a2[0] +a2[1]*u)*u );
456 }
457 } else if (v[j] < -1) {
458 if (v[j] >= -5.5) {
459 u = std::exp(-v[j]-1);
460 output[i+j] = std::exp(-u)*std::sqrt(u)*
461 (p1[0]+(p1[1]+(p1[2]+(p1[3]+p1[4]*v[j])*v[j])*v[j])*v[j])/
462 (q1[0]+(q1[1]+(q1[2]+(q1[3]+q1[4]*v[j])*v[j])*v[j])*v[j]);
463 } else {
464 u = std::exp(v[j]+1.0);
465 if (u < 1e-10) output[i+j] = 0.0;
466 else {
467 ue = std::exp(-1/u);
468 us = std::sqrt(u);
469 output[i+j] = 0.3989422803*(ue/us)*(1+(a1[0]+(a1[1]+a1[2]*u)*u)*u);
470 }
471 }
472 }
473 }
474 }
475 }
476 };
477
478//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
479
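 ///Computes the lognormal pdf, exp( -0.5*(ln(x/m0)/ln(k))^2 ) / ( x * |ln(k)| * sqrt(2*pi) ).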
480 struct LognormalComputer {
481 template<class Tx, class Tm0, class Tk>
482 void run(size_t batchSize, double* __restrict output, Tx X, Tm0 M0, Tk K) const
483 {
484 const double rootOf2pi = 2.506628274631000502415765284811;
485 for (size_t i=0; i<batchSize; i++) {
486 double lnxOverM0 = fast_log(X[i]/M0[i]);
487 double lnk = fast_log(K[i]);
488 if (lnk<0) lnk = -lnk;
489 double arg = lnxOverM0/lnk;
490 arg *= -0.5*arg;
491 output[i] = fast_exp(arg) / (X[i]*lnk*rootOf2pi);
492 }
493 }
494 };
495
496//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
497
498 struct NovosibirskComputer {
499 /* TMath::ASinH(x) needs to be replaced with ln( x + sqrt(x^2+1))
500 * argasinh -> the argument of TMath::ASinH()
501 * argln -> the argument of the logarithm that replaces AsinH
502 * asinh -> the value that the function evaluates to
503 *
504 * ln is the logarithm that was already present in the initial
505 * formula, i.e. before the asinh replacement
506 */
507 template<class Tx, class Twidth, class Tpeak, class Ttail>
508 void run(size_t batchSize, double * __restrict output, Tx X, Tpeak P, Twidth W, Ttail T) const
509 {
510 constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) )
511 for (size_t i=0; i<batchSize; i++) {
512 double argasinh = 0.5*xi*T[i];
513 double argln = argasinh + 1/fast_isqrt(argasinh*argasinh +1);
514 double asinh = fast_log(argln);
515
516 double argln2 = 1 -(X[i]-P[i])*T[i]/W[i];
517 double ln = fast_log(argln2);
518 output[i] = ln/asinh;
519 output[i] *= -0.125*xi*xi*output[i];
520 output[i] -= 2.0/xi/xi*asinh*asinh;
521 }
522
523 //faster if you exponentiate in a separate loop (dark magic!)
524 for (size_t i=0; i<batchSize; i++) {
525 output[i] = fast_exp(output[i]);
526 }
527 }
528 };
529
530//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
531
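 ///Computes Poisson probabilities in log space, exp( x*ln(mean) - mean - lnGamma(x+1) ),
 ///optionally without rounding x down to an integer; x < 0 yields 0, and negative means
 ///can be protected against by returning a small constant instead.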
532 struct PoissonComputer {
533 template<class Tx, class TMean>
534 void run(const size_t n, double* __restrict output, Tx x, TMean mean) const
535 {
536 for (size_t i = 0; i < n; ++i) { //CHECK_VECTORISE
537 const double x_i = noRounding ? x[i] : floor(x[i]);
538 // The std::lgamma yields different values than in the scalar implementation.
539 // Need to check which one is more accurate.
540 // output[i] = std::lgamma(x_i + 1.);
541 output[i] = TMath::LnGamma(x_i + 1.);
542 }
543
544 for (size_t i = 0; i < n; ++i) {
545 const double x_i = noRounding ? x[i] : floor(x[i]);
546 const double logMean = fast_log(mean[i]);
547 const double logPoisson = x_i * logMean - mean[i] - output[i];
548 output[i] = fast_exp(logPoisson);
549
550 // Cosmetics
551 if (x_i < 0.)
552 output[i] = 0.;
553 else if (x_i == 0.) {
554 output[i] = 1./fast_exp(mean[i]);
555 }
556 if (protectNegative && mean[i] < 0.)
557 output[i] = 1.E-3;
558 }
559 }
560 const bool protectNegative;
561 const bool noRounding;
562 };
563
564//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
565
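 ///Evaluates the coefficient sum sum_i coef[i] * x^(lowestOrder+i) with a Horner scheme that
 ///consumes two coefficients per pass (to help vectorisation), then shifts the result up by
 ///lowestOrder powers of x and adds 1 whenever lowestOrder > 0. An empty coefficient list is
 ///handled separately at the top.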
566 void startComputationPolynomial(size_t batchSize, double* __restrict output, const double* __restrict const X, int lowestOrder, std::vector<BracketAdapterWithMask>& coefList )
567 {
568 const int nCoef = coefList.size();
569 if (nCoef==0 && lowestOrder==0) {
570 for (size_t i=0; i<batchSize; i++) {
571 output[i] = 0.0;
572 }
573 }
574 else if (nCoef==0 && lowestOrder>0) {
575 for (size_t i=0; i<batchSize; i++) {
576 output[i] = 1.0;
577 }
578 } else {
579 for (size_t i=0; i<batchSize; i++) {
580 output[i] = coefList[nCoef-1][i];
581 }
582 }
583 if (nCoef == 0) return;
584
585 /* Indices are in the range 0..nCoef-1, but coefList[nCoef-1]
586 * has already been processed. To traverse the list
587 * with a step of 2, we have to start at index nCoef-3 and use
588 * coefList[k+1] and coefList[k]
589 */
590 for (int k=nCoef-3; k>=0; k-=2) {
591 for (size_t i=0; i<batchSize; i++) {
592 double coef1 = coefList[k+1][i];
593 double coef2 = coefList[ k ][i];
594 output[i] = X[i]*(output[i]*X[i] + coef1) + coef2;
595 }
596 }
597 // If nCoef is even, then coefList[0] hasn't been processed yet
598 if (nCoef%2 == 0) {
599 for (size_t i=0; i<batchSize; i++) {
600 output[i] = output[i]*X[i] + coefList[0][i];
601 }
602 }
603 //Increase the order of the polynomial, first by multiplying with X[i]^2
604 if (lowestOrder == 0) return;
605 for (int k=2; k<=lowestOrder; k+=2) {
606 for (size_t i=0; i<batchSize; i++) {
607 output[i] *= X[i]*X[i];
608 }
609 }
610 const bool isOdd = lowestOrder%2;
611 for (size_t i=0; i<batchSize; i++) {
612 if (isOdd) output[i] *= X[i];
613 output[i] += 1.0;
614 }
615 }
616
617//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
618
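 ///Computes the Voigt profile, the convolution of a Breit-Wigner with a Gaussian, via the real
 ///part of the Faddeeva function; the degenerate cases sigma == 0 and width == 0 fall back to a
 ///pure Breit-Wigner and a pure Gaussian respectively.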
619 struct VoigtianComputer {
620 template<class Tx, class Tmean, class Twidth, class Tsigma>
621 void run(size_t batchSize, double * __restrict output, Tx X, Tmean M, Twidth W, Tsigma S) const
622 {
623 const double invSqrt2 = 0.707106781186547524400844362105;
624 for (size_t i=0; i<batchSize; i++) {
625 const double arg = (X[i]-M[i])*(X[i]-M[i]);
626 if (S[i]==0.0 && W[i]==0.0) {
627 output[i] = 1.0;
628 } else if (S[i]==0.0) {
629 output[i] = 1/(arg+0.25*W[i]*W[i]);
630 } else if (W[i]==0.0) {
631 output[i] = fast_exp(-0.5*arg/(S[i]*S[i]));
632 } else {
633 output[i] = invSqrt2/S[i];
634 }
635 }
636
637 for (size_t i=0; i<batchSize; i++) {
638 if (S[i]!=0.0 && W[i]!=0.0) {
639 if (output[i] < 0) output[i] = -output[i];
640 const double factor = W[i]>0.0 ? 0.5 : -0.5;
641 std::complex<Double_t> z( output[i]*(X[i]-M[i]) , factor*output[i]*W[i] );
642 output[i] *= RooMath::faddeeva(z).real();
643 }
644 }
645 }
646 };
647
648//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
649
650 /**
651 * \brief Implementation of the RooBatchComputeInterface.
652 *
653 * This class dispatches computation requests to an actual computation backend, such as SSE, AVX, AVX2, etc.
654 * Several implementations of this class may be provided, each targeted at different architectures.
655 *
656 * Note that when this class is instantiated, it registers itself in RooBatchCompute::dispatch. This means
657 * that all subsequent computation requests that are issued via RooBatchCompute::dispatch are handled by the
658 * last instance that was created.
659 */
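 // A rough usage sketch (not part of this file): once the library is loaded and the static
 // computeObj below has registered itself in RooBatchCompute::dispatch, a PDF implementation
 // can forward its batch evaluation to the active backend along the lines of
 //   RooSpan<double> out = RooBatchCompute::dispatch->computeGaussian(this, evalData, x, mean, sigma);
 // where the caller pointer, the RunContext and the input spans are supplied by the RooFit side.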
660 class RooBatchComputeClass : public RooBatchComputeInterface {
661 private:
662
663 struct AnalysisInfo {
664 size_t batchSize=SIZE_MAX;
665 bool canDoHighPerf=true;
666 };
667 /// Small helper function that determines the batch size and decides which compute-function overload to run:
668 /// * a high-performance, optimized instance, if the observable is a span and all parameters are const scalars,
669 /// * a less optimized one-fits-all instance that covers every other (rare) scenario.
670 AnalysisInfo analyseInputSpans(std::vector<RooSpan<const double>> parameters)
671 {
672 AnalysisInfo ret;
673 if (parameters[0].size()<=1) ret.canDoHighPerf=false;
674 else ret.batchSize = std::min(ret.batchSize, parameters[0].size());
675 for (size_t i=1; i<parameters.size(); i++)
676 if (parameters[i].size()>1)
677 {
678 ret.canDoHighPerf=false;
679 ret.batchSize = std::min(ret.batchSize, parameters[i].size());
680 }
681 return ret;
682 }
683
684 /// Templated function that works for every PDF: does the necessary preprocessing and launches
685 /// the correct overload of the actual computing function.
686 template <class Computer_t, typename Arg_t, typename... Args_t>
687 RooSpan<double> startComputation(const RooAbsReal* caller, RunContext& evalData, Computer_t computer, Arg_t first, Args_t... rest)
688 {
689 AnalysisInfo info = analyseInputSpans({first, rest...});
690 RooSpan<double> output = evalData.makeBatch(caller, info.batchSize);
691
692 if (info.canDoHighPerf) computer.run(info.batchSize, output.data(), first, BracketAdapter<double>(rest[0])...);
693 else computer.run(info.batchSize, output.data(), BracketAdapterWithMask(first), BracketAdapterWithMask(rest)...);
694
695 return output;
696 }
697
698 public:
699 RooBatchComputeClass() {
700 // Set the dispatch pointer to this instance of the library upon loading
701 RooBatchCompute::dispatch = this;
702 }
703 RooSpan<double> computeArgusBG(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> m, RooSpan<const double> m0, RooSpan<const double> c, RooSpan<const double> p) override {
704 return startComputation(caller, evalData, ArgusBGComputer{}, m, m0, c, p);
705 }
706 void computeBernstein(size_t batchSize, double * __restrict output, const double * __restrict const xData, double xmin, double xmax, std::vector<double> coef) override {
707 startComputationBernstein(batchSize, output, xData, xmin, xmax, coef);
708 }
709 RooSpan<double> computeBifurGauss(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, RooSpan<const double> sigmaL, RooSpan<const double> sigmaR) override {
710 return startComputation(caller, evalData, BifurGaussComputer{}, x, mean, sigmaL, sigmaR);
711 }
712 RooSpan<double> computeBukin(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> Xp, RooSpan<const double> sigp, RooSpan<const double> xi, RooSpan<const double> rho1, RooSpan<const double> rho2) override {
713 return startComputation(caller, evalData, BukinComputer{}, x, Xp, sigp, xi, rho1, rho2);
714 }
715 RooSpan<double> computeBreitWigner(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, RooSpan<const double> width) override {
716 return startComputation(caller, evalData, BreitWignerComputer{}, x, mean, width);
717 }
718 RooSpan<double> computeCBShape(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> m, RooSpan<const double> m0, RooSpan<const double> sigma, RooSpan<const double> alpha, RooSpan<const double> n) override {
719 return startComputation(caller, evalData, CBShapeComputer{}, m, m0, sigma, alpha, n);
720 }
721 void computeChebychev(size_t batchSize, double * __restrict output, const double * __restrict const xData, double xmin, double xmax, std::vector<double> coef) override {
722 startComputationChebychev(batchSize, output, xData, xmin, xmax, coef);
723 }
724 RooSpan<double> computeChiSquare(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> ndof) override {
725 return startComputation(caller, evalData, ChiSquareComputer{}, x, ndof);
726 }
727 RooSpan<double> computeDstD0BG(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> dm, RooSpan<const double> dm0, RooSpan<const double> C, RooSpan<const double> A, RooSpan<const double> B) override {
728 return startComputation(caller, evalData, DstD0BGComputer{}, dm, dm0, C, A, B);
729 }
730 RooSpan<double> computeExponential(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> c) override {
731 return startComputation(caller, evalData, ExponentialComputer{}, x, c);
732 }
733 RooSpan<double> computeGamma(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> gamma, RooSpan<const double> beta, RooSpan<const double> mu) override {
734 return startComputation(caller, evalData, GammaComputer{}, x, gamma, beta, mu);
735 }
736 RooSpan<double> computeGaussian(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, RooSpan<const double> sigma) override {
737 return startComputation(caller, evalData, GaussianComputer{}, x, mean, sigma);
738 }
739 RooSpan<double> computeJohnson(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> mass, RooSpan<const double> mu, RooSpan<const double> lambda, RooSpan<const double> gamma, RooSpan<const double> delta, double massThreshold) override {
740 return startComputation(caller, evalData, JohnsonComputer{massThreshold}, mass, mu, lambda, gamma, delta);
741 }
742 RooSpan<double> computeLandau(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, RooSpan<const double> sigma) override {
743 return startComputation(caller, evalData, LandauComputer{}, x, mean, sigma);
744 }
745 RooSpan<double> computeLognormal(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> m0, RooSpan<const double> k) override {
746 return startComputation(caller, evalData, LognormalComputer{}, x, m0, k);
747 }
748 RooSpan<double> computeNovosibirsk(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> peak, RooSpan<const double> width, RooSpan<const double> tail) override {
749 return startComputation(caller, evalData, NovosibirskComputer{}, x, peak, width, tail);
750 }
751 RooSpan<double> computePoisson(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, bool protectNegative, bool noRounding) override {
752 return startComputation(caller, evalData, PoissonComputer{protectNegative, noRounding}, x, mean);
753 }
754 void computePolynomial(size_t batchSize, double* __restrict output, const double* __restrict const X, int lowestOrder, std::vector<BracketAdapterWithMask>& coefList) override {
755 startComputationPolynomial(batchSize, output, X, lowestOrder, coefList);
756 }
757 RooSpan<double> computeVoigtian(const RooAbsReal* caller, RunContext& evalData, RooSpan<const double> x, RooSpan<const double> mean, RooSpan<const double> width, RooSpan<const double> sigma) override {
758 return startComputation(caller, evalData, VoigtianComputer{}, x, mean, width, sigma);
759 }
760 }; // End class RooBatchComputeClass
761
762 /// Static object to trigger the constructor which overwrites the dispatch pointer.
763 static RooBatchComputeClass computeObj;
764
765 } //End namespace RF_ARCH
766} //End namespace RooBatchCompute