doc/v626/ComputeFunctions_8cxx_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN, Summer 2019

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


/**

\file ComputeFunctions.cxx

\ingroup Roobatchcompute


This file contains vectorizable computation functions for PDFs and other Roofit objects.

The same source file can also be compiled with nvcc. All functions have a single `Batches`

object as an argument passed by value, which contains all the information necessary for the

computation. In case of cuda computations, the loops have a step (stride) the size of the grid

which allows for reusing the same code as the cpu implementations, easier debugging and in terms

of performance, maximum memory coalescing. For more details, see

https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/

**/


#include "RooBatchCompute.h"

#include "RooVDTHeaders.h"

#include "Batches.h"


#include <TMath.h>


#include <complex>


#include "faddeeva_impl.h"


#ifdef __CUDACC__

#define BEGIN blockDim.x *blockIdx.x + threadIdx.x

#define STEP blockDim.x *gridDim.x

#else

#define BEGIN 0

#define STEP 1

#endif // #ifdef __CUDACC__


namespace RooBatchCompute {

namespace RF_ARCH {


__rooglobal__ void computeAddPdf(BatchesHandle batches)

{

   const int nPdfs = batches.getNExtraArgs();

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = batches.extraArg(0) * batches[0][i];

   for (int pdf = 1; pdf < nPdfs; pdf++)

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] += batches.extraArg(pdf) * batches[pdf][i];

}


__rooglobal__ void computeArgusBG(BatchesHandle batches)

{

   Batch m = batches[0], m0 = batches[1], c = batches[2], p = batches[3];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double t = m[i] / m0[i];

      const double u = 1 - t * t;

      batches._output[i] = c[i] * u + p[i] * fast_log(u);

   }

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      if (m[i] >= m0[i])

         batches._output[i] = 0.0;

      else

         batches._output[i] = m[i] * fast_exp(batches._output[i]);

   }

}


__rooglobal__ void computeBernstein(BatchesHandle batches)

{

   const int nCoef = batches.getNExtraArgs() - 2;

   const int degree = nCoef - 1;

   const double xmin = batches.extraArg(nCoef);

   const double xmax = batches.extraArg(nCoef + 1);

   Batch xData = batches[0];


   // apply binomial coefficient in-place so we don't have to allocate new memory

   double binomial = 1.0;

   for (int k = 0; k < nCoef; k++) {

      batches.setExtraArg(k, batches.extraArg(k) * binomial);

      binomial = (binomial * (degree - k)) / (k + 1);

   }


   if (STEP == 1) {

      double X[bufferSize], _1_X[bufferSize], powX[bufferSize], pow_1_X[bufferSize];

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         powX[i] = pow_1_X[i] = 1.0;

         X[i] = (xData[i] - xmin) / (xmax - xmin);

         _1_X[i] = 1 - X[i];

         batches._output[i] = 0.0;

      }


      // raising 1-x to the power of degree

      for (int k = 2; k <= degree; k += 2)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

            pow_1_X[i] *= _1_X[i] * _1_X[i];


      if (degree % 2 == 1)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

            pow_1_X[i] *= _1_X[i];


      // inverting 1-x ---> 1/(1-x)

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         _1_X[i] = 1 / _1_X[i];


      for (int k = 0; k < nCoef; k++)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

            batches._output[i] += batches.extraArg(k) * powX[i] * pow_1_X[i];


            // calculating next power for x and 1-x

            powX[i] *= X[i];

            pow_1_X[i] *= _1_X[i];

         }

   } else

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         batches._output[i] = 0.0;

         const double X = (xData[i] - xmin) / (xmax - xmin);

         double powX = 1.0, pow_1_X = 1.0;

         for (int k = 1; k <= degree; k++)

            pow_1_X *= 1 - X;

         const double _1_X = 1 / (1 - X);

         for (int k = 0; k < nCoef; k++) {

            batches._output[i] += batches.extraArg(k) * powX * pow_1_X;

            powX *= X;

            pow_1_X *= _1_X;

         }

      }


   // reset extraArgs values so we don't mutate the Batches object

   binomial = 1.0;

   for (int k = 0; k < nCoef; k++) {

      batches.setExtraArg(k, batches.extraArg(k) / binomial);

      binomial = (binomial * (degree - k)) / (k + 1);

   }

}


__rooglobal__ void computeBifurGauss(BatchesHandle batches)

{

   Batch X = batches[0], M = batches[1], SL = batches[2], SR = batches[3];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      double arg = X[i] - M[i];

      if (arg < 0)

         arg /= SL[i];

      else

         arg /= SR[i];

      batches._output[i] = fast_exp(-0.5 * arg * arg);

   }

}


__rooglobal__ void computeBreitWigner(BatchesHandle batches)

{

   Batch X = batches[0], M = batches[1], W = batches[2];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double arg = X[i] - M[i];

      batches._output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]);

   }

}


__rooglobal__ void computeBukin(BatchesHandle batches)

{

   Batch X = batches[0], XP = batches[1], SP = batches[2], XI = batches[3], R1 = batches[4], R2 = batches[5];

   const double r3 = log(2.0);

   const double r6 = exp(-6.0);

   const double r7 = 2 * sqrt(2 * log(2.0));


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double r1 = XI[i] * fast_isqrt(XI[i] * XI[i] + 1);

      const double r4 = 1 / fast_isqrt(XI[i] * XI[i] + 1);

      const double hp = 1 / (SP[i] * r7);

      const double x1 = XP[i] + 0.5 * SP[i] * r7 * (r1 - 1);

      const double x2 = XP[i] + 0.5 * SP[i] * r7 * (r1 + 1);


      double r5 = 1.0;

      if (XI[i] > r6 || XI[i] < -r6)

         r5 = XI[i] / fast_log(r4 + XI[i]);


      double factor = 1, y = X[i] - x1, Yp = XP[i] - x1, yi = r4 - XI[i], rho = R1[i];

      if (X[i] >= x2) {

         factor = -1;

         y = X[i] - x2;

         Yp = XP[i] - x2;

         yi = r4 + XI[i];

         rho = R2[i];

      }


      batches._output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi;

      if (X[i] >= x1 && X[i] < x2) {

         batches._output[i] =

            fast_log(1 + 4 * XI[i] * r4 * (X[i] - XP[i]) * hp) / fast_log(1 + 2 * XI[i] * (XI[i] - r4));

         batches._output[i] *= -batches._output[i] * r3;

      }

      if (X[i] >= x1 && X[i] < x2 && XI[i] < r6 && XI[i] > -r6)

         batches._output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp;

   }

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = fast_exp(batches._output[i]);

}


__rooglobal__ void computeCBShape(BatchesHandle batches)

{

   Batch M = batches[0], M0 = batches[1], S = batches[2], A = batches[3], N = batches[4];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double t = (M[i] - M0[i]) / S[i];

      if ((A[i] > 0 && t >= -A[i]) || (A[i] < 0 && -t >= A[i]))

         batches._output[i] = -0.5 * t * t;

      else {

         batches._output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t);

         batches._output[i] = fast_log(batches._output[i]);

         batches._output[i] *= N[i];

         batches._output[i] -= 0.5 * A[i] * A[i];

      }

   }

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = fast_exp(batches._output[i]);

}


__rooglobal__ void computeChebychev(BatchesHandle batches)

{

   Batch xData = batches[0];

   const int nCoef = batches.getNExtraArgs() - 2;

   const double xmin = batches.extraArg(nCoef);

   const double xmax = batches.extraArg(nCoef + 1);


   if (STEP == 1) {

      double prev[bufferSize][2], X[bufferSize];


      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         // set a0-->prev[i][0] and a1-->prev[i][1]

         // and x tranfsformed to range[-1..1]-->X[i]

         prev[i][0] = batches._output[i] = 1.0;

         prev[i][1] = X[i] = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin);

      }

      for (int k = 0; k < nCoef; k++)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

            batches._output[i] += prev[i][1] * batches.extraArg(k);


            // compute next order

            const double next = 2 * X[i] * prev[i][1] - prev[i][0];

            prev[i][0] = prev[i][1];

            prev[i][1] = next;

         }

   } else

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         double prev0 = 1.0, prev1 = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin), X = prev1;

         batches._output[i] = 1.0;

         for (int k = 0; k < nCoef; k++) {

            batches._output[i] += prev1 * batches.extraArg(k);


            // compute next order

            const double next = 2 * X * prev1 - prev0;

            prev0 = prev1;

            prev1 = next;

         }

      }

}


__rooglobal__ void computeChiSquare(BatchesHandle batches)

{

   Batch X = batches[0];

   const double ndof = batches.extraArg(0);

   const double gamma = 1 / std::tgamma(ndof / 2.0);

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = gamma;


   constexpr double ln2 = 0.693147180559945309417232121458;

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      double arg = (ndof - 2) * fast_log(X[i]) - X[i] - ndof * ln2;

      batches._output[i] *= fast_exp(0.5 * arg);

   }

}


__rooglobal__ void computeDstD0BG(BatchesHandle batches)

{

   Batch DM = batches[0], DM0 = batches[1], C = batches[2], A = batches[3], B = batches[4];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double ratio = DM[i] / DM0[i];

      const double arg1 = (DM0[i] - DM[i]) / C[i];

      const double arg2 = A[i] * fast_log(ratio);

      batches._output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1);

   }


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (batches._output[i] < 0)

         batches._output[i] = 0;

}


__rooglobal__ void computeExponential(BatchesHandle batches)

{

   Batch x = batches[0], c = batches[1];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = fast_exp(x[i] * c[i]);

}


__rooglobal__ void computeGamma(BatchesHandle batches)

{

   Batch X = batches[0], G = batches[1], B = batches[2], M = batches[3];

   double gamma = -std::lgamma(G[0]);

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (X[i] == M[i])

         batches._output[i] = (G[i] == 1.0) / B[i];

      else if (G.isItVector())

         batches._output[i] = -std::lgamma(G[i]);

      else

         batches._output[i] = gamma;


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (X[i] != M[i]) {

         const double invBeta = 1 / B[i];

         double arg = (X[i] - M[i]) * invBeta;

         batches._output[i] -= arg;

         arg = fast_log(arg);

         batches._output[i] += arg * (G[i] - 1);

         batches._output[i] = fast_exp(batches._output[i]);

         batches._output[i] *= invBeta;

      }

}


__rooglobal__ void computeGaussian(BatchesHandle batches)

{

   auto x = batches[0], mean = batches[1], sigma = batches[2];

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double arg = x[i] - mean[i];

      const double halfBySigmaSq = -0.5 / (sigma[i] * sigma[i]);

      batches._output[i] = fast_exp(arg * arg * halfBySigmaSq);

   }

}


__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches)

{

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = -fast_log(batches[0][i]);

   // Multiply by weights if they exist

   if (batches.extraArg(0))

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] *= batches[1][i];

}


__rooglobal__ void computeJohnson(BatchesHandle batches)

{

   Batch mass = batches[0], mu = batches[1], lambda = batches[2], gamma = batches[3], delta = batches[4];

   const double sqrtTwoPi = std::sqrt(TMath::TwoPi());

   const double massThreshold = batches.extraArg(0);


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double arg = (mass[i] - mu[i]) / lambda[i];

#ifdef R__HAS_VDT

      const double asinh_arg = fast_log(arg + 1 / fast_isqrt(arg * arg + 1));

#else

      const double asinh_arg = asinh(arg);

#endif

      const double expo = gamma[i] + delta[i] * asinh_arg;

      const double result =

         delta[i] * fast_exp(-0.5 * expo * expo) * fast_isqrt(1. + arg * arg) / (sqrtTwoPi * lambda[i]);


      const double passThrough = mass[i] >= massThreshold;

      batches._output[i] = result * passThrough;

   }

}


/* Actual computation of Landau(x,mean,sigma) in a vectorization-friendly way

 * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx)

 * and rewritten to enable vectorization.

 */

__rooglobal__ void computeLandau(BatchesHandle batches)

{

   auto case0 = [](double x) {

      const double a1[3] = {0.04166666667, -0.01996527778, 0.02709538966};

      const double u = fast_exp(x + 1.0);

      return 0.3989422803 * fast_exp(-1 / u - 0.5 * (x + 1)) * (1 + (a1[0] + (a1[1] + a1[2] * u) * u) * u);

   };

   auto case1 = [](double x) {

      constexpr double p1[5] = {0.4259894875, -0.1249762550, 0.03984243700, -0.006298287635, 0.001511162253};

      constexpr double q1[5] = {1.0, -0.3388260629, 0.09594393323, -0.01608042283, 0.003778942063};

      const double u = fast_exp(-x - 1);

      return fast_exp(-u - 0.5 * (x + 1)) * (p1[0] + (p1[1] + (p1[2] + (p1[3] + p1[4] * x) * x) * x) * x) /

             (q1[0] + (q1[1] + (q1[2] + (q1[3] + q1[4] * x) * x) * x) * x);

   };

   auto case2 = [](double x) {

      constexpr double p2[5] = {0.1788541609, 0.1173957403, 0.01488850518, -0.001394989411, 0.0001283617211};

      constexpr double q2[5] = {1.0, 0.7428795082, 0.3153932961, 0.06694219548, 0.008790609714};

      return (p2[0] + (p2[1] + (p2[2] + (p2[3] + p2[4] * x) * x) * x) * x) /

             (q2[0] + (q2[1] + (q2[2] + (q2[3] + q2[4] * x) * x) * x) * x);

   };

   auto case3 = [](double x) {

      constexpr double p3[5] = {0.1788544503, 0.09359161662, 0.006325387654, 0.00006611667319, -0.000002031049101};

      constexpr double q3[5] = {1.0, 0.6097809921, 0.2560616665, 0.04746722384, 0.006957301675};

      return (p3[0] + (p3[1] + (p3[2] + (p3[3] + p3[4] * x) * x) * x) * x) /

             (q3[0] + (q3[1] + (q3[2] + (q3[3] + q3[4] * x) * x) * x) * x);

   };

   auto case4 = [](double x) {

      constexpr double p4[5] = {0.9874054407, 118.6723273, 849.2794360, -743.7792444, 427.0262186};

      constexpr double q4[5] = {1.0, 106.8615961, 337.6496214, 2016.712389, 1597.063511};

      const double u = 1 / x;

      return u * u * (p4[0] + (p4[1] + (p4[2] + (p4[3] + p4[4] * u) * u) * u) * u) /

             (q4[0] + (q4[1] + (q4[2] + (q4[3] + q4[4] * u) * u) * u) * u);

   };

   auto case5 = [](double x) {

      constexpr double p5[5] = {1.003675074, 167.5702434, 4789.711289, 21217.86767, -22324.94910};

      constexpr double q5[5] = {1.0, 156.9424537, 3745.310488, 9834.698876, 66924.28357};

      const double u = 1 / x;

      return u * u * (p5[0] + (p5[1] + (p5[2] + (p5[3] + p5[4] * u) * u) * u) * u) /

             (q5[0] + (q5[1] + (q5[2] + (q5[3] + q5[4] * u) * u) * u) * u);

   };

   auto case6 = [](double x) {

      constexpr double p6[5] = {1.000827619, 664.9143136, 62972.92665, 475554.6998, -5743609.109};

      constexpr double q6[5] = {1.0, 651.4101098, 56974.73333, 165917.4725, -2815759.939};

      const double u = 1 / x;

      return u * u * (p6[0] + (p6[1] + (p6[2] + (p6[3] + p6[4] * u) * u) * u) * u) /

             (q6[0] + (q6[1] + (q6[2] + (q6[3] + q6[4] * u) * u) * u) * u);

   };

   auto case7 = [](double x) {

      const double a2[2] = {-1.845568670, -4.284640743};

      const double u = 1 / (x - x * fast_log(x) / (x + 1));

      return u * u * (1 + (a2[0] + a2[1] * u) * u);

   };


   Batch X = batches[0], M = batches[1], S = batches[2];


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = (X[i] - M[i]) / S[i];


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (S[i] <= 0.0)

         batches._output[i] = 0;

      else if (batches._output[i] < -5.5)

         batches._output[i] = case0(batches._output[i]);

      else if (batches._output[i] < -1.0)

         batches._output[i] = case1(batches._output[i]);

      else if (batches._output[i] < 1.0)

         batches._output[i] = case2(batches._output[i]);

      else if (batches._output[i] < 5.0)

         batches._output[i] = case3(batches._output[i]);

      else if (batches._output[i] < 12.0)

         batches._output[i] = case4(batches._output[i]);

      else if (batches._output[i] < 50.0)

         batches._output[i] = case5(batches._output[i]);

      else if (batches._output[i] < 300.)

         batches._output[i] = case6(batches._output[i]);

      else

         batches._output[i] = case7(batches._output[i]);

}


__rooglobal__ void computeLognormal(BatchesHandle batches)

{

   Batch X = batches[0], M0 = batches[1], K = batches[2];

   const double rootOf2pi = 2.506628274631000502415765284811;

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      double lnxOverM0 = fast_log(X[i] / M0[i]);

      double lnk = fast_log(K[i]);

      if (lnk < 0)

         lnk = -lnk;

      double arg = lnxOverM0 / lnk;

      arg *= -0.5 * arg;

      batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi);

   }

}


/* TMath::ASinH(x) needs to be replaced with ln( x + sqrt(x^2+1))

 * argasinh -> the argument of TMath::ASinH()

 * argln -> the argument of the logarithm that replaces AsinH

 * asinh -> the value that the function evaluates to

 *

 * ln is the logarithm that was solely present in the initial

 * formula, that is before the asinh replacement

 */

__rooglobal__ void computeNovosibirsk(BatchesHandle batches)

{

   Batch X = batches[0], P = batches[1], W = batches[2], T = batches[3];

   constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) )

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      double argasinh = 0.5 * xi * T[i];

      double argln = argasinh + 1 / fast_isqrt(argasinh * argasinh + 1);

      double asinh = fast_log(argln);


      double argln2 = 1 - (X[i] - P[i]) * T[i] / W[i];

      double ln = fast_log(argln2);

      batches._output[i] = ln / asinh;

      batches._output[i] *= -0.125 * xi * xi * batches._output[i];

      batches._output[i] -= 2.0 / xi / xi * asinh * asinh;

   }


   // faster if you exponentiate in a seperate loop (dark magic!)

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = fast_exp(batches._output[i]);

}


__rooglobal__ void computePoisson(BatchesHandle batches)

{

   Batch x = batches[0], mean = batches[1];

   bool protectNegative = batches.extraArg(0);

   bool noRounding = batches.extraArg(1);

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double x_i = noRounding ? x[i] : floor(x[i]);

      batches._output[i] = std::lgamma(x_i + 1.);

   }


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double x_i = noRounding ? x[i] : floor(x[i]);

      const double logMean = fast_log(mean[i]);

      const double logPoisson = x_i * logMean - mean[i] - batches._output[i];

      batches._output[i] = fast_exp(logPoisson);


      // Cosmetics

      if (x_i < 0)

         batches._output[i] = 0;

      else if (x_i == 0)

         batches._output[i] = 1 / fast_exp(mean[i]);


      if (protectNegative && mean[i] < 0)

         batches._output[i] = 1.E-3;

   }

}


__rooglobal__ void computePolynomial(BatchesHandle batches)

{

   Batch X = batches[0];

   const int nCoef = batches.getNExtraArgs() - 1;

   const int lowestOrder = batches.extraArg(nCoef);

   if (nCoef == 0) {

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] = (lowestOrder > 0.0);

      return;

   } else

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] = batches.extraArg(nCoef - 1);


   /* Indexes are in range 0..nCoef-1 but coefList[nCoef-1]

    * has already been processed. In order to traverse the list,

    * with step of 2 we have to start at index nCoef-3 and use

    * coefList[k+1] and coefList[k]

    */

   for (int k = nCoef - 3; k >= 0; k -= 2)

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] = X[i] * (batches._output[i] * X[i] + batches.extraArg(k + 1)) + batches.extraArg(k);


   // If nCoef is even, then the coefList[0] didn't get processed

   if (nCoef % 2 == 0)

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] = batches._output[i] * X[i] + batches.extraArg(0);


   // Increase the order of the polynomial, first by myltiplying with X[i]^2

   if (lowestOrder != 0) {

      for (int k = 2; k <= lowestOrder; k += 2)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

            batches._output[i] *= X[i] * X[i];


      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         if (lowestOrder % 2 == 1)

            batches._output[i] *= X[i];

         batches._output[i] += 1.0;

      }

   }

}


__rooglobal__ void computeProdPdf(BatchesHandle batches)

{

   const int nPdfs = batches.extraArg(0);

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = 1.;

   for (int pdf = 0; pdf < nPdfs; pdf++)

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         batches._output[i] *= batches[pdf][i];

}


__rooglobal__ void computeRatio(BatchesHandle batches)

{

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = batches[0][i] / batches[1][i];

}


__rooglobal__ void computeVoigtian(BatchesHandle batches)

{

   Batch X = batches[0], M = batches[1], W = batches[2], S = batches[3];

   const double invSqrt2 = 0.707106781186547524400844362105;

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      const double arg = (X[i] - M[i]) * (X[i] - M[i]);

      if (S[i] == 0.0 && W[i] == 0.0)

         batches._output[i] = 1.0;

      else if (S[i] == 0.0)

         batches._output[i] = 1 / (arg + 0.25 * W[i] * W[i]);

      else if (W[i] == 0.0)

         batches._output[i] = fast_exp(-0.5 * arg / (S[i] * S[i]));

      else

         batches._output[i] = invSqrt2 / S[i];

   }


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (S[i] != 0.0 && W[i] != 0.0) {

         if (batches._output[i] < 0)

            batches._output[i] = -batches._output[i];

         const double factor = W[i] > 0.0 ? 0.5 : -0.5;

         std::complex<double> z(batches._output[i] * (X[i] - M[i]), factor * batches._output[i] * W[i]);

         batches._output[i] *= faddeeva_impl::faddeeva(z).real();

      }

}


/// Returns a std::vector of pointers to the compute functions in this file.

std::vector<void (*)(BatchesHandle)> getFunctions()

{

   return {computeAddPdf,

           computeArgusBG,

           computeBernstein,

           computeBifurGauss,

           computeBreitWigner,

           computeBukin,

           computeCBShape,

           computeChebychev,

           computeChiSquare,

           computeDstD0BG,

           computeExponential,

           computeGamma,

           computeGaussian,

           computeJohnson,

           computeLandau,

           computeLognormal,

           computeNegativeLogarithms,

           computeNovosibirsk,

           computePoisson,

           computePolynomial,

           computeProdPdf,

           computeRatio,

           computeVoigtian};

}

} // End namespace RF_ARCH

} // End namespace RooBatchCompute

Batches.h

STEP
#define STEP
Definition ComputeFunctions.cxx:41

BEGIN
#define BEGIN
Definition ComputeFunctions.cxx:40

void
typedef void(GLAPIENTRYP _GLUfuncptr)(void)

c
#define c(i)
Definition RSha256.hxx:101

__rooglobal__
#define __rooglobal__
Definition RooBatchComputeTypes.h:27

RooBatchCompute.h

x2
static const double x2[5]
Definition RooGaussKronrodIntegrator1D.cxx:364

x1
static const double x1[5]
Definition RooGaussKronrodIntegrator1D.cxx:346

RooVDTHeaders.h

N
#define N

xmin
float xmin
Definition THbookFile.cxx:95

xmax
float xmax
Definition THbookFile.cxx:95

TMath.h

RooBatchCompute::RF_ARCH::Batch
Definition Batches.h:42

RooBatchCompute::RF_ARCH::Batches
Definition Batches.h:69

RooBatchCompute::RF_ARCH::Batches::setExtraArg
__roodevice__ void setExtraArg(uint8_t i, double val)
Definition Batches.h:102

RooBatchCompute::RF_ARCH::Batches::getNEvents
__roodevice__ size_t getNEvents() const
Definition Batches.h:99

RooBatchCompute::RF_ARCH::Batches::getNExtraArgs
__roodevice__ uint8_t getNExtraArgs() const
Definition Batches.h:100

RooBatchCompute::RF_ARCH::Batches::_output
RestrictArr _output
Definition Batches.h:84

RooBatchCompute::RF_ARCH::Batches::extraArg
__roodevice__ double extraArg(uint8_t i) const
Definition Batches.h:101

faddeeva_impl.h

sigma
const Double_t sigma
Definition h1analysisProxy.h:11

y
Double_t y[n]
Definition legend1.C:17

x
Double_t x[n]
Definition legend1.C:17

G
#define G(x, y, z)

RooBatchCompute::RF_ARCH::computeExponential
__rooglobal__ void computeExponential(BatchesHandle batches)
Definition ComputeFunctions.cxx:291

RooBatchCompute::RF_ARCH::computeDstD0BG
__rooglobal__ void computeDstD0BG(BatchesHandle batches)
Definition ComputeFunctions.cxx:276

RooBatchCompute::RF_ARCH::computeLandau
__rooglobal__ void computeLandau(BatchesHandle batches)
Definition ComputeFunctions.cxx:368

RooBatchCompute::RF_ARCH::computeArgusBG
__rooglobal__ void computeArgusBG(BatchesHandle batches)
Definition ComputeFunctions.cxx:57

RooBatchCompute::RF_ARCH::computeBukin
__rooglobal__ void computeBukin(BatchesHandle batches)
Definition ComputeFunctions.cxx:163

RooBatchCompute::RF_ARCH::computeNovosibirsk
__rooglobal__ void computeNovosibirsk(BatchesHandle batches)
Definition ComputeFunctions.cxx:470

RooBatchCompute::RF_ARCH::computePolynomial
__rooglobal__ void computePolynomial(BatchesHandle batches)
Definition ComputeFunctions.cxx:518

RooBatchCompute::RF_ARCH::computePoisson
__rooglobal__ void computePoisson(BatchesHandle batches)
Definition ComputeFunctions.cxx:491

RooBatchCompute::RF_ARCH::computeBifurGauss
__rooglobal__ void computeBifurGauss(BatchesHandle batches)
Definition ComputeFunctions.cxx:141

RooBatchCompute::RF_ARCH::computeRatio
__rooglobal__ void computeRatio(BatchesHandle batches)
Definition ComputeFunctions.cxx:569

RooBatchCompute::RF_ARCH::computeAddPdf
__rooglobal__ void computeAddPdf(BatchesHandle batches)
Definition ComputeFunctions.cxx:47

RooBatchCompute::RF_ARCH::computeChiSquare
__rooglobal__ void computeChiSquare(BatchesHandle batches)
Definition ComputeFunctions.cxx:261

RooBatchCompute::RF_ARCH::computeChebychev
__rooglobal__ void computeChebychev(BatchesHandle batches)
Definition ComputeFunctions.cxx:221

RooBatchCompute::RF_ARCH::computeLognormal
__rooglobal__ void computeLognormal(BatchesHandle batches)
Definition ComputeFunctions.cxx:447

RooBatchCompute::RF_ARCH::computeVoigtian
__rooglobal__ void computeVoigtian(BatchesHandle batches)
Definition ComputeFunctions.cxx:575

RooBatchCompute::RF_ARCH::computeGaussian
__rooglobal__ void computeGaussian(BatchesHandle batches)
Definition ComputeFunctions.cxx:322

RooBatchCompute::RF_ARCH::computeBernstein
__rooglobal__ void computeBernstein(BatchesHandle batches)
Definition ComputeFunctions.cxx:73

RooBatchCompute::RF_ARCH::computeNegativeLogarithms
__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches)
Definition ComputeFunctions.cxx:332

RooBatchCompute::RF_ARCH::computeGamma
__rooglobal__ void computeGamma(BatchesHandle batches)
Definition ComputeFunctions.cxx:298

RooBatchCompute::RF_ARCH::getFunctions
std::vector< void(*)(BatchesHandle)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
Definition ComputeFunctions.cxx:602

RooBatchCompute::RF_ARCH::BatchesHandle
Batches & BatchesHandle
Definition Batches.h:117

RooBatchCompute::RF_ARCH::computeJohnson
__rooglobal__ void computeJohnson(BatchesHandle batches)
Definition ComputeFunctions.cxx:342

RooBatchCompute::RF_ARCH::computeBreitWigner
__rooglobal__ void computeBreitWigner(BatchesHandle batches)
Definition ComputeFunctions.cxx:154

RooBatchCompute::RF_ARCH::computeProdPdf
__rooglobal__ void computeProdPdf(BatchesHandle batches)
Definition ComputeFunctions.cxx:559

RooBatchCompute::RF_ARCH::computeCBShape
__rooglobal__ void computeCBShape(BatchesHandle batches)
Definition ComputeFunctions.cxx:203

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition BracketAdapters.h:24

RooBatchCompute::fast_exp
__roodevice__ double fast_exp(double x)
Definition RooVDTHeaders.h:53

RooBatchCompute::fast_log
__roodevice__ double fast_log(double x)
Definition RooVDTHeaders.h:57

RooBatchCompute::bufferSize
constexpr uint16_t bufferSize
Definition Batches.h:38

RooBatchCompute::fast_isqrt
__roodevice__ double fast_isqrt(double x)
Definition RooVDTHeaders.h:61

TMath::TwoPi
constexpr Double_t TwoPi()
Definition TMath.h:44

faddeeva_impl::faddeeva
__roodevice__ __roohost__ std::complex< double > faddeeva(std::complex< double > z)
Definition faddeeva_impl.h:528

R1
#define R1(v, w, x, y, z, i)
Definition sha1.inl:134

R2
#define R2(v, w, x, y, z, i)
Definition sha1.inl:137

m
auto * m
Definition textangle.C:8