doc/v628/ComputeFunctions_8cxx_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN, Summer 2019

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


/**

\file ComputeFunctions.cxx

\ingroup Roobatchcompute


This file contains vectorizable computation functions for PDFs and other RooFit objects.

The same source file can also be compiled with nvcc. All functions have a single `Batches`

object as an argument passed by value, which contains all the information necessary for the

computation. In case of cuda computations, the loops have a step (stride) the size of the grid

which allows for reusing the same code as the cpu implementations, easier debugging and in terms

of performance, maximum memory coalescing. For more details, see

https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/

**/


#include "RooBatchCompute.h"

#include "RooNaNPacker.h"

#include "RooVDTHeaders.h"

#include "Batches.h"


#include <TMath.h>


#include <RooHeterogeneousMath.h>


// Loop bounds shared by every compute function in this file:
//   for (size_t i = BEGIN; i < nEvents; i += STEP)
// When compiled with nvcc each thread starts at its global thread index and advances by the
// total number of threads in the grid; otherwise the loop is a plain 0..n loop with step 1.
// The replacement lists are parenthesized so the macros stay correct when they are used inside
// a larger expression (e.g. `2 * STEP` or `BEGIN * k`).
#ifdef __CUDACC__
#define BEGIN (blockDim.x * blockIdx.x + threadIdx.x)
#define STEP (blockDim.x * gridDim.x)
#else
#define BEGIN 0
#define STEP 1
#endif // #ifdef __CUDACC__


namespace RooBatchCompute {

namespace RF_ARCH {


/// Weighted sum of the input batches: output[i] = sum_k extraArg(k) * batches[k][i].
/// The number of terms is the number of extra arguments.
__rooglobal__ void computeAddPdf(BatchesHandle batches)
{
   const int nTerms = batches.getNExtraArgs();
   const size_t nEvents = batches.getNEvents();

   // The first term initialises the output, so no separate zeroing pass is needed.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = batches.extraArg(0) * batches[0][evt];
   }

   // Every remaining term is accumulated on top, one full pass over the events per term.
   for (int term = 1; term < nTerms; ++term) {
      for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
         batches._output[evt] += batches.extraArg(term) * batches[term][evt];
      }
   }
}


/// Argus background shape.
/// Inputs: batches[0] = m, batches[1] = m0, batches[2] = c, batches[3] = p.
/// With u = 1 - (m/m0)^2 the output is m * exp(c*u + p*log(u)) for m < m0 and 0 otherwise.
__rooglobal__ void computeArgusBG(BatchesHandle batches)
{
   Batch mass = batches[0], mass0 = batches[1], cPar = batches[2], pPar = batches[3];
   const size_t nEvents = batches.getNEvents();

   // Pass 1: store the exponent c*u + p*log(u) in the output buffer.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double ratio = mass[evt] / mass0[evt];
      const double oneMinusRatioSq = 1 - ratio * ratio;
      batches._output[evt] = cPar[evt] * oneMinusRatioSq + pPar[evt] * fast_log(oneMinusRatioSq);
   }

   // Pass 2: exponentiate below m0; at or above m0 the value is set to zero.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double exponent = batches._output[evt];
      batches._output[evt] = (mass[evt] >= mass0[evt]) ? 0.0 : mass[evt] * fast_exp(exponent);
   }
}


/// B-mixing decay combination of two coefficient batches.
/// output[i] = coef0[i] * (1 - tagFlav[i] * delMistag[0]) + coef1[i] * mixState[i] * (1 - 2 * mistag[0]).
/// Note that delMistag and mistag are only ever read at index 0.
__rooglobal__ void computeBMixDecay(BatchesHandle batches)
{
   Batch coef0 = batches[0], coef1 = batches[1];
   Batch tagFlav = batches[2], delMistag = batches[3];
   Batch mixState = batches[4], mistag = batches[5];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double firstTerm = coef0[evt] * (1.0 - tagFlav[evt] * delMistag[0]);
      const double secondTerm = coef1[evt] * (mixState[evt] * (1.0 - 2.0 * mistag[0]));
      batches._output[evt] = firstTerm + secondTerm;
   }
}


/// Evaluates, for every event, sum_k c_k * binom(degree, k) * X^k * (1 - X)^(degree - k)
/// with X = (x - xmin) / (xmax - xmin) and x read from batches[0].
/// The extra arguments hold the coefficients c_k followed by xmin and xmax.
/// The coefficients are temporarily scaled in place by the binomial factors and restored at the end.
__rooglobal__ void computeBernstein(BatchesHandle batches)

{

   // The last two extra arguments are xmin and xmax; everything before them is a coefficient.
   const int nCoef = batches.getNExtraArgs() - 2;

   const int degree = nCoef - 1;

   const double xmin = batches.extraArg(nCoef);

   const double xmax = batches.extraArg(nCoef + 1);

   Batch xData = batches[0];


   // apply binomial coefficient in-place so we don't have to allocate new memory
   // (binomial runs through binom(degree, k) via the recurrence in the loop body)

   double binomial = 1.0;

   for (int k = 0; k < nCoef; k++) {

      batches.setExtraArg(k, batches.extraArg(k) * binomial);

      binomial = (binomial * (degree - k)) / (k + 1);

   }


   // Stride of 1 (always the case in non-CUDA builds, where STEP is defined as 1):
   // work on per-event scratch arrays so each step is a simple loop over the events.
   if (STEP == 1) {

      // NOTE(review): these arrays are indexed with i < getNEvents(), so this assumes
      // getNEvents() <= bufferSize -- confirm against the caller.
      double X[bufferSize], _1_X[bufferSize], powX[bufferSize], pow_1_X[bufferSize];

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         powX[i] = pow_1_X[i] = 1.0;

         X[i] = (xData[i] - xmin) / (xmax - xmin);

         _1_X[i] = 1 - X[i];

         batches._output[i] = 0.0;

      }


      // raising 1-x to the power of degree
      // (two factors per pass, plus one extra factor below when degree is odd)

      for (int k = 2; k <= degree; k += 2)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

            pow_1_X[i] *= _1_X[i] * _1_X[i];


      if (degree % 2 == 1)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

            pow_1_X[i] *= _1_X[i];


      // inverting 1-x ---> 1/(1-x)
      // so that multiplying pow_1_X by it lowers the power of (1-x) by one

      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

         _1_X[i] = 1 / _1_X[i];


      // Accumulate the terms: at step k, powX holds X^k and pow_1_X holds (1-X)^(degree-k).
      for (int k = 0; k < nCoef; k++)

         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

            batches._output[i] += batches.extraArg(k) * powX[i] * pow_1_X[i];


            // calculating next power for x and 1-x

            powX[i] *= X[i];

            pow_1_X[i] *= _1_X[i];

         }

   } else

      // Any other stride: same computation, but with per-event scalars instead of scratch arrays.
      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

         batches._output[i] = 0.0;

         const double X = (xData[i] - xmin) / (xmax - xmin);

         double powX = 1.0, pow_1_X = 1.0;

         // pow_1_X = (1 - X)^degree
         for (int k = 1; k <= degree; k++)

            pow_1_X *= 1 - X;

         // From here on _1_X is the inverse 1/(1 - X), used to step the power of (1 - X) down.
         const double _1_X = 1 / (1 - X);

         for (int k = 0; k < nCoef; k++) {

            batches._output[i] += batches.extraArg(k) * powX * pow_1_X;

            powX *= X;

            pow_1_X *= _1_X;

         }

      }


   // reset extraArgs values so we don't mutate the Batches object
   // (divides each coefficient by the same binomial factor it was multiplied with above)

   binomial = 1.0;

   for (int k = 0; k < nCoef; k++) {

      batches.setExtraArg(k, batches.extraArg(k) / binomial);

      binomial = (binomial * (degree - k)) / (k + 1);

   }

}


/// Bifurcated Gaussian: exp(-0.5 * ((x - mean) / sigma)^2), where sigma is batches[2] for
/// x - mean < 0 and batches[3] otherwise. Inputs: batches[0] = x, batches[1] = mean.
__rooglobal__ void computeBifurGauss(BatchesHandle batches)
{
   Batch x = batches[0], mean = batches[1], sigmaLeft = batches[2], sigmaRight = batches[3];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double diff = x[evt] - mean[evt];
      // Pick the width that belongs to the side of the mean the event is on.
      const double width = (diff < 0) ? sigmaLeft[evt] : sigmaRight[evt];
      const double pull = diff / width;
      batches._output[evt] = fast_exp(-0.5 * pull * pull);
   }
}


/// Breit-Wigner shape: output[i] = 1 / ((x - mean)^2 + 0.25 * width^2),
/// with x = batches[0], mean = batches[1] and width = batches[2].
__rooglobal__ void computeBreitWigner(BatchesHandle batches)
{
   Batch x = batches[0], mean = batches[1], width = batches[2];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double diff = x[evt] - mean[evt];
      const double denominator = diff * diff + 0.25 * width[evt] * width[evt];
      batches._output[evt] = 1 / denominator;
   }
}


/// Bukin shape. Inputs: batches[0] = X, [1] = XP, [2] = SP, [3] = XI, [4] = R1, [5] = R2.
/// The first loop stores the exponent of the function in the output buffer, choosing between a
/// left tail (X < x1), a right tail (X >= x2) and a central region (x1 <= X < x2); the second
/// loop exponentiates it.
__rooglobal__ void computeBukin(BatchesHandle batches)

{

   Batch X = batches[0], XP = batches[1], SP = batches[2], XI = batches[3], R1 = batches[4], R2 = batches[5];

   // Constants: r3 = ln 2, r6 = e^-6 (threshold below which |XI| gets special treatment),
   // r7 = 2 * sqrt(2 ln 2).
   const double r3 = log(2.0);

   const double r6 = exp(-6.0);

   const double r7 = 2 * sqrt(2 * log(2.0));


   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {

      // NOTE(review): fast_isqrt is presumably an inverse square root, which would make
      // r1 = XI / sqrt(XI^2 + 1) and r4 = sqrt(XI^2 + 1) -- confirm in RooVDTHeaders.h.
      const double r1 = XI[i] * fast_isqrt(XI[i] * XI[i] + 1);

      const double r4 = 1 / fast_isqrt(XI[i] * XI[i] + 1);

      const double hp = 1 / (SP[i] * r7);

      // x1 and x2 delimit the central region used further down.
      const double x1 = XP[i] + 0.5 * SP[i] * r7 * (r1 - 1);

      const double x2 = XP[i] + 0.5 * SP[i] * r7 * (r1 + 1);


      // r5 stays 1 when |XI| <= r6, which avoids the XI / log(...) expression for tiny XI.
      double r5 = 1.0;

      if (XI[i] > r6 || XI[i] < -r6)

         r5 = XI[i] / fast_log(r4 + XI[i]);


      // Defaults describe the left tail (measured from x1, using R1) ...
      double factor = 1, y = X[i] - x1, Yp = XP[i] - x1, yi = r4 - XI[i], rho = R1[i];

      // ... and are switched to the right tail (measured from x2, using R2) for X >= x2.
      if (X[i] >= x2) {

         factor = -1;

         y = X[i] - x2;

         Yp = XP[i] - x2;

         yi = r4 + XI[i];

         rho = R2[i];

      }


      // Tail exponent; computed for every event and overwritten below for the central region.
      batches._output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi;

      // Central region: the exponent is -ln2 * (ratio of the two logarithms)^2.
      if (X[i] >= x1 && X[i] < x2) {

         batches._output[i] =

            fast_log(1 + 4 * XI[i] * r4 * (X[i] - XP[i]) * hp) / fast_log(1 + 2 * XI[i] * (XI[i] - r4));

         batches._output[i] *= -batches._output[i] * r3;

      }

      // Central region with |XI| < r6: replaces the logarithm ratio above by a form
      // that is quadratic in (X - XP).
      if (X[i] >= x1 && X[i] < x2 && XI[i] < r6 && XI[i] > -r6)

         batches._output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp;

   }

   // Turn the stored exponents into function values in a separate pass.
   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = fast_exp(batches._output[i]);

}


/// Crystal Ball shape. Inputs: batches[0] = m, [1] = m0, [2] = sigma, [3] = alpha, [4] = n.
/// With t = (m - m0) / sigma, the first pass stores the exponent: -0.5 * t^2 in the Gaussian
/// region, or n * log(n / (n - alpha^2 - alpha * t)) - 0.5 * alpha^2 in the tail region.
/// The second pass exponentiates it.
__rooglobal__ void computeCBShape(BatchesHandle batches)
{
   Batch m = batches[0], m0 = batches[1], sigma = batches[2], alpha = batches[3], n = batches[4];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = (m[evt] - m0[evt]) / sigma[evt];
      const bool inGaussianRegion =
         (alpha[evt] > 0 && t >= -alpha[evt]) || (alpha[evt] < 0 && -t >= alpha[evt]);

      if (inGaussianRegion) {
         batches._output[evt] = -0.5 * t * t;
      } else {
         double exponent = n[evt] / (n[evt] - alpha[evt] * alpha[evt] - alpha[evt] * t);
         exponent = fast_log(exponent);
         exponent *= n[evt];
         exponent -= 0.5 * alpha[evt] * alpha[evt];
         batches._output[evt] = exponent;
      }
   }

   // Exponentiate in a separate pass over the events.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = fast_exp(batches._output[evt]);
   }
}


/// Chebychev polynomial sum: output = 1 + sum_k extraArg(k) * T_{k+1}(X), where X is the
/// value from batches[0] mapped from [xmin, xmax] to [-1, 1]. The coefficients come first in
/// the extra arguments and are followed by xmin and xmax.
__rooglobal__ void computeChebychev(BatchesHandle batches)
{
   Batch xData = batches[0];
   const int nCoef = batches.getNExtraArgs() - 2;
   const double xmin = batches.extraArg(nCoef);
   const double xmax = batches.extraArg(nCoef + 1);
   const size_t nEvents = batches.getNEvents();

   if (STEP == 1) {
      // Stride of 1: keep the two most recent polynomial values and the transformed x
      // in per-event scratch arrays, so that each order is one simple pass over the events.
      double tLower[bufferSize], tUpper[bufferSize], xScaled[bufferSize];

      for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
         // T_0 = 1 --> tLower, T_1 = x --> tUpper, with x transformed to the range [-1..1]
         batches._output[evt] = 1.0;
         tLower[evt] = 1.0;
         xScaled[evt] = 2 * (xData[evt] - 0.5 * (xmax + xmin)) / (xmax - xmin);
         tUpper[evt] = xScaled[evt];
      }

      for (int order = 0; order < nCoef; ++order) {
         for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
            batches._output[evt] += tUpper[evt] * batches.extraArg(order);

            // recurrence T_{n+1} = 2 x T_n - T_{n-1}
            const double tNext = 2 * xScaled[evt] * tUpper[evt] - tLower[evt];
            tLower[evt] = tUpper[evt];
            tUpper[evt] = tNext;
         }
      }
   } else {
      // Any other stride: same recurrence with per-event scalars.
      for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
         const double xScaled = 2 * (xData[evt] - 0.5 * (xmax + xmin)) / (xmax - xmin);
         double tLower = 1.0;
         double tUpper = xScaled;
         batches._output[evt] = 1.0;

         for (int order = 0; order < nCoef; ++order) {
            batches._output[evt] += tUpper * batches.extraArg(order);

            // recurrence T_{n+1} = 2 x T_n - T_{n-1}
            const double tNext = 2 * xScaled * tUpper - tLower;
            tLower = tUpper;
            tUpper = tNext;
         }
      }
   }
}


/// Chi-square density with ndof = extraArg(0) evaluated at x = batches[0]:
/// output = 1 / Gamma(ndof / 2) * exp(0.5 * ((ndof - 2) * log(x) - x - ndof * ln 2)).
__rooglobal__ void computeChiSquare(BatchesHandle batches)
{
   Batch x = batches[0];
   const double ndof = batches.extraArg(0);
   const double invGamma = 1 / std::tgamma(ndof / 2.0);
   const size_t nEvents = batches.getNEvents();

   // Pass 1: the event-independent prefactor.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = invGamma;
   }

   // Pass 2: multiply by the x-dependent exponential.
   constexpr double ln2 = 0.693147180559945309417232121458;
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double twiceExponent = (ndof - 2) * fast_log(x[evt]) - x[evt] - ndof * ln2;
      batches._output[evt] *= fast_exp(0.5 * twiceExponent);
   }
}


/// Indicator function: output[i] is 1.0 where batches[0][i] equals exactly 1.0 and 0.0 elsewhere.
__rooglobal__ void computeDeltaFunction(BatchesHandle batches)
{
   Batch input = batches[0];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = (input[evt] == 1.0) ? 1.0 : 0.0;
   }
}


/// D*-D0 background shape. Inputs: batches[0] = dm, [1] = dm0, [2] = C, [3] = A, [4] = B.
/// output = (1 - exp((dm0 - dm) / C)) * exp(A * log(dm / dm0)) + B * (dm / dm0 - 1),
/// with negative results set to zero.
__rooglobal__ void computeDstD0BG(BatchesHandle batches)
{
   Batch dm = batches[0], dm0 = batches[1], cPar = batches[2], aPar = batches[3], bPar = batches[4];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double ratio = dm[evt] / dm0[evt];
      const double thresholdArg = (dm0[evt] - dm[evt]) / cPar[evt];
      const double powerArg = aPar[evt] * fast_log(ratio);
      batches._output[evt] = (1 - fast_exp(thresholdArg)) * fast_exp(powerArg) + bPar[evt] * (ratio - 1);
   }

   // Negative values are replaced by zero in a separate pass.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      if (batches._output[evt] < 0) {
         batches._output[evt] = 0;
      }
   }
}


/// Exponential: output[i] = exp(batches[0][i] * batches[1][i]).
__rooglobal__ void computeExponential(BatchesHandle batches)
{
   Batch xVal = batches[0], coef = batches[1];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double exponent = xVal[evt] * coef[evt];
      batches._output[evt] = fast_exp(exponent);
   }
}


/// Gamma distribution. Inputs: batches[0] = x, [1] = gamma, [2] = beta, [3] = mu.
/// For x != mu the output is exp(-lgamma(gamma) - arg + (gamma - 1) * log(arg)) / beta with
/// arg = (x - mu) / beta. For x == mu the output is 1 / beta if gamma == 1 and 0 / beta otherwise.
__rooglobal__ void computeGamma(BatchesHandle batches)
{
   Batch x = batches[0], gam = batches[1], beta = batches[2], mu = batches[3];
   // Used for every event when gamma is not a per-event vector.
   const double negLogGammaScalar = -std::lgamma(gam[0]);
   const size_t nEvents = batches.getNEvents();

   // Pass 1: final value for x == mu, otherwise the -lgamma(gamma) part of the exponent.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      if (x[evt] == mu[evt]) {
         batches._output[evt] = (gam[evt] == 1.0) / beta[evt];
      } else if (gam.isItVector()) {
         batches._output[evt] = -std::lgamma(gam[evt]);
      } else {
         batches._output[evt] = negLogGammaScalar;
      }
   }

   // Pass 2: complete the exponent and exponentiate; events with x == mu are already final.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      if (x[evt] == mu[evt]) {
         continue;
      }
      const double invBeta = 1 / beta[evt];
      const double reduced = (x[evt] - mu[evt]) * invBeta;

      double logPdf = batches._output[evt];
      logPdf -= reduced;
      logPdf += fast_log(reduced) * (gam[evt] - 1);

      batches._output[evt] = fast_exp(logPdf) * invBeta;
   }
}


/// Gaussian resolution model for an exponential basis.
/// Inputs: batches[0] = x, mean = batches[1] * batches[2], sigma = batches[3] * batches[4],
/// batches[5] = tau. The sign of extraArg(0) selects the basis side: negative, positive, or
/// (when it is zero) both.
__rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches)
{
   const double sqrt2 = std::sqrt(2.);
   const double sqrt2pi = std::sqrt(2. * std::atan2(0., -1.));

   const double basisSign = batches.extraArg(0);
   const bool isMinus = basisSign < 0.0;
   const bool isPlus = basisSign > 0.0;
   const bool isBothSides = !isMinus && !isPlus;

   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double x = batches[0][evt];
      const double mean = batches[1][evt] * batches[2][evt];
      const double sigma = batches[3][evt] * batches[4][evt];
      const double tau = batches[5][evt];

      if (tau == 0.0) {
         // Zero lifetime: plain Gaussian, counted twice when both sides are requested.
         const double pull = (x - mean) / sigma;
         double gauss = std::exp(-0.5 * pull * pull) / (sigma * sqrt2pi);
         if (isBothSides) {
            gauss *= 2;
         }
         batches._output[evt] = gauss;
         continue;
      }

      // Non-zero lifetime: convolution with exp(-t/tau).
      const double xprime = (x - mean) / tau;
      const double c = sigma / (sqrt2 * tau);
      const double u = xprime / (2 * c);

      double convolution = 0.0;
      if (!isMinus) {
         convolution += RooHeterogeneousMath::evalCerf(0, -u, c).real();
      }
      if (!isPlus) {
         convolution += RooHeterogeneousMath::evalCerf(0, u, c).real();
      }
      batches._output[evt] = convolution;
   }
}


/// Unnormalised Gaussian: output[i] = exp(-0.5 * (x - mean)^2 / sigma^2),
/// with x = batches[0], mean = batches[1] and sigma = batches[2].
__rooglobal__ void computeGaussian(BatchesHandle batches)
{
   auto xVal = batches[0];
   auto mu = batches[1];
   auto width = batches[2];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double diff = xVal[evt] - mu[evt];
      const double minusHalfOverVariance = -0.5 / (width[evt] * width[evt]);
      batches._output[evt] = fast_exp(diff * diff * minusHalfOverVariance);
   }
}


/// Copies batches[0] into the output unchanged.
__rooglobal__ void computeIdentity(BatchesHandle batches)
{
   Batch input = batches[0];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
      batches._output[evt] = input[evt];
}


/// output[i] = -log(batches[0][i]); when extraArg(0) is non-zero the result is additionally
/// multiplied by the weight batches[1][i].
__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches)
{
   const size_t nEvents = batches.getNEvents();
   const bool hasWeights = batches.extraArg(0);

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = -fast_log(batches[0][evt]);
   }

   // The weights, if any, are applied in a second pass.
   if (hasWeights) {
      for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
         batches._output[evt] *= batches[1][evt];
      }
   }
}


/// Johnson shape. Inputs: batches[0] = mass, [1] = mu, [2] = lambda, [3] = gamma, [4] = delta;
/// extraArg(0) is a mass threshold below which the output is multiplied by zero.
__rooglobal__ void computeJohnson(BatchesHandle batches)
{
   Batch x = batches[0], loc = batches[1], scale = batches[2], gam = batches[3], del = batches[4];
   const double sqrtTwoPi = std::sqrt(TMath::TwoPi());
   const double massThreshold = batches.extraArg(0);
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double arg = (x[evt] - loc[evt]) / scale[evt];

      // With VDT, asinh(arg) is written as log(arg + sqrt(arg^2 + 1)) using the fast helpers.
#ifdef R__HAS_VDT
      const double asinhArg = fast_log(arg + 1 / fast_isqrt(arg * arg + 1));
#else
      const double asinhArg = asinh(arg);
#endif

      const double expo = gam[evt] + del[evt] * asinhArg;
      const double pdf =
         del[evt] * fast_exp(-0.5 * expo * expo) * fast_isqrt(1. + arg * arg) / (sqrtTwoPi * scale[evt]);

      // 1.0 at or above the threshold, 0.0 below it.
      const double aboveThreshold = (x[evt] >= massThreshold) ? 1.0 : 0.0;
      batches._output[evt] = pdf * aboveThreshold;
   }
}


/* Actual computation of Landau(x,mean,sigma) in a vectorization-friendly way

 * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx)

 * and rewritten to enable vectorization.

 *
 * Inputs: batches[0] = X, batches[1] = M, batches[2] = S. The reduced variable (X - M) / S is
 * first stored in the output buffer; a second pass then replaces it with the value of one of
 * eight approximations (case0 ... case7), selected by the range the reduced variable falls in.
 * Events with S <= 0 get the value 0.

 */

__rooglobal__ void computeLandau(BatchesHandle batches)

{

   // case0: used for reduced values below -5.5.
   auto case0 = [](double x) {

      const double a1[3] = {0.04166666667, -0.01996527778, 0.02709538966};

      const double u = fast_exp(x + 1.0);

      return 0.3989422803 * fast_exp(-1 / u - 0.5 * (x + 1)) * (1 + (a1[0] + (a1[1] + a1[2] * u) * u) * u);

   };

   // case1: used for [-5.5, -1); exponential prefactor times a ratio of two quartic polynomials in x.
   auto case1 = [](double x) {

      constexpr double p1[5] = {0.4259894875, -0.1249762550, 0.03984243700, -0.006298287635, 0.001511162253};

      constexpr double q1[5] = {1.0, -0.3388260629, 0.09594393323, -0.01608042283, 0.003778942063};

      const double u = fast_exp(-x - 1);

      return fast_exp(-u - 0.5 * (x + 1)) * (p1[0] + (p1[1] + (p1[2] + (p1[3] + p1[4] * x) * x) * x) * x) /

             (q1[0] + (q1[1] + (q1[2] + (q1[3] + q1[4] * x) * x) * x) * x);

   };

   // case2: used for [-1, 1); ratio of two quartic polynomials in x.
   auto case2 = [](double x) {

      constexpr double p2[5] = {0.1788541609, 0.1173957403, 0.01488850518, -0.001394989411, 0.0001283617211};

      constexpr double q2[5] = {1.0, 0.7428795082, 0.3153932961, 0.06694219548, 0.008790609714};

      return (p2[0] + (p2[1] + (p2[2] + (p2[3] + p2[4] * x) * x) * x) * x) /

             (q2[0] + (q2[1] + (q2[2] + (q2[3] + q2[4] * x) * x) * x) * x);

   };

   // case3: used for [1, 5); ratio of two quartic polynomials in x.
   auto case3 = [](double x) {

      constexpr double p3[5] = {0.1788544503, 0.09359161662, 0.006325387654, 0.00006611667319, -0.000002031049101};

      constexpr double q3[5] = {1.0, 0.6097809921, 0.2560616665, 0.04746722384, 0.006957301675};

      return (p3[0] + (p3[1] + (p3[2] + (p3[3] + p3[4] * x) * x) * x) * x) /

             (q3[0] + (q3[1] + (q3[2] + (q3[3] + q3[4] * x) * x) * x) * x);

   };

   // case4: used for [5, 12); u^2 times a ratio of two quartic polynomials in u = 1/x.
   auto case4 = [](double x) {

      constexpr double p4[5] = {0.9874054407, 118.6723273, 849.2794360, -743.7792444, 427.0262186};

      constexpr double q4[5] = {1.0, 106.8615961, 337.6496214, 2016.712389, 1597.063511};

      const double u = 1 / x;

      return u * u * (p4[0] + (p4[1] + (p4[2] + (p4[3] + p4[4] * u) * u) * u) * u) /

             (q4[0] + (q4[1] + (q4[2] + (q4[3] + q4[4] * u) * u) * u) * u);

   };

   // case5: used for [12, 50); same form as case4 with different coefficients.
   auto case5 = [](double x) {

      constexpr double p5[5] = {1.003675074, 167.5702434, 4789.711289, 21217.86767, -22324.94910};

      constexpr double q5[5] = {1.0, 156.9424537, 3745.310488, 9834.698876, 66924.28357};

      const double u = 1 / x;

      return u * u * (p5[0] + (p5[1] + (p5[2] + (p5[3] + p5[4] * u) * u) * u) * u) /

             (q5[0] + (q5[1] + (q5[2] + (q5[3] + q5[4] * u) * u) * u) * u);

   };

   // case6: used for [50, 300); same form as case4 with different coefficients.
   auto case6 = [](double x) {

      constexpr double p6[5] = {1.000827619, 664.9143136, 62972.92665, 475554.6998, -5743609.109};

      constexpr double q6[5] = {1.0, 651.4101098, 56974.73333, 165917.4725, -2815759.939};

      const double u = 1 / x;

      return u * u * (p6[0] + (p6[1] + (p6[2] + (p6[3] + p6[4] * u) * u) * u) * u) /

             (q6[0] + (q6[1] + (q6[2] + (q6[3] + q6[4] * u) * u) * u) * u);

   };

   // case7: used for everything that did not match an earlier range (i.e. not below 300).
   auto case7 = [](double x) {

      const double a2[2] = {-1.845568670, -4.284640743};

      const double u = 1 / (x - x * fast_log(x) / (x + 1));

      return u * u * (1 + (a2[0] + a2[1] * u) * u);

   };


   Batch X = batches[0], M = batches[1], S = batches[2];


   // Pass 1: store the reduced variable in the output buffer.
   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      batches._output[i] = (X[i] - M[i]) / S[i];


   // Pass 2: replace the reduced variable by the approximation for its range.
   // The S <= 0 check comes first, so such events never reach any of the lambdas.
   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)

      if (S[i] <= 0.0)

         batches._output[i] = 0;

      else if (batches._output[i] < -5.5)

         batches._output[i] = case0(batches._output[i]);

      else if (batches._output[i] < -1.0)

         batches._output[i] = case1(batches._output[i]);

      else if (batches._output[i] < 1.0)

         batches._output[i] = case2(batches._output[i]);

      else if (batches._output[i] < 5.0)

         batches._output[i] = case3(batches._output[i]);

      else if (batches._output[i] < 12.0)

         batches._output[i] = case4(batches._output[i]);

      else if (batches._output[i] < 50.0)

         batches._output[i] = case5(batches._output[i]);

      else if (batches._output[i] < 300.)

         batches._output[i] = case6(batches._output[i]);

      else

         batches._output[i] = case7(batches._output[i]);

}


/// Log-normal shape. Inputs: batches[0] = x, batches[1] = m0, batches[2] = k.
/// output = exp(-0.5 * (log(x / m0) / |log k|)^2) / (x * |log k| * sqrt(2 pi)).
__rooglobal__ void computeLognormal(BatchesHandle batches)
{
   Batch x = batches[0], median = batches[1], kPar = batches[2];
   const double sqrtTwoPi = 2.506628274631000502415765284811;
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double logRatio = fast_log(x[evt] / median[evt]);

      // Work with the magnitude of log(k).
      double logK = fast_log(kPar[evt]);
      if (logK < 0) {
         logK = -logK;
      }

      const double scaled = logRatio / logK;
      const double exponent = scaled * (-0.5 * scaled);
      batches._output[evt] = fast_exp(exponent) / (x[evt] * logK * sqrtTwoPi);
   }
}


/// Divides the raw function values (batches[0]) by the normalisation values (batches[1]).
///
/// Problematic events are not divided. Instead they are classified, in this order:
///  - type 0: the normalisation is negative, or zero while the raw value is non-zero.
///            The output is a NaN packed with -norm (plus -raw if the raw value is negative).
///  - type 1: the raw value is negative. The output is a NaN packed with -raw.
///  - type 2: the raw value is NaN. The output is that NaN.
/// A raw value of zero with a zero normalisation yields 0.
///
/// The number of events of each type seen by this call is added to extraArg(0), extraArg(1)
/// and extraArg(2) respectively.
__rooglobal__ void computeNormalizedPdf(BatchesHandle batches)
{
   auto rawVal = batches[0];
   auto normVal = batches[1];

   int nEvalErrorsType0 = 0;
   int nEvalErrorsType1 = 0;
   int nEvalErrorsType2 = 0;

   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
      double out = 0.0;
      if (normVal[i] < 0. || (normVal[i] == 0. && rawVal[i] != 0)) {
         // Unreasonable normalisations. A zero integral can be tolerated if the function vanishes, though.
         out = RooNaNPacker::packFloatIntoNaN(-normVal[i] + (rawVal[i] < 0. ? -rawVal[i] : 0.));
         nEvalErrorsType0++;
      } else if (rawVal[i] < 0.) {
         // The pdf value is less than zero.
         out = RooNaNPacker::packFloatIntoNaN(-rawVal[i]);
         nEvalErrorsType1++;
      } else if (std::isnan(rawVal[i])) {
         // The pdf value is Not-a-Number.
         out = rawVal[i];
         nEvalErrorsType2++;
      } else {
         out = (rawVal[i] == 0. && normVal[i] == 0.) ? 0. : rawVal[i] / normVal[i];
      }
      batches._output[i] = out;
   }

   // Report every error category that occurred at least once. The thresholds must all be 0:
   // comparing the type-1 and type-2 counters against 1 and 2 silently dropped a single
   // type-1 error and up to two type-2 errors.
   if (nEvalErrorsType0 > 0)
      batches.setExtraArg(0, batches.extraArg(0) + nEvalErrorsType0);
   if (nEvalErrorsType1 > 0)
      batches.setExtraArg(1, batches.extraArg(1) + nEvalErrorsType1);
   if (nEvalErrorsType2 > 0)
      batches.setExtraArg(2, batches.extraArg(2) + nEvalErrorsType2);
}


/* Novosibirsk shape. Inputs: batches[0] = x, [1] = peak, [2] = width, [3] = tail.
 *
 * TMath::ASinH(a) is replaced with ln(a + sqrt(a^2 + 1)):
 *   asinhArg   -> the argument of the asinh
 *   asinhLnArg -> the argument of the logarithm that replaces the asinh
 *   asinhVal   -> the value the asinh evaluates to
 *
 * logVal is the logarithm that was already present in the formula
 * before the asinh replacement, and logArg is its argument.
 */
__rooglobal__ void computeNovosibirsk(BatchesHandle batches)
{
   Batch x = batches[0], peak = batches[1], width = batches[2], tail = batches[3];
   constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) )
   const size_t nEvents = batches.getNEvents();

   // Pass 1: store the exponent in the output buffer.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double asinhArg = 0.5 * xi * tail[evt];
      const double asinhLnArg = asinhArg + 1 / fast_isqrt(asinhArg * asinhArg + 1);
      const double asinhVal = fast_log(asinhLnArg);

      const double logArg = 1 - (x[evt] - peak[evt]) * tail[evt] / width[evt];
      const double logVal = fast_log(logArg);

      const double ratio = logVal / asinhVal;
      double exponent = ratio * (-0.125 * xi * xi * ratio);
      exponent -= 2.0 / xi / xi * asinhVal * asinhVal;
      batches._output[evt] = exponent;
   }

   // Pass 2: the exponentiation is kept in a separate loop because that was found to be faster.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      batches._output[evt] = fast_exp(batches._output[evt]);
   }
}


/// Poisson probability for x = batches[0] and mean = batches[1].
/// extraArg(0) non-zero: events with a negative mean get the fixed value 1.E-3.
/// extraArg(1) non-zero: x is used as is; otherwise it is rounded down with floor().
__rooglobal__ void computePoisson(BatchesHandle batches)
{
   Batch x = batches[0], mean = batches[1];
   const bool protectNegative = batches.extraArg(0);
   const bool noRounding = batches.extraArg(1);
   const size_t nEvents = batches.getNEvents();

   // Pass 1: log(x!) via lgamma(x + 1), stored in the output buffer.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double count = noRounding ? x[evt] : floor(x[evt]);
      batches._output[evt] = std::lgamma(count + 1.);
   }

   // Pass 2: exp(x * log(mean) - mean - log(x!)), followed by the special cases.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double count = noRounding ? x[evt] : floor(x[evt]);
      const double logMean = fast_log(mean[evt]);
      const double logPoisson = count * logMean - mean[evt] - batches._output[evt];

      double prob = fast_exp(logPoisson);

      // Negative counts give zero; a count of exactly zero uses 1 / exp(mean) directly.
      if (count < 0) {
         prob = 0;
      } else if (count == 0) {
         prob = 1 / fast_exp(mean[evt]);
      }

      // A negative mean overrides everything above when protection is switched on.
      if (protectNegative && mean[evt] < 0) {
         prob = 1.E-3;
      }

      batches._output[evt] = prob;
   }
}


/// Polynomial evaluated with Horner's scheme.
/// extraArg(0) is the number of coefficients nCoef; batches[0 .. nCoef-1] hold the coefficients
/// (lowest order first) and batches[nCoef] holds x.
__rooglobal__ void computePolynomial(BatchesHandle batches)
{
   const int nCoef = batches.extraArg(0);
   const std::size_t nEvents = batches.getNEvents();
   Batch xVal = batches[nCoef];

   // Start from the highest-order coefficient ...
   Batch highest = batches[nCoef - 1];
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
      batches._output[evt] = highest[evt];

   // ... and fold in the remaining ones from order nCoef-2 down to 0.
   int order = nCoef - 1;
   while (--order >= 0) {
      for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
         batches._output[evt] = batches[order][evt] + xVal[evt] * batches._output[evt];
   }
}


/// Product of the first extraArg(0) input batches: output[i] = prod_k batches[k][i].
/// With zero factors the output is 1.
__rooglobal__ void computeProdPdf(BatchesHandle batches)
{
   const int nFactors = batches.extraArg(0);
   const size_t nEvents = batches.getNEvents();

   // Neutral element of the product.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
      batches._output[evt] = 1.;

   // One full pass over the events per factor.
   for (int factor = 0; factor < nFactors; ++factor)
      for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
         batches._output[evt] *= batches[factor][evt];
}


/// Element-wise ratio: output[i] = batches[0][i] / batches[1][i].
__rooglobal__ void computeRatio(BatchesHandle batches)
{
   Batch numerator = batches[0], denominator = batches[1];
   const size_t nEvents = batches.getNEvents();

   for (size_t evt = BEGIN; evt < nEvents; evt += STEP)
      batches._output[evt] = numerator[evt] / denominator[evt];
}


/// Truth model, exponential basis: output = exp(-|x| / batches[1]) with x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelExpBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      batches._output[evt] = fast_exp(-std::abs(t) / batches[1][evt]);
   }
}


/// Truth model, sine basis: output = exp(-|x| / batches[1]) * sin(x * batches[2]) with x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelSinBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      batches._output[evt] = fast_exp(-std::abs(t) / batches[1][evt]) * fast_sin(t * batches[2][evt]);
   }
}


/// Truth model, cosine basis: output = exp(-|x| / batches[1]) * cos(x * batches[2]) with x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelCosBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      batches._output[evt] = fast_exp(-std::abs(t) / batches[1][evt]) * fast_cos(t * batches[2][evt]);
   }
}


/// Truth model, linear basis: output = exp(-s) * s with s = |x| / batches[1] and x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelLinBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      const double scaledTime = std::abs(t) / batches[1][evt];
      batches._output[evt] = fast_exp(-scaledTime) * scaledTime;
   }
}


/// Truth model, quadratic basis: output = exp(-s) * s * s with s = |x| / batches[1] and x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelQuadBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      const double scaledTime = std::abs(t) / batches[1][evt];
      batches._output[evt] = fast_exp(-scaledTime) * scaledTime * scaledTime;
   }
}


/// Truth model, sinh basis: output = exp(-|x| / batches[1]) * sinh(x * batches[2] * 0.5) with x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelSinhBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      batches._output[evt] = fast_exp(-std::abs(t) / batches[1][evt]) * sinh(t * batches[2][evt] * 0.5);
   }
}


/// Truth model, cosh basis: output = exp(-|x| / batches[1]) * cosh(x * batches[2] * 0.5) with x = batches[0].
/// A negative extraArg(0) rejects x > 0 and a positive one rejects x < 0; rejected events give 0.
__rooglobal__ void computeTruthModelCoshBasis(BatchesHandle batches)
{
   const double basisSign = batches.extraArg(0);
   const bool onlyNegative = basisSign < 0.0;
   const bool onlyPositive = basisSign > 0.0;
   const std::size_t nEvents = batches.getNEvents();

   for (std::size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double t = batches[0][evt];

      // Events on the wrong side of zero do not contribute.
      if ((onlyNegative && t > 0.0) || (onlyPositive && t < 0.0)) {
         batches._output[evt] = 0.0;
         continue;
      }

      batches._output[evt] = fast_exp(-std::abs(t) / batches[1][evt]) * cosh(t * batches[2][evt] * .5);
   }
}


/// Voigtian shape. Inputs: batches[0] = x, [1] = mean, [2] = width, [3] = sigma.
/// Degenerate cases are handled directly: sigma == width == 0 gives 1, sigma == 0 gives a
/// Breit-Wigner, width == 0 gives a Gaussian. Otherwise the result is
/// c * Re(faddeeva(z)) with c = |1 / (sqrt(2) * sigma)| and z = (c * (x - mean), c * |width| / 2).
__rooglobal__ void computeVoigtian(BatchesHandle batches)
{
   Batch x = batches[0], mean = batches[1], width = batches[2], sigma = batches[3];
   const double invSqrt2 = 0.707106781186547524400844362105;
   const size_t nEvents = batches.getNEvents();

   // Pass 1: final values for the degenerate cases, 1 / (sqrt(2) * sigma) for the general one.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      const double diffSq = (x[evt] - mean[evt]) * (x[evt] - mean[evt]);
      const bool zeroSigma = sigma[evt] == 0.0;
      const bool zeroWidth = width[evt] == 0.0;

      if (zeroSigma) {
         batches._output[evt] = zeroWidth ? 1.0 : 1 / (diffSq + 0.25 * width[evt] * width[evt]);
      } else if (zeroWidth) {
         batches._output[evt] = fast_exp(-0.5 * diffSq / (sigma[evt] * sigma[evt]));
      } else {
         batches._output[evt] = invSqrt2 / sigma[evt];
      }
   }

   // Pass 2: the general case only; the degenerate ones are already final.
   for (size_t evt = BEGIN; evt < nEvents; evt += STEP) {
      if (sigma[evt] == 0.0 || width[evt] == 0.0) {
         continue;
      }

      // Use the magnitude of the coefficient stored in pass 1.
      double coef = batches._output[evt];
      if (coef < 0) {
         coef = -coef;
      }

      // The sign of the factor makes the imaginary part use the magnitude of the width.
      const double factor = width[evt] > 0.0 ? 0.5 : -0.5;
      RooHeterogeneousMath::STD::complex<double> z(coef * (x[evt] - mean[evt]), factor * coef * width[evt]);
      batches._output[evt] = coef * RooHeterogeneousMath::faddeeva(z).real();
   }
}


/// Returns a std::vector of pointers to the compute functions in this file.

std::vector<void (*)(BatchesHandle)> getFunctions()

{

   return {computeAddPdf,

           computeArgusBG,

           computeBMixDecay,

           computeBernstein,

           computeBifurGauss,

           computeBreitWigner,

           computeBukin,

           computeCBShape,

           computeChebychev,

           computeChiSquare,

           computeDeltaFunction,

           computeDstD0BG,

           computeExponential,

           computeGamma,

           computeGaussModelExpBasis,

           computeGaussian,

           computeIdentity,

           computeJohnson,

           computeLandau,

           computeLognormal,

           computeNegativeLogarithms,

           computeNormalizedPdf,

           computeNovosibirsk,

           computePoisson,

           computePolynomial,

           computeProdPdf,

           computeRatio,

           computeTruthModelExpBasis,

           computeTruthModelSinBasis,

           computeTruthModelCosBasis,

           computeTruthModelLinBasis,

           computeTruthModelQuadBasis,

           computeTruthModelSinhBasis,

           computeTruthModelCoshBasis,

           computeVoigtian};

}

} // End namespace RF_ARCH

} // End namespace RooBatchCompute

Batches.h

STEP
#define STEP
Definition ComputeFunctions.cxx:40

BEGIN
#define BEGIN
Definition ComputeFunctions.cxx:39

c
#define c(i)
Definition RSha256.hxx:101

__rooglobal__
#define __rooglobal__
Definition RooBatchComputeTypes.h:27

RooBatchCompute.h

RooHeterogeneousMath.h

RooNaNPacker.h

RooVDTHeaders.h

N
#define N

p
winID h TVirtualViewer3D TVirtualGLPainter p
Definition TGWin32VirtualGLProxy.cxx:51

result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Definition TGWin32VirtualXProxy.cxx:174

x2
Option_t Option_t TPoint TPoint const char x2
Definition TGWin32VirtualXProxy.cxx:70

x1
Option_t Option_t TPoint TPoint const char x1
Definition TGWin32VirtualXProxy.cxx:70

xmin
float xmin
Definition THbookFile.cxx:95

xmax
float xmax
Definition THbookFile.cxx:95

TMath.h

RooBatchCompute::RF_ARCH::Batch
Definition Batches.h:42

RooBatchCompute::RF_ARCH::Batches
Definition Batches.h:69

RooBatchCompute::RF_ARCH::Batches::getNEvents
__roodevice__ std::size_t getNEvents() const
Definition Batches.h:99

RooBatchCompute::RF_ARCH::Batches::getNExtraArgs
__roodevice__ std::size_t getNExtraArgs() const
Definition Batches.h:100

RooBatchCompute::RF_ARCH::Batches::setExtraArg
__roodevice__ void setExtraArg(std::size_t i, double val)
Definition Batches.h:102

RooBatchCompute::RF_ARCH::Batches::extraArg
__roodevice__ double extraArg(std::size_t i) const
Definition Batches.h:101

RooBatchCompute::RF_ARCH::Batches::_output
RestrictArr _output
Definition Batches.h:84

sigma
const Double_t sigma
Definition h1analysisProxy.h:11

y
Double_t y[n]
Definition legend1.C:17

x
Double_t x[n]
Definition legend1.C:17

G
#define G(x, y, z)

RooBatchCompute::RF_ARCH::computeTruthModelCosBasis
__rooglobal__ void computeTruthModelCosBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:688

RooBatchCompute::RF_ARCH::computeExponential
__rooglobal__ void computeExponential(BatchesHandle batches)
Definition ComputeFunctions.cxx:312

RooBatchCompute::RF_ARCH::computeDstD0BG
__rooglobal__ void computeDstD0BG(BatchesHandle batches)
Definition ComputeFunctions.cxx:297

RooBatchCompute::RF_ARCH::computeLandau
__rooglobal__ void computeLandau(BatchesHandle batches)
Definition ComputeFunctions.cxx:436

RooBatchCompute::RF_ARCH::computeArgusBG
__rooglobal__ void computeArgusBG(BatchesHandle batches)
Definition ComputeFunctions.cxx:56

RooBatchCompute::RF_ARCH::computeBukin
__rooglobal__ void computeBukin(BatchesHandle batches)
Definition ComputeFunctions.cxx:177

RooBatchCompute::RF_ARCH::computeNovosibirsk
__rooglobal__ void computeNovosibirsk(BatchesHandle batches)
Definition ComputeFunctions.cxx:576

RooBatchCompute::RF_ARCH::computeNormalizedPdf
__rooglobal__ void computeNormalizedPdf(BatchesHandle batches)
Definition ComputeFunctions.cxx:530

RooBatchCompute::RF_ARCH::computePolynomial
__rooglobal__ void computePolynomial(BatchesHandle batches)
Definition ComputeFunctions.cxx:624

RooBatchCompute::RF_ARCH::computeTruthModelSinhBasis
__rooglobal__ void computeTruthModelSinhBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:734

RooBatchCompute::RF_ARCH::computePoisson
__rooglobal__ void computePoisson(BatchesHandle batches)
Definition ComputeFunctions.cxx:597

RooBatchCompute::RF_ARCH::computeTruthModelCoshBasis
__rooglobal__ void computeTruthModelCoshBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:746

RooBatchCompute::RF_ARCH::computeBifurGauss
__rooglobal__ void computeBifurGauss(BatchesHandle batches)
Definition ComputeFunctions.cxx:155

RooBatchCompute::RF_ARCH::computeTruthModelSinBasis
__rooglobal__ void computeTruthModelSinBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:676

RooBatchCompute::RF_ARCH::computeRatio
__rooglobal__ void computeRatio(BatchesHandle batches)
Definition ComputeFunctions.cxx:656

RooBatchCompute::RF_ARCH::computeAddPdf
__rooglobal__ void computeAddPdf(BatchesHandle batches)
Definition ComputeFunctions.cxx:46

RooBatchCompute::RF_ARCH::computeTruthModelLinBasis
__rooglobal__ void computeTruthModelLinBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:700

RooBatchCompute::RF_ARCH::computeChiSquare
__rooglobal__ void computeChiSquare(BatchesHandle batches)
Definition ComputeFunctions.cxx:275

RooBatchCompute::RF_ARCH::computeTruthModelQuadBasis
__rooglobal__ void computeTruthModelQuadBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:717

RooBatchCompute::RF_ARCH::computeDeltaFunction
__rooglobal__ void computeDeltaFunction(BatchesHandle batches)
Definition ComputeFunctions.cxx:290

RooBatchCompute::RF_ARCH::computeChebychev
__rooglobal__ void computeChebychev(BatchesHandle batches)
Definition ComputeFunctions.cxx:235

RooBatchCompute::RF_ARCH::computeIdentity
__rooglobal__ void computeIdentity(BatchesHandle batches)
Definition ComputeFunctions.cxx:393

RooBatchCompute::RF_ARCH::computeLognormal
__rooglobal__ void computeLognormal(BatchesHandle batches)
Definition ComputeFunctions.cxx:515

RooBatchCompute::RF_ARCH::computeGaussModelExpBasis
__rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:343

RooBatchCompute::RF_ARCH::computeVoigtian
__rooglobal__ void computeVoigtian(BatchesHandle batches)
Definition ComputeFunctions.cxx:758

RooBatchCompute::RF_ARCH::computeTruthModelExpBasis
__rooglobal__ void computeTruthModelExpBasis(BatchesHandle batches)
Definition ComputeFunctions.cxx:663

RooBatchCompute::RF_ARCH::computeGaussian
__rooglobal__ void computeGaussian(BatchesHandle batches)
Definition ComputeFunctions.cxx:381

RooBatchCompute::RF_ARCH::computeBernstein
__rooglobal__ void computeBernstein(BatchesHandle batches)
Definition ComputeFunctions.cxx:87

RooBatchCompute::RF_ARCH::computeNegativeLogarithms
__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches)
Definition ComputeFunctions.cxx:400

RooBatchCompute::RF_ARCH::computeGamma
__rooglobal__ void computeGamma(BatchesHandle batches)
Definition ComputeFunctions.cxx:319

RooBatchCompute::RF_ARCH::getFunctions
std::vector< void(*)(BatchesHandle)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
Definition ComputeFunctions.cxx:787

RooBatchCompute::RF_ARCH::BatchesHandle
Batches & BatchesHandle
Definition Batches.h:117

RooBatchCompute::RF_ARCH::computeJohnson
__rooglobal__ void computeJohnson(BatchesHandle batches)
Definition ComputeFunctions.cxx:410

RooBatchCompute::RF_ARCH::computeBreitWigner
__rooglobal__ void computeBreitWigner(BatchesHandle batches)
Definition ComputeFunctions.cxx:168

RooBatchCompute::RF_ARCH::computeProdPdf
__rooglobal__ void computeProdPdf(BatchesHandle batches)
Definition ComputeFunctions.cxx:643

RooBatchCompute::RF_ARCH::computeCBShape
__rooglobal__ void computeCBShape(BatchesHandle batches)
Definition ComputeFunctions.cxx:217

RooBatchCompute::RF_ARCH::computeBMixDecay
__rooglobal__ void computeBMixDecay(BatchesHandle batches)
Definition ComputeFunctions.cxx:72

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition BracketAdapters.h:24

RooBatchCompute::fast_exp
__roodevice__ double fast_exp(double x)
Definition RooVDTHeaders.h:68

RooBatchCompute::fast_sin
__roodevice__ double fast_sin(double x)
Definition RooVDTHeaders.h:73

RooBatchCompute::bufferSize
constexpr std::size_t bufferSize
Definition Batches.h:38

RooBatchCompute::fast_log
__roodevice__ double fast_log(double x)
Definition RooVDTHeaders.h:83

RooBatchCompute::fast_cos
__roodevice__ double fast_cos(double x)
Definition RooVDTHeaders.h:78

RooBatchCompute::fast_isqrt
__roodevice__ double fast_isqrt(double x)
Definition RooVDTHeaders.h:88

RooHeterogeneousMath::faddeeva
__roodevice__ __roohost__ STD::complex< double > faddeeva(STD::complex< double > z)
Definition RooHeterogeneousMath.h:540

RooHeterogeneousMath::evalCerf
__roohost__ __roodevice__ STD::complex< double > evalCerf(double swt, double u, double c)
Definition RooHeterogeneousMath.h:573

TMath::TwoPi
constexpr Double_t TwoPi()
Definition TMath.h:44

R1
#define R1(v, w, x, y, z, i)
Definition sha1.inl:134

R2
#define R2(v, w, x, y, z, i)
Definition sha1.inl:137

RooNaNPacker::packFloatIntoNaN
__roodevice__ static __roohost__ double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
Definition RooNaNPacker.h:109

m
TMarker m
Definition textangle.C:8