doc/v630/RooBatchCompute_8h_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN 6 January 2021

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


#ifndef ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H

#define ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H


#include <ROOT/RSpan.hxx>


#include <RConfig.h>


#ifdef ROOFIT_CUDA

#include <RooFit/Detail/CudaInterface.h>

#endif


#include <DllImport.h> //for R__EXTERN, needed for windows


#include <cassert>

#include <functional>

#include <string>

#include <vector>


/**

 * Namespace for dispatching RooFit computations to various backends.

 *

 * This namespace contains an interface for providing high-performance computation functions for use in

 * RooAbsReal::computeBatch(), see RooBatchComputeInterface.

 *

 * Furthermore, several implementations of this interface can be created, which reside in RooBatchCompute::RF_ARCH,

 * where RF_ARCH may be replaced by the architecture that this implementation targets, e.g. SSE, AVX, etc.

 *

 * Using the pointers RooBatchCompute::dispatchCPU and RooBatchCompute::dispatchCUDA, a computation request can be

 * dispatched to the fastest backend that is available on a specific platform.

 */

namespace RooBatchCompute {


/// Collection of read-only views onto double arrays, passed as the `vars` argument of compute().
using VarVector = std::vector<std::span<const double>>;

/// Plain list of doubles, passed as the `extraArgs` argument of compute().
using ArgVector = std::vector<double>;

/// Writable array of doubles carrying the compiler-specific `__restrict` qualifier.
using RestrictArr = double *__restrict;

/// Read-only array of doubles carrying the compiler-specific `__restrict` qualifier.
using InputArr = const double *__restrict;


/// Inspect hardware capabilities, and load the optimal library for RooFit computations.
/// The free functions declared further down in this header call init() before they
/// use the dispatchCPU / dispatchCUDA pointers.
void init();


/// Minimal configuration object to steer the evaluation of a single node with
/// the RooBatchCompute library.
///
/// With ROOFIT_CUDA defined, the object stores a pointer to a CUDA stream and
/// useCuda() reports whether that pointer is non-null. Without ROOFIT_CUDA the
/// object holds no state and useCuda() always returns false.
class Config {
#ifdef ROOFIT_CUDA
   // Stream pointer handed in through setCudaStream(); this class never deletes it.
   RooFit::Detail::CudaInterface::CudaStream *_cudaStream = nullptr;
#endif

public:
#ifdef ROOFIT_CUDA
   /// True if a non-null CUDA stream has been set.
   bool useCuda() const { return nullptr != _cudaStream; }

   /// Store the stream pointer; passing nullptr makes useCuda() return false again.
   void setCudaStream(RooFit::Detail::CudaInterface::CudaStream *cudaStream) { _cudaStream = cudaStream; }

   /// The stream pointer stored last, or nullptr if none was set.
   RooFit::Detail::CudaInterface::CudaStream *cudaStream() const { return _cudaStream; }
#else
   /// Built without ROOFIT_CUDA: CUDA is never requested.
   bool useCuda() const { return false; }
#endif
};


/// Architectures that a backend can report through RooBatchComputeInterface::architecture().
/// The enumerators keep their implicit values (0 for AVX512 up to 5 for CUDA).
enum class Architecture {
   AVX512,
   AVX2,
   AVX,
   SSE4,
   GENERIC,
   CUDA
};


/// Identifiers of the computations that can be requested through compute().
///
/// This is an unscoped enum whose enumerators take consecutive implicit values
/// starting at 0, in the order listed (AddPdf == 0 ... Voigtian == 38).
/// NOTE(review): the backends presumably rely on these numeric values, so confirm
/// before inserting or reordering entries.
enum Computer {
   AddPdf, ArgusBG,
   BMixDecay, Bernstein, BifurGauss, BreitWigner, Bukin,
   CBShape, Chebychev, ChiSquare,
   DeltaFunction, DstD0BG,
   ExpPoly, Exponential, ExponentialNeg,
   Gamma, GaussModelExpBasis, Gaussian,
   Identity,
   Johnson,
   Landau, Lognormal, LognormalStandard,
   NegativeLogarithms, NormalizedPdf, Novosibirsk,
   Poisson, Polynomial, Power, ProdPdf,
   Ratio,
   TruthModelExpBasis, TruthModelSinBasis, TruthModelCosBasis, TruthModelLinBasis,
   TruthModelQuadBasis, TruthModelSinhBasis, TruthModelCoshBasis,
   Voigtian
};


/// Result bundle returned by reduceNLL(): two floating-point accumulators and
/// three counters, all of which start at zero.
struct ReduceNLLOutput {
   double nllSum{0.0};
   // NOTE(review): presumably the compensation term accompanying nllSum - confirm against the backends.
   double nllSumCarry{0.0};
   std::size_t nLargeValues{0};
   std::size_t nNonPositiveValues{0};
   std::size_t nNaNValues{0};
};


/**

 * \class RooBatchComputeInterface

 * \ingroup Roobatchcompute

 * \brief The interface which should be implemented to provide optimised computation functions for implementations of

 * RooAbsReal::computeBatch().

 *

 * The class RooBatchComputeInterface provides the mechanism for external modules (like RooFit) to call

 * functions from the library. The power lies in the virtual functions that can resolve to different

 * implementations for the functionality; for example, calling a function through dispatchCUDA

 * will resolve to efficient CUDA implementations.

 *

 * This interface contains the signatures of the compute functions of every PDF that has an optimised implementation

 * available. These are the functions that perform the actual computations in batches.

 *

 * Several implementations of this interface may be provided, e.g. SSE, AVX, AVX2 etc. At run time, the fastest

 * implementation of this interface is selected, and using a virtual call, the computation is dispatched to the best

 * backend.

 *

 * \see RooBatchCompute::dispatchCPU, RooBatchCompute::dispatchCUDA, RooBatchComputeClass, RF_ARCH

 */

class RooBatchComputeInterface {

public:

   virtual ~RooBatchComputeInterface() = default;

   virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, const VarVector &, ArgVector &) = 0;

   inline void compute(Config const &cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars)

   {

      ArgVector extraArgs{};

      compute(cfg, comp, output, size, vars, extraArgs);

   }


   virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0;

   virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,

                                     std::span<const double> offsetProbas) = 0;


   virtual Architecture architecture() const = 0;

   virtual std::string architectureName() const = 0;

};


/**

 * These dispatch pointers each point to an implementation of the compute library, provided one has been loaded.

 * Using a virtual call, computation requests are dispatched to backends with architecture-specific functions

 * such as SSE, AVX, AVX2, etc.

 *

 * \see RooBatchComputeInterface, RooBatchComputeClass, RF_ARCH

 */

R__EXTERN RooBatchComputeInterface *dispatchCPU;
// Pointer to the CUDA implementation; hasCuda() reports whether it is non-null after init().
R__EXTERN RooBatchComputeInterface *dispatchCUDA;


/// Architecture reported by the CPU implementation behind dispatchCPU.
/// Calls init() first.
inline Architecture cpuArchitecture()
{
   init();
   RooBatchComputeInterface &cpuBackend = *dispatchCPU;
   return cpuBackend.architecture();
}


inline std::string cpuArchitectureName()

{

   init();

   return dispatchCPU->architectureName();

}


/// Whether a CUDA implementation is available, i.e. dispatchCUDA is non-null
/// after init() has run.
inline bool hasCuda()
{
   init();
   return dispatchCUDA != nullptr;
}


/// Run the computation `comp` on the backend selected by `cfg`:
/// dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise.
/// Calls init() first; all arguments are forwarded unchanged to the backend.
inline void
compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars, ArgVector &extraArgs)
{
   init();

   RooBatchComputeInterface *backend = nullptr;
   if (cfg.useCuda()) {
      backend = dispatchCUDA;
   } else {
      backend = dispatchCPU;
   }

   backend->compute(cfg, comp, output, size, vars, extraArgs);
}


/// Overload of compute() for computations without extra arguments:
/// supplies an empty ArgVector and delegates to the six-argument overload.
inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
{
   ArgVector noExtraArgs;
   compute(cfg, comp, output, size, vars, noExtraArgs);
}


/// Forward a reduceSum request over the `n` values at `input` to the backend
/// selected by `cfg` (dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise).
/// Calls init() first and returns whatever the backend returns.
inline double reduceSum(Config cfg, InputArr input, size_t n)
{
   init();

   RooBatchComputeInterface *backend = dispatchCPU;
   if (cfg.useCuda()) {
      backend = dispatchCUDA;
   }

   const double total = backend->reduceSum(cfg, input, n);
   return total;
}


/// Forward a reduceNLL request to the backend selected by `cfg`
/// (dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise).
/// Calls init() first; the three spans are passed through unchanged.
inline ReduceNLLOutput reduceNLL(Config cfg, std::span<const double> probas, std::span<const double> weights,
                                 std::span<const double> offsetProbas)
{
   init();

   const bool onGpu = cfg.useCuda();
   RooBatchComputeInterface *backend = onGpu ? dispatchCUDA : dispatchCPU;

   return backend->reduceNLL(cfg, probas, weights, offsetProbas);
}


} // End namespace RooBatchCompute


#endif

CudaInterface.h

DllImport.h

R__EXTERN
#define R__EXTERN
Definition DllImport.h:27

RSpan.hxx

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Definition TGWin32VirtualXProxy.cxx:142

RooBatchCompute::Config
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
Definition RooBatchCompute.h:54

RooBatchCompute::Config::useCuda
bool useCuda() const
Definition RooBatchCompute.h:64

RooBatchCompute::RooBatchComputeInterface
The interface which should be implemented to provide optimised computation functions for implementati...
Definition RooBatchCompute.h:140

RooBatchCompute::RooBatchComputeInterface::reduceSum
virtual double reduceSum(Config const &cfg, InputArr input, size_t n)=0

RooBatchCompute::RooBatchComputeInterface::compute
void compute(Config const &cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
Definition RooBatchCompute.h:144

RooBatchCompute::RooBatchComputeInterface::architectureName
virtual std::string architectureName() const =0

RooBatchCompute::RooBatchComputeInterface::compute
virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, const VarVector &, ArgVector &)=0

RooBatchCompute::RooBatchComputeInterface::architecture
virtual Architecture architecture() const =0

RooBatchCompute::RooBatchComputeInterface::reduceNLL
virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)=0

RooBatchCompute::RooBatchComputeInterface::~RooBatchComputeInterface
virtual ~RooBatchComputeInterface()=default

RooFit::Detail::CudaInterface::CudaStream
Definition CudaInterface.h:58

n
const Int_t n
Definition legend1.C:16

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition RooBatchCompute.h:43

RooBatchCompute::dispatchCUDA
R__EXTERN RooBatchComputeInterface * dispatchCUDA
Definition RooBatchCompute.h:165

RooBatchCompute::Architecture
Architecture
Definition RooBatchCompute.h:68

RooBatchCompute::Architecture::SSE4
@ SSE4

RooBatchCompute::Architecture::AVX
@ AVX

RooBatchCompute::Architecture::AVX512
@ AVX512

RooBatchCompute::Architecture::CUDA
@ CUDA

RooBatchCompute::Architecture::GENERIC
@ GENERIC

RooBatchCompute::Architecture::AVX2
@ AVX2

RooBatchCompute::cpuArchitectureName
std::string cpuArchitectureName()
Definition RooBatchCompute.h:173

RooBatchCompute::hasCuda
bool hasCuda()
Definition RooBatchCompute.h:179

RooBatchCompute::dispatchCPU
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Definition RooBatchCompute.h:165

RooBatchCompute::VarVector
std::vector< std::span< const double > > VarVector
Definition RooBatchCompute.h:45

RooBatchCompute::reduceSum
double reduceSum(Config cfg, InputArr input, size_t n)
Definition RooBatchCompute.h:199

RooBatchCompute::reduceNLL
ReduceNLLOutput reduceNLL(Config cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)
Definition RooBatchCompute.h:206

RooBatchCompute::cpuArchitecture
Architecture cpuArchitecture()
Definition RooBatchCompute.h:167

RooBatchCompute::InputArr
const double *__restrict InputArr
Definition RooBatchCompute.h:48

RooBatchCompute::init
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
Definition Initialisation.cxx:44

RooBatchCompute::ArgVector
std::vector< double > ArgVector
Definition RooBatchCompute.h:46

RooBatchCompute::compute
void compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars, ArgVector &extraArgs)
Definition RooBatchCompute.h:186

RooBatchCompute::RestrictArr
double *__restrict RestrictArr
Definition RooBatchCompute.h:47

RooBatchCompute::Computer
Computer
Definition RooBatchCompute.h:70

RooBatchCompute::Landau
@ Landau
Definition RooBatchCompute.h:91

RooBatchCompute::Bernstein
@ Bernstein
Definition RooBatchCompute.h:74

RooBatchCompute::ChiSquare
@ ChiSquare
Definition RooBatchCompute.h:80

RooBatchCompute::ExponentialNeg
@ ExponentialNeg
Definition RooBatchCompute.h:85

RooBatchCompute::TruthModelSinhBasis
@ TruthModelSinhBasis
Definition RooBatchCompute.h:107

RooBatchCompute::DeltaFunction
@ DeltaFunction
Definition RooBatchCompute.h:81

RooBatchCompute::TruthModelCosBasis
@ TruthModelCosBasis
Definition RooBatchCompute.h:104

RooBatchCompute::TruthModelQuadBasis
@ TruthModelQuadBasis
Definition RooBatchCompute.h:106

RooBatchCompute::NegativeLogarithms
@ NegativeLogarithms
Definition RooBatchCompute.h:94

RooBatchCompute::Exponential
@ Exponential
Definition RooBatchCompute.h:84

RooBatchCompute::Voigtian
@ Voigtian
Definition RooBatchCompute.h:109

RooBatchCompute::Power
@ Power
Definition RooBatchCompute.h:99

RooBatchCompute::AddPdf
@ AddPdf
Definition RooBatchCompute.h:71

RooBatchCompute::Gamma
@ Gamma
Definition RooBatchCompute.h:86

RooBatchCompute::LognormalStandard
@ LognormalStandard
Definition RooBatchCompute.h:93

RooBatchCompute::BMixDecay
@ BMixDecay
Definition RooBatchCompute.h:73

RooBatchCompute::Polynomial
@ Polynomial
Definition RooBatchCompute.h:98

RooBatchCompute::Lognormal
@ Lognormal
Definition RooBatchCompute.h:92

RooBatchCompute::TruthModelLinBasis
@ TruthModelLinBasis
Definition RooBatchCompute.h:105

RooBatchCompute::Identity
@ Identity
Definition RooBatchCompute.h:89

RooBatchCompute::TruthModelExpBasis
@ TruthModelExpBasis
Definition RooBatchCompute.h:102

RooBatchCompute::Bukin
@ Bukin
Definition RooBatchCompute.h:77

RooBatchCompute::Gaussian
@ Gaussian
Definition RooBatchCompute.h:88

RooBatchCompute::GaussModelExpBasis
@ GaussModelExpBasis
Definition RooBatchCompute.h:87

RooBatchCompute::DstD0BG
@ DstD0BG
Definition RooBatchCompute.h:82

RooBatchCompute::Chebychev
@ Chebychev
Definition RooBatchCompute.h:79

RooBatchCompute::Johnson
@ Johnson
Definition RooBatchCompute.h:90

RooBatchCompute::ProdPdf
@ ProdPdf
Definition RooBatchCompute.h:100

RooBatchCompute::TruthModelCoshBasis
@ TruthModelCoshBasis
Definition RooBatchCompute.h:108

RooBatchCompute::Poisson
@ Poisson
Definition RooBatchCompute.h:97

RooBatchCompute::BreitWigner
@ BreitWigner
Definition RooBatchCompute.h:76

RooBatchCompute::NormalizedPdf
@ NormalizedPdf
Definition RooBatchCompute.h:95

RooBatchCompute::ArgusBG
@ ArgusBG
Definition RooBatchCompute.h:72

RooBatchCompute::Novosibirsk
@ Novosibirsk
Definition RooBatchCompute.h:96

RooBatchCompute::Ratio
@ Ratio
Definition RooBatchCompute.h:101

RooBatchCompute::ExpPoly
@ ExpPoly
Definition RooBatchCompute.h:83

RooBatchCompute::TruthModelSinBasis
@ TruthModelSinBasis
Definition RooBatchCompute.h:103

RooBatchCompute::BifurGauss
@ BifurGauss
Definition RooBatchCompute.h:75

RooBatchCompute::CBShape
@ CBShape
Definition RooBatchCompute.h:78

RooBatchCompute::ReduceNLLOutput
Definition RooBatchCompute.h:112

RooBatchCompute::ReduceNLLOutput::nNaNValues
std::size_t nNaNValues
Definition RooBatchCompute.h:117

RooBatchCompute::ReduceNLLOutput::nLargeValues
std::size_t nLargeValues
Definition RooBatchCompute.h:115

RooBatchCompute::ReduceNLLOutput::nllSumCarry
double nllSumCarry
Definition RooBatchCompute.h:114

RooBatchCompute::ReduceNLLOutput::nNonPositiveValues
std::size_t nNonPositiveValues
Definition RooBatchCompute.h:116

RooBatchCompute::ReduceNLLOutput::nllSum
double nllSum
Definition RooBatchCompute.h:113

output
static void output()