doc/v632/RooBatchCompute_8h_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN 6 January 2021

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


#ifndef ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H

#define ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H


#include <ROOT/RSpan.hxx>


#include <DllImport.h> //for R__EXTERN, needed for windows


#include <cstddef>

#include <initializer_list>

#include <memory>

#include <string>


/**

 * Namespace for dispatching RooFit computations to various backends.

 *

 * This namespace contains an interface for providing high-performance computation functions for use in

 * RooAbsReal::doEval(), see RooBatchComputeInterface.

 *

 * Furthermore, several implementations of this interface can be created, which reside in RooBatchCompute::RF_ARCH,

 * where RF_ARCH may be replaced by the architecture that this implementation targets, e.g. SSE, AVX, etc.

 *

 * Using the pointers RooBatchCompute::dispatchCPU and RooBatchCompute::dispatchCUDA, a computation request can be

 * dispatched to the fastest backend that is available on a specific platform.

 */

namespace RooBatchCompute {


/// Forward declarations of the CUDA helper types. Only pointers to them are
/// used in this header, so the complete definitions are not required here.
namespace CudaInterface {
class CudaStream;
class CudaEvent;
} // namespace CudaInterface


/// Read-only view over a list of read-only spans of doubles.
using VarSpan = std::span<const std::span<const double>>;
/// Mutable view over a contiguous range of doubles.
using ArgSpan = std::span<double>;

/// Pointer to read-only doubles. The `__restrict` qualifier promises the
/// compiler that the pointed-to data is not accessed through another pointer.
using InputArr = const double *__restrict;


/// Compile-time buffer size shared by users of this header.
/// NOTE(review): presumably the number of elements processed per internal
/// batch buffer in the backends — confirm against the implementation.
constexpr std::size_t bufferSize{64};


/// Inspect hardware capabilities, and load the optimal library for RooFit
/// computations. Defined in Initialisation.cxx.
/// NOTE(review): the meaning of the returned int is not visible from this
/// header — confirm before relying on it.
int initCPU();

/// CUDA counterpart of initCPU(). Defined in Initialisation.cxx.
/// NOTE(review): presumably loads the CUDA backend and sets dispatchCUDA; the
/// return convention is not visible from this header — confirm.
int initCUDA();


/// Minimal configuration object that steers how a single node is evaluated
/// with the RooBatchCompute library.
///
/// Its only state is an optional CUDA stream pointer. The pointer is stored
/// as-is; this class never deletes it.
class Config {
public:
   /// Return the CUDA stream set via setCudaStream(), or nullptr if none was set.
   CudaInterface::CudaStream *cudaStream() const { return _cudaStream; }

   /// Store the CUDA stream to use; passing nullptr makes useCuda() return false.
   void setCudaStream(CudaInterface::CudaStream *cudaStream) { _cudaStream = cudaStream; }

   /// True exactly when a non-null CUDA stream is currently stored.
   bool useCuda() const { return nullptr != _cudaStream; }

private:
   CudaInterface::CudaStream *_cudaStream = nullptr; ///< Null by default, i.e. no CUDA.
};


/// Backend architecture identifiers, as reported by
/// RooBatchComputeInterface::architecture(). The enumerators have no explicit
/// values, so they are numbered from zero in declaration order.
enum class Architecture {
   AVX512,
   AVX2,
   AVX,
   SSE4,
   GENERIC,
   CUDA
};


/// Identifiers of the computations that can be requested through compute().
///
/// The enumerators carry no explicit values: they are numbered consecutively
/// from zero in declaration order, so inserting or reordering entries changes
/// the numeric value of every entry that follows.
enum Computer {
   // clang-format off
   AddPdf, ArgusBG, BMixDecay, Bernstein, BifurGauss,
   BreitWigner, Bukin, CBShape, Chebychev, ChiSquare,
   DeltaFunction, DstD0BG, ExpPoly, Exponential, ExponentialNeg,
   Gamma, GaussModelExpBasis, Gaussian, Identity, Johnson,
   Landau, Lognormal, LognormalStandard, NegativeLogarithms, NormalizedPdf,
   Novosibirsk, Poisson, Polynomial, Power, ProdPdf,
   Ratio,
   TruthModelExpBasis, TruthModelSinBasis, TruthModelCosBasis, TruthModelLinBasis,
   TruthModelQuadBasis, TruthModelSinhBasis, TruthModelCoshBasis,
   Voigtian
   // clang-format on
};


/// Plain aggregate returned by reduceNLL(). Every member starts at zero.
/// NOTE(review): nllSumCarry is presumably the compensation term of a
/// compensated summation of nllSum — confirm against the backend code.
struct ReduceNLLOutput {
   double nllSum{0.0};
   double nllSumCarry{0.0};
   std::size_t nLargeValues{0};
   std::size_t nNonPositiveValues{0};
   std::size_t nNaNValues{0};
};


/// Abstract interface of a buffer of doubles that exposes separate "host" and
/// "device" access points for reading, writing and assignment.
/// NOTE(review): "host" and "device" presumably denote CPU and GPU memory, and
/// the accessors presumably synchronise the two copies when needed — confirm
/// against the concrete implementations.
class AbsBuffer {

public:

   /// Virtual so that concrete buffers can be destroyed through an AbsBuffer pointer.
   virtual ~AbsBuffer() = default;


   /// Read-only pointers to the buffer contents on the host / device side.
   virtual double const *hostReadPtr() const = 0;

   virtual double const *deviceReadPtr() const = 0;


   /// Writable pointers to the buffer contents on the host / device side.
   virtual double *hostWritePtr() = 0;

   virtual double *deviceWritePtr() = 0;


   /// Assign the buffer from a read-only span of doubles.
   /// NOTE(review): the suffix presumably states where `input` resides — confirm.
   virtual void assignFromHost(std::span<const double> input) = 0;

   virtual void assignFromDevice(std::span<const double> input) = 0;

};


/// Abstract factory for AbsBuffer objects. Every factory function hands
/// ownership of the new buffer to the caller via std::unique_ptr.
/// NOTE(review): whether a manager pools or recycles the underlying memory is
/// not visible from this header — confirm against the implementation.
class AbsBufferManager {

public:

   /// Virtual so that concrete managers can be destroyed through an AbsBufferManager pointer.
   virtual ~AbsBufferManager() = default;


   /// Create a buffer without a size argument.
   /// NOTE(review): presumably holds a single value — confirm.
   virtual std::unique_ptr<AbsBuffer> makeScalarBuffer() = 0;

   /// Create a buffer for `size` elements.
   /// NOTE(review): the Cpu/Gpu naming presumably selects the memory space — confirm.
   virtual std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) = 0;

   virtual std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) = 0;

   /// Create a buffer for `size` elements, optionally associated with a CUDA stream.
   /// The default argument of a virtual function is chosen from the static type
   /// of the call expression, so overriders should repeat `= nullptr`.
   virtual std::unique_ptr<AbsBuffer>

   makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) = 0;

};


/**
 * \class RooBatchComputeInterface
 * \ingroup roofit_dev_docs_batchcompute
 * \brief The interface which should be implemented to provide optimised computation functions for implementations of
 * RooAbsReal::doEval().
 *
 * The class RooBatchComputeInterface provides the mechanism for external modules (like RooFit) to call
 * functions from the library. The power lies in the virtual functions that can resolve to different
 * implementations for the functionality; for example, calling a function through dispatchCUDA
 * will resolve to efficient CUDA implementations.
 *
 * This interface contains the signatures of the compute functions of every PDF that has an optimised implementation
 * available. These are the functions that perform the actual computations in batches.
 *
 * Several implementations of this interface may be provided, e.g. SSE, AVX, AVX2 etc. At run time, the fastest
 * implementation of this interface is selected, and using a virtual call, the computation is dispatched to the best
 * backend.
 *
 * \see RooBatchCompute::dispatchCPU, RooBatchCompute::dispatchCUDA, RooBatchComputeClass, RF_ARCH
 */
class RooBatchComputeInterface {

public:

   /// Virtual so that backends can be destroyed through an interface pointer.
   virtual ~RooBatchComputeInterface() = default;

   /// Run the computation selected by the Computer argument.
   /// NOTE(review): presumably results are written to `output`, the VarSpan holds
   /// the per-event inputs and the ArgSpan extra scalar arguments — confirm.
   virtual void compute(Config const &cfg, Computer, std::span<double> output, VarSpan, ArgSpan) = 0;


   /// Reduce the `n` doubles starting at `input` to a single value.
   /// NOTE(review): presumably a plain sum, going by the name — confirm.
   virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0;

   /// Reduce the given spans to a ReduceNLLOutput.
   /// NOTE(review): the exact formula and the role of `offsetProbas` are not
   /// visible from this header — confirm against a backend implementation.
   virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,

                                     std::span<const double> offsetProbas) = 0;


   /// Architecture this backend targets, as an enum value and as a string.
   virtual Architecture architecture() const = 0;

   virtual std::string architectureName() const = 0;


   /// Create a buffer manager; ownership passes to the caller.
   virtual std::unique_ptr<AbsBufferManager> createBufferManager() const = 0;


   /// Thin wrappers around CUDA event/stream handling. All of them work on raw
   /// pointers to the opaque CudaInterface types.
   /// NOTE(review): objects obtained from newCudaEvent()/newCudaStream() are
   /// presumably meant to be released with deleteCudaEvent()/deleteCudaStream() — confirm.
   virtual CudaInterface::CudaEvent *newCudaEvent(bool forTiming) const = 0;

   virtual CudaInterface::CudaStream *newCudaStream() const = 0;

   virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const = 0;

   virtual void deleteCudaStream(CudaInterface::CudaStream *) const = 0;

   virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const = 0;

   virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const = 0;

   virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const = 0;

};


/**
 * This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
 * Using a virtual call, computation requests are dispatched to backends with architecture-specific functions
 * such as SSE, AVX, AVX2, etc.
 *
 * It is the backend used by compute(), reduceSum() and reduceNLL() when Config::useCuda() is false, and it is
 * dereferenced unconditionally by cpuArchitecture() and cpuArchitectureName().
 *
 * \see RooBatchComputeInterface, RooBatchComputeClass, RF_ARCH
 */
R__EXTERN RooBatchComputeInterface *dispatchCPU;

/// Backend used by compute(), reduceSum() and reduceNLL() when Config::useCuda() is true.
/// NOTE(review): presumably set by initCUDA() and null until then — confirm.
R__EXTERN RooBatchComputeInterface *dispatchCUDA;


/// Return the architecture reported by the CPU backend.
/// dispatchCPU is dereferenced without a check, so it must be non-null.
inline Architecture cpuArchitecture()
{
   const auto &cpuBackend = *dispatchCPU;
   return cpuBackend.architecture();
}


/// Return the architecture name reported by the CPU backend.
/// dispatchCPU is dereferenced without a check, so it must be non-null.
inline std::string cpuArchitectureName()
{
   const auto &cpuBackend = *dispatchCPU;
   return cpuBackend.architectureName();
}


/// Forward a computation request to the backend selected by `cfg`:
/// dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise.
/// The selected pointer is dereferenced without a check, so it must be non-null.
inline void compute(Config cfg, Computer comp, std::span<double> output, VarSpan vars, ArgSpan extraArgs = {})
{
   auto backend = dispatchCPU;
   if (cfg.useCuda()) {
      backend = dispatchCUDA;
   }
   backend->compute(cfg, comp, output, vars, extraArgs);
}


/// Convenience overload taking the input spans as a braced list.
///
/// A std::span cannot be constructed directly from an initializer list
/// (probably it will be possible with C++26), hence this explicit overload: it
/// builds a VarSpan over the list's elements and forwards to the VarSpan overload.
inline void compute(Config cfg, Computer comp, std::span<double> output,
                    std::initializer_list<std::span<const double>> vars, ArgSpan extraArgs = {})
{
   // View over the initializer list's backing array; no elements are copied.
   const VarSpan varSpan{vars.begin(), vars.end()};
   compute(cfg, comp, output, varSpan, extraArgs);
}


/// Forward a reduceSum request to the backend selected by `cfg`:
/// dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise.
/// The selected pointer is dereferenced without a check, so it must be non-null.
inline double reduceSum(Config cfg, InputArr input, size_t n)
{
   return (cfg.useCuda() ? dispatchCUDA : dispatchCPU)->reduceSum(cfg, input, n);
}


/// Forward a reduceNLL request to the backend selected by `cfg`:
/// dispatchCUDA when cfg.useCuda() is true, dispatchCPU otherwise.
/// The selected pointer is dereferenced without a check, so it must be non-null.
inline ReduceNLLOutput reduceNLL(Config cfg, std::span<const double> probas, std::span<const double> weights,
                                 std::span<const double> offsetProbas)
{
   if (cfg.useCuda()) {
      return dispatchCUDA->reduceNLL(cfg, probas, weights, offsetProbas);
   }
   return dispatchCPU->reduceNLL(cfg, probas, weights, offsetProbas);
}


} // End namespace RooBatchCompute


#endif

DllImport.h

R__EXTERN
#define R__EXTERN
Definition DllImport.h:26

RSpan.hxx

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Definition TGWin32VirtualXProxy.cxx:142

RooBatchCompute::AbsBufferManager
Definition RooBatchCompute.h:131

RooBatchCompute::AbsBufferManager::makeScalarBuffer
virtual std::unique_ptr< AbsBuffer > makeScalarBuffer()=0

RooBatchCompute::AbsBufferManager::~AbsBufferManager
virtual ~AbsBufferManager()=default

RooBatchCompute::AbsBufferManager::makeCpuBuffer
virtual std::unique_ptr< AbsBuffer > makeCpuBuffer(std::size_t size)=0

RooBatchCompute::AbsBufferManager::makeGpuBuffer
virtual std::unique_ptr< AbsBuffer > makeGpuBuffer(std::size_t size)=0

RooBatchCompute::AbsBufferManager::makePinnedBuffer
virtual std::unique_ptr< AbsBuffer > makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream=nullptr)=0

RooBatchCompute::AbsBuffer
Definition RooBatchCompute.h:117

RooBatchCompute::AbsBuffer::deviceReadPtr
virtual double const * deviceReadPtr() const =0

RooBatchCompute::AbsBuffer::~AbsBuffer
virtual ~AbsBuffer()=default

RooBatchCompute::AbsBuffer::assignFromHost
virtual void assignFromHost(std::span< const double > input)=0

RooBatchCompute::AbsBuffer::hostReadPtr
virtual double const * hostReadPtr() const =0

RooBatchCompute::AbsBuffer::deviceWritePtr
virtual double * deviceWritePtr()=0

RooBatchCompute::AbsBuffer::assignFromDevice
virtual void assignFromDevice(std::span< const double > input)=0

RooBatchCompute::AbsBuffer::hostWritePtr
virtual double * hostWritePtr()=0

RooBatchCompute::Config
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
Definition RooBatchCompute.h:55

RooBatchCompute::Config::useCuda
bool useCuda() const
Definition RooBatchCompute.h:57

RooBatchCompute::Config::setCudaStream
void setCudaStream(CudaInterface::CudaStream *cudaStream)
Definition RooBatchCompute.h:58

RooBatchCompute::Config::_cudaStream
CudaInterface::CudaStream * _cudaStream
Definition RooBatchCompute.h:62

RooBatchCompute::Config::cudaStream
CudaInterface::CudaStream * cudaStream() const
Definition RooBatchCompute.h:59

RooBatchCompute::CudaInterface::CudaEvent
Definition CudaInterface.h:43

RooBatchCompute::CudaInterface::CudaStream
Definition CudaInterface.h:58

RooBatchCompute::RooBatchComputeInterface
The interface which should be implemented to provide optimised computation functions for implementati...
Definition RooBatchCompute.h:162

RooBatchCompute::RooBatchComputeInterface::reduceSum
virtual double reduceSum(Config const &cfg, InputArr input, size_t n)=0

RooBatchCompute::RooBatchComputeInterface::architectureName
virtual std::string architectureName() const =0

RooBatchCompute::RooBatchComputeInterface::deleteCudaEvent
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0

RooBatchCompute::RooBatchComputeInterface::newCudaEvent
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0

RooBatchCompute::RooBatchComputeInterface::cudaEventRecord
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0

RooBatchCompute::RooBatchComputeInterface::createBufferManager
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0

RooBatchCompute::RooBatchComputeInterface::cudaStreamWaitForEvent
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0

RooBatchCompute::RooBatchComputeInterface::newCudaStream
virtual CudaInterface::CudaStream * newCudaStream() const =0

RooBatchCompute::RooBatchComputeInterface::deleteCudaStream
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0

RooBatchCompute::RooBatchComputeInterface::cudaStreamIsActive
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0

RooBatchCompute::RooBatchComputeInterface::architecture
virtual Architecture architecture() const =0

RooBatchCompute::RooBatchComputeInterface::reduceNLL
virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)=0

RooBatchCompute::RooBatchComputeInterface::compute
virtual void compute(Config const &cfg, Computer, std::span< double > output, VarSpan, ArgSpan)=0

RooBatchCompute::RooBatchComputeInterface::~RooBatchComputeInterface
virtual ~RooBatchComputeInterface()=default

n
const Int_t n
Definition legend1.C:16

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition RooBatchCompute.h:37

RooBatchCompute::dispatchCUDA
R__EXTERN RooBatchComputeInterface * dispatchCUDA
Definition RooBatchCompute.h:193

RooBatchCompute::ArgSpan
std::span< double > ArgSpan
Definition RooBatchCompute.h:45

RooBatchCompute::Architecture
Architecture
Definition RooBatchCompute.h:65

RooBatchCompute::Architecture::SSE4
@ SSE4

RooBatchCompute::Architecture::AVX
@ AVX

RooBatchCompute::Architecture::AVX512
@ AVX512

RooBatchCompute::Architecture::CUDA
@ CUDA

RooBatchCompute::Architecture::GENERIC
@ GENERIC

RooBatchCompute::Architecture::AVX2
@ AVX2

RooBatchCompute::cpuArchitectureName
std::string cpuArchitectureName()
Definition RooBatchCompute.h:200

RooBatchCompute::compute
void compute(Config cfg, Computer comp, std::span< double > output, VarSpan vars, ArgSpan extraArgs={})
Definition RooBatchCompute.h:205

RooBatchCompute::dispatchCPU
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Definition RooBatchCompute.h:192

RooBatchCompute::bufferSize
constexpr std::size_t bufferSize
Definition RooBatchCompute.h:48

RooBatchCompute::reduceSum
double reduceSum(Config cfg, InputArr input, size_t n)
Definition RooBatchCompute.h:220

RooBatchCompute::reduceNLL
ReduceNLLOutput reduceNLL(Config cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)
Definition RooBatchCompute.h:226

RooBatchCompute::cpuArchitecture
Architecture cpuArchitecture()
Definition RooBatchCompute.h:195

RooBatchCompute::InputArr
const double *__restrict InputArr
Definition RooBatchCompute.h:46

RooBatchCompute::initCUDA
int initCUDA()
Definition Initialisation.cxx:89

RooBatchCompute::VarSpan
std::span< const std::span< const double > > VarSpan
Definition RooBatchCompute.h:44

RooBatchCompute::Computer
Computer
Definition RooBatchCompute.h:67

RooBatchCompute::Landau
@ Landau
Definition RooBatchCompute.h:88

RooBatchCompute::Bernstein
@ Bernstein
Definition RooBatchCompute.h:71

RooBatchCompute::ChiSquare
@ ChiSquare
Definition RooBatchCompute.h:77

RooBatchCompute::ExponentialNeg
@ ExponentialNeg
Definition RooBatchCompute.h:82

RooBatchCompute::TruthModelSinhBasis
@ TruthModelSinhBasis
Definition RooBatchCompute.h:104

RooBatchCompute::DeltaFunction
@ DeltaFunction
Definition RooBatchCompute.h:78

RooBatchCompute::TruthModelCosBasis
@ TruthModelCosBasis
Definition RooBatchCompute.h:101

RooBatchCompute::TruthModelQuadBasis
@ TruthModelQuadBasis
Definition RooBatchCompute.h:103

RooBatchCompute::NegativeLogarithms
@ NegativeLogarithms
Definition RooBatchCompute.h:91

RooBatchCompute::Exponential
@ Exponential
Definition RooBatchCompute.h:81

RooBatchCompute::Voigtian
@ Voigtian
Definition RooBatchCompute.h:106

RooBatchCompute::Power
@ Power
Definition RooBatchCompute.h:96

RooBatchCompute::AddPdf
@ AddPdf
Definition RooBatchCompute.h:68

RooBatchCompute::Gamma
@ Gamma
Definition RooBatchCompute.h:83

RooBatchCompute::LognormalStandard
@ LognormalStandard
Definition RooBatchCompute.h:90

RooBatchCompute::BMixDecay
@ BMixDecay
Definition RooBatchCompute.h:70

RooBatchCompute::Polynomial
@ Polynomial
Definition RooBatchCompute.h:95

RooBatchCompute::Lognormal
@ Lognormal
Definition RooBatchCompute.h:89

RooBatchCompute::TruthModelLinBasis
@ TruthModelLinBasis
Definition RooBatchCompute.h:102

RooBatchCompute::Identity
@ Identity
Definition RooBatchCompute.h:86

RooBatchCompute::TruthModelExpBasis
@ TruthModelExpBasis
Definition RooBatchCompute.h:99

RooBatchCompute::Bukin
@ Bukin
Definition RooBatchCompute.h:74

RooBatchCompute::Gaussian
@ Gaussian
Definition RooBatchCompute.h:85

RooBatchCompute::GaussModelExpBasis
@ GaussModelExpBasis
Definition RooBatchCompute.h:84

RooBatchCompute::DstD0BG
@ DstD0BG
Definition RooBatchCompute.h:79

RooBatchCompute::Chebychev
@ Chebychev
Definition RooBatchCompute.h:76

RooBatchCompute::Johnson
@ Johnson
Definition RooBatchCompute.h:87

RooBatchCompute::ProdPdf
@ ProdPdf
Definition RooBatchCompute.h:97

RooBatchCompute::TruthModelCoshBasis
@ TruthModelCoshBasis
Definition RooBatchCompute.h:105

RooBatchCompute::Poisson
@ Poisson
Definition RooBatchCompute.h:94

RooBatchCompute::BreitWigner
@ BreitWigner
Definition RooBatchCompute.h:73

RooBatchCompute::NormalizedPdf
@ NormalizedPdf
Definition RooBatchCompute.h:92

RooBatchCompute::ArgusBG
@ ArgusBG
Definition RooBatchCompute.h:69

RooBatchCompute::Novosibirsk
@ Novosibirsk
Definition RooBatchCompute.h:93

RooBatchCompute::Ratio
@ Ratio
Definition RooBatchCompute.h:98

RooBatchCompute::ExpPoly
@ ExpPoly
Definition RooBatchCompute.h:80

RooBatchCompute::TruthModelSinBasis
@ TruthModelSinBasis
Definition RooBatchCompute.h:100

RooBatchCompute::BifurGauss
@ BifurGauss
Definition RooBatchCompute.h:72

RooBatchCompute::CBShape
@ CBShape
Definition RooBatchCompute.h:75

RooBatchCompute::initCPU
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
Definition Initialisation.cxx:44

RooBatchCompute::ReduceNLLOutput
Definition RooBatchCompute.h:109

RooBatchCompute::ReduceNLLOutput::nNaNValues
std::size_t nNaNValues
Definition RooBatchCompute.h:114

RooBatchCompute::ReduceNLLOutput::nLargeValues
std::size_t nLargeValues
Definition RooBatchCompute.h:112

RooBatchCompute::ReduceNLLOutput::nllSumCarry
double nllSumCarry
Definition RooBatchCompute.h:111

RooBatchCompute::ReduceNLLOutput::nNonPositiveValues
std::size_t nNonPositiveValues
Definition RooBatchCompute.h:113

RooBatchCompute::ReduceNLLOutput::nllSum
double nllSum
Definition RooBatchCompute.h:110

output
static void output()