Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.h
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN 6 January 2021
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13#ifndef ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
14#define ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
15
16#include <ROOT/RSpan.hxx>
17
18#include <DllImport.h> //for R__EXTERN, needed for windows
19
20#include <cstddef>
21#include <initializer_list>
22#include <memory>
23#include <string>
24
25/**
26 * Namespace for dispatching RooFit computations to various backends.
27 *
28 * This namespace contains an interface for providing high-performance computation functions for use in
29 * RooAbsReal::doEval(), see RooBatchComputeInterface.
30 *
31 * Furthermore, several implementations of this interface can be created, which reside in RooBatchCompute::RF_ARCH,
32 * where RF_ARCH may be replaced by the architecture that this implementation targets, e.g. SSE, AVX, etc.
33 *
34 * Using the pointer RooBatchCompute::dispatch, a computation request can be dispatched to the fastest backend that is
35 * available on a specific platform.
36 */
37namespace RooBatchCompute {
38
// Opaque handle types for the CUDA backend. They are only forward-declared
// here; this header refers to them exclusively through pointers, so no
// definition is needed at this point.
39namespace CudaInterface {
40class CudaEvent;
41class CudaStream;
42} // namespace CudaInterface
43
44typedef std::span<const std::span<const double>> VarSpan;
45typedef std::span<double> ArgSpan;
46typedef const double *__restrict InputArr;
47
/// Size constant exposed to users of this header.
/// NOTE(review): the unit (elements vs. bytes) is not visible here -- confirm
/// against the code that consumes it.
///
/// Declared `inline` so that all translation units including this header share
/// one entity. A plain namespace-scope `constexpr` variable has internal
/// linkage, which yields one copy per translation unit and makes odr-use from
/// inline functions an ODR hazard.
inline constexpr std::size_t bufferSize = 64;
49
/// Inspect hardware capabilities, and load the optimal library for RooFit computations.
/// NOTE(review): the meaning of the returned int is not visible in this header -- confirm.
50int initCPU();
/// NOTE(review): presumably the CUDA counterpart of initCPU(); neither its behaviour
/// nor the meaning of the returned int is visible in this header -- confirm.
51int initCUDA();
52
53/// Minimal configuration struct to steer the evaluation of a single node with
54/// the RooBatchCompute library.
55class Config {
56public:
57 bool useCuda() const { return _cudaStream != nullptr; }
60
61private:
63};
64
66
108
110 double nllSum = 0.0;
111 double nllSumCarry = 0.0;
112 std::size_t nInfiniteValues = 0;
113 std::size_t nNonPositiveValues = 0;
114 std::size_t nNaNValues = 0;
115};
116
118public:
119 virtual ~AbsBuffer() = default;
120
121 virtual double const *hostReadPtr() const = 0;
122 virtual double const *deviceReadPtr() const = 0;
123
124 virtual double *hostWritePtr() = 0;
125 virtual double *deviceWritePtr() = 0;
126
127 virtual void assignFromHost(std::span<const double> input) = 0;
128 virtual void assignFromDevice(std::span<const double> input) = 0;
129};
130
132public:
133 virtual ~AbsBufferManager() = default;
134
135 virtual std::unique_ptr<AbsBuffer> makeScalarBuffer() = 0;
136 virtual std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) = 0;
137 virtual std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) = 0;
138 virtual std::unique_ptr<AbsBuffer>
139 makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) = 0;
140};
141
142/**
143 * \class RooBatchComputeInterface
144 * \ingroup roofit_dev_docs_batchcompute
145 * \brief The interface which should be implemented to provide optimised computation functions for implementations of
146 * RooAbsReal::doEval().
147 *
148 * The class RooBatchComputeInterface provides the mechanism for external modules (like RooFit) to call
149 * functions from the library. The power lies in the virtual functions that can resolve to different
150 * implementations for the functionality; for example, calling a function through dispatchCuda
151 * will resolve to efficient CUDA implementations.
152 *
153 * This interface contains the signatures of the compute functions of every PDF that has an optimised implementation
154 * available. These are the functions that perform the actual computations in batches.
155 *
156 * Several implementations of this interface may be provided, e.g. SSE, AVX, AVX2 etc. At run time, the fastest
157 * implementation of this interface is selected, and using a virtual call, the computation is dispatched to the best
158 * backend.
159 *
160 * \see RooBatchCompute::dispatch, RooBatchComputeClass, RF_ARCH
161 */
163public:
164 virtual ~RooBatchComputeInterface() = default;
165 virtual void compute(Config const &cfg, Computer, std::span<double> output, VarSpan, ArgSpan) = 0;
166
167 virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0;
168 virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,
169 std::span<const double> offsetProbas) = 0;
170
171 virtual Architecture architecture() const = 0;
172 virtual std::string architectureName() const = 0;
173
174 virtual std::unique_ptr<AbsBufferManager> createBufferManager() const = 0;
175
178 virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const = 0;
183};
184
185/**
186 * This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
187 * Using a virtual call, computation requests are dispatched to backends with architecture-specific functions
188 * such as SSE, AVX, AVX2, etc.
189 *
190 * \see RooBatchComputeInterface, RooBatchComputeClass, RF_ARCH
191 */
194
196{
197 return dispatchCPU->architecture();
198}
199
200inline std::string cpuArchitectureName()
201{
203}
204
205inline void compute(Config cfg, Computer comp, std::span<double> output, VarSpan vars, ArgSpan extraArgs = {})
206{
207 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
208 dispatch->compute(cfg, comp, output, vars, extraArgs);
209}
210
211/// It is not possible to construct a std::span directly from an initializer
212/// list (probably it will be with C++26). That's why we need an explicit
213/// overload for this.
214inline void compute(Config cfg, Computer comp, std::span<double> output,
215 std::initializer_list<std::span<const double>> vars, ArgSpan extraArgs = {})
216{
217 compute(cfg, comp, output, VarSpan{vars.begin(), vars.end()}, extraArgs);
218}
219
220inline double reduceSum(Config cfg, InputArr input, size_t n)
221{
222 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
223 return dispatch->reduceSum(cfg, input, n);
224}
225
226inline ReduceNLLOutput reduceNLL(Config cfg, std::span<const double> probas, std::span<const double> weights,
227 std::span<const double> offsetProbas)
228{
229 auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
230 return dispatch->reduceNLL(cfg, probas, weights, offsetProbas);
231}
232
233} // End namespace RooBatchCompute
234
235#endif
#define R__EXTERN
Definition DllImport.h:26
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
virtual std::unique_ptr< AbsBuffer > makeScalarBuffer()=0
virtual ~AbsBufferManager()=default
virtual std::unique_ptr< AbsBuffer > makeCpuBuffer(std::size_t size)=0
virtual std::unique_ptr< AbsBuffer > makeGpuBuffer(std::size_t size)=0
virtual std::unique_ptr< AbsBuffer > makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream=nullptr)=0
virtual double const * deviceReadPtr() const =0
virtual ~AbsBuffer()=default
virtual void assignFromHost(std::span< const double > input)=0
virtual double const * hostReadPtr() const =0
virtual double * deviceWritePtr()=0
virtual void assignFromDevice(std::span< const double > input)=0
virtual double * hostWritePtr()=0
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
CudaInterface::CudaStream * _cudaStream
CudaInterface::CudaStream * cudaStream() const
The interface which should be implemented to provide optimised computation functions for implementati...
virtual double reduceSum(Config const &cfg, InputArr input, size_t n)=0
virtual std::string architectureName() const =0
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
virtual Architecture architecture() const =0
virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)=0
virtual void compute(Config const &cfg, Computer, std::span< double > output, VarSpan, ArgSpan)=0
const Int_t n
Definition legend1.C:16
Namespace for dispatching RooFit computations to various backends.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::span< double > ArgSpan
std::string cpuArchitectureName()
void compute(Config cfg, Computer comp, std::span< double > output, VarSpan vars, ArgSpan extraArgs={})
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
constexpr std::size_t bufferSize
double reduceSum(Config cfg, InputArr input, size_t n)
ReduceNLLOutput reduceNLL(Config cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas)
Architecture cpuArchitecture()
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
static void output()