Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.h
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN 6 January 2021
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13#ifndef ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
14#define ROOFIT_BATCHCOMPUTE_ROOBATCHCOMPUTE_H
15
17
18#include <DllImport.h> //for R__EXTERN, needed for windows
19#include <TError.h>
20
21#include <Math/Util.h>
22
23#include <functional>
24#include <string>
25
26/**
27 * Namespace for dispatching RooFit computations to various backends.
28 *
29 * This namespace contains an interface for providing high-performance computation functions for use in
30 * RooAbsReal::evaluateSpan(), see RooBatchComputeInterface.
31 *
32 * Furthermore, several implementations of this interface can be created, which reside in RooBatchCompute::RF_ARCH,
33 * where RF_ARCH may be replaced by the architecture that this implementation targets, e.g. SSE, AVX, etc.
34 *
35 * Using the pointers RooBatchCompute::dispatchCPU and RooBatchCompute::dispatchCUDA, a computation request can be
36 * dispatched to the fastest backend that is available on a specific platform.
37 */
38namespace RooBatchCompute {
39
41
78};
79
82 std::size_t nLargeValues = 0;
83 std::size_t nNonPositiveValues = 0;
84 std::size_t nNaNValues = 0;
85};
86
87/**
88 * \class RooBatchComputeInterface
89 * \ingroup Roobatchcompute
90 * \brief The interface which should be implemented to provide optimised computation functions for implementations of
91 * RooAbsReal::evaluateSpan().
92 *
93 * The class RooBatchComputeInterface provides the mechanism for external modules (like RooFit) to call
94 * functions from the library. The power lies in the virtual functions that can resolve to different
95 * implementations for the functionality; for example, calling a function through dispatchCUDA
96 * will resolve to efficient CUDA implementations.
97 *
98 * This interface contains the signatures of the compute functions of every PDF that has an optimised implementation
99 * available. These are the functions that perform the actual computations in batches.
100 *
101 * Several implementations of this interface may be provided, e.g. SSE, AVX, AVX2 etc. At run time, the fastest
102 * implementation of this interface is selected, and using a virtual call, the computation is dispatched to the best
103 * backend.
104 *
105 * \see RooBatchCompute::dispatchCPU, RooBatchCompute::dispatchCUDA, RooBatchComputeClass, RF_ARCH
106 */
108public:
109 virtual ~RooBatchComputeInterface() = default;
110 virtual void compute(cudaStream_t *, Computer, RestrictArr, size_t, const VarVector &, ArgVector &) = 0;
111 inline void compute(cudaStream_t *stream, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
112 {
113 ArgVector extraArgs{};
114 compute(stream, comp, output, size, vars, extraArgs);
115 }
116
117 virtual double reduceSum(cudaStream_t *, InputArr input, size_t n) = 0;
118 virtual ReduceNLLOutput reduceNLL(cudaStream_t *, RooSpan<const double> probas, RooSpan<const double> weightSpan,
119 RooSpan<const double> weights, double weightSum,
120 RooSpan<const double> binVolumes) = 0;
121
122 virtual Architecture architecture() const = 0;
123 virtual std::string architectureName() const = 0;
124
125 // CUDA functions that need to be interfaced; the default implementations below throw std::bad_function_call
126 virtual void *cudaMalloc(size_t) { throw std::bad_function_call(); }
127 virtual void cudaFree(void *) { throw std::bad_function_call(); }
128 virtual void *cudaMallocHost(size_t) { throw std::bad_function_call(); }
129 virtual void cudaFreeHost(void *) { throw std::bad_function_call(); }
130 virtual cudaEvent_t *newCudaEvent(bool /*forTiming*/) { throw std::bad_function_call(); }
131 virtual void deleteCudaEvent(cudaEvent_t *) { throw std::bad_function_call(); }
132 virtual cudaStream_t *newCudaStream() { throw std::bad_function_call(); }
133 virtual void deleteCudaStream(cudaStream_t *) { throw std::bad_function_call(); }
134 virtual bool streamIsActive(cudaStream_t *) { throw std::bad_function_call(); }
135 virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *) { throw std::bad_function_call(); }
136 virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *) { throw std::bad_function_call(); }
137 virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *) { throw std::bad_function_call(); }
138 virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t * = nullptr) { throw std::bad_function_call(); }
139 virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t * = nullptr) { throw std::bad_function_call(); }
140};
141
142/**
143 * This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
144 * Using a virtual call, computation requests are dispatched to backends with architecture-specific functions
145 * such as SSE, AVX, AVX2, etc.
146 *
147 * \see RooBatchComputeInterface, RooBatchComputeClass, RF_ARCH
148 */
150} // End namespace RooBatchCompute
151
152#endif
#define R__EXTERN
Definition DllImport.h:27
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
Definition Util.h:122
The interface which should be implemented to provide optimised computation functions for implementati...
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
virtual cudaEvent_t * newCudaEvent(bool)
virtual void compute(cudaStream_t *, Computer, RestrictArr, size_t, const VarVector &, ArgVector &)=0
virtual std::string architectureName() const =0
virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *)
virtual ReduceNLLOutput reduceNLL(cudaStream_t *, RooSpan< const double > probas, RooSpan< const double > weightSpan, RooSpan< const double > weights, double weightSum, RooSpan< const double > binVolumes)=0
virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t *=nullptr)
virtual double reduceSum(cudaStream_t *, InputArr input, size_t n)=0
void compute(cudaStream_t *stream, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
virtual bool streamIsActive(cudaStream_t *)
virtual void deleteCudaStream(cudaStream_t *)
virtual void deleteCudaEvent(cudaEvent_t *)
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
virtual Architecture architecture() const =0
A simple container to hold a batch of data values.
Definition RooSpan.h:34
const Int_t n
Definition legend1.C:16
Namespace for dispatching RooFit computations to various backends.
std::vector< RooSpan< const double > > VarVector
R__EXTERN RooBatchComputeInterface * dispatchCUDA
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
const double *__restrict InputArr
std::vector< double > ArgVector
double *__restrict RestrictArr
ROOT::Math::KahanSum< double > nllSum
static void output()