56void assignSpan(std::span<T> &to, std::span<T> 
const &from)
 
   79   auto log = [](std::string_view message) {
 
   84      log(
"using generic CPU library compiled with no vectorizations");
 
   89      log(
"using CUDA computation library");
 
  104   std::shared_ptr<RooBatchCompute::AbsBuffer> 
buffer;
 
 
  154      throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
 
  170   std::map<RooFit::Detail::DataKey, NodeInfo *> 
nodeInfos;
 
  174   std::size_t iNode = 0;
 
  180      nodeInfo.originalOperMode = arg->operMode();
 
  187         arg->setDataToken(iNode);
 
  197      info.serverInfos.reserve(
info.absArg->servers().size());
 
 
  243      throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
 
  251   std::size_t iNode = 0;
 
  253      const bool fromArrayInput = 
info.absArg->namePtr() == namePtr;
 
  254      if (fromArrayInput) {
 
  255         info.fromArrayInput = 
true;
 
  256         info.absArg->setDataToken(iNode);
 
  284      info.isDirty = !
info.fromArrayInput;
 
  288   _needToUpdateOutputSizes = 
true;
 
 
  291void Evaluator::updateOutputSizes()
 
  293   std::map<RooFit::Detail::DataKey, std::size_t> 
sizeMap;
 
  294   for (
auto &
info : _nodes) {
 
  295      if (
info.fromArrayInput) {
 
  305         auto found = 
sizeMap.find(key);
 
  306         return found != 
sizeMap.
end() ? found->second : -1;
 
  309   for (
auto &
info : _nodes) {
 
  317      if (!
info.isScalar()) {
 
  320         setOperMode(
info.absArg, 
info.originalOperMode);
 
  328   _needToUpdateOutputSizes = 
false;
 
 
  331Evaluator::~Evaluator()
 
  333   for (
auto &
info : _nodes) {
 
  334      if (!
info.isVariable) {
 
  335         info.absArg->resetDataToken();
 
 
  342   using namespace Detail;
 
  344   const std::size_t 
nOut = 
info.outputSize;
 
  346   double *buffer = 
nullptr;
 
  348      buffer = &
info.scalarBuffer;
 
  350         _evalContextCUDA.set(node, {buffer, 
nOut});
 
  353      if (!
info.hasLogged && _useGPU) {
 
  356                                        << 
" could not be evaluated on the GPU because the class doesn't support it. " 
  357                                           "Consider requesting or implementing it to benefit from a speed up." 
  359         info.hasLogged = 
true;
 
  362         info.buffer = 
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut, 
info.stream)
 
  363                                                : _bufferManager->makeCpuBuffer(
nOut);
 
  365      buffer = 
info.buffer->hostWritePtr();
 
  367   assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
 
  368   _evalContextCPU.set(node, {buffer, 
nOut});
 
  370      _evalContextCPU.enableVectorBuffers(
true);
 
  372   if (
info.isCategory) {
 
  377         throw std::runtime_error(
"RooFit::Evaluator - non-scalar category values are not supported!");
 
  383   _evalContextCPU.resetVectorBuffers();
 
  384   _evalContextCPU.enableVectorBuffers(
false);
 
  385   if (
info.copyAfterEvaluation) {
 
  386      _evalContextCUDA.set(node, {
info.buffer->deviceReadPtr(), 
nOut});
 
 
  398   auto *var = 
static_cast<RooRealVar const *
>(node);
 
  399   if (
nodeInfo.lastSetValCount != var->valueResetCounter()) {
 
  400      nodeInfo.lastSetValCount = var->valueResetCounter();
 
 
  419std::span<const double> Evaluator::run()
 
  421   if (_needToUpdateOutputSizes)
 
  427      return getValHeterogeneous();
 
  445   return _evalContextCPU.at(&_topNode);
 
 
  449std::span<const double> Evaluator::getValHeterogeneous()
 
  451   for (
auto &
info : _nodes) {
 
  452      info.remClients = 
info.clientInfos.size();
 
  453      info.remServers = 
info.serverInfos.size();
 
  454      if (
info.buffer && !
info.fromArrayInput) {
 
  460   for (
auto &
info : _nodes) {
 
  461      if (
info.remServers == 0 && 
info.computeInGPU) {
 
  469      for (
auto &
info : _nodes) {
 
  471            info.remServers = -2;
 
  486      auto it = _nodes.
begin();
 
  487      for (; it != _nodes.end(); it++) {
 
  488         if (it->remServers == 0 && !it->computeInGPU)
 
  493      if (it == _nodes.end()) {
 
  494         std::this_thread::sleep_for(std::chrono::milliseconds(1));
 
  501      info.remServers = -2; 
 
  503      if (!
info.fromArrayInput) {
 
  504         computeCPUNode(node, 
info);
 
  519   return _evalContextCUDA.at(&_topNode);
 
 
  526   using namespace Detail;
 
  528   info.remServers = -1;
 
  538   const std::size_t 
nOut = 
info.outputSize;
 
  540   double *buffer = 
nullptr;
 
  542      buffer = &
info.scalarBuffer;
 
  543      _evalContextCPU.set(node, {buffer, 
nOut});
 
  545      info.buffer = 
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut, 
info.stream)
 
  546                                             : _bufferManager->makeGpuBuffer(
nOut);
 
  547      buffer = 
info.buffer->deviceWritePtr();
 
  549   assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
 
  550   _evalContextCUDA.set(node, {buffer, 
nOut});
 
  551   node->doEval(_evalContextCUDA);
 
  553   if (
info.copyAfterEvaluation) {
 
  554      _evalContextCPU.set(node, {
info.buffer->hostReadPtr(), 
nOut});
 
 
  559void Evaluator::markGPUNodes()
 
  563   for (
auto &
info : _nodes) {
 
  564      info.computeInGPU = 
false;
 
  565      if (!
info.absArg->canComputeBatchWithCuda()) {
 
  570            info.computeInGPU = 
true;
 
  577   for (
auto &
info : _nodes) {
 
  578      info.copyAfterEvaluation = 
false;
 
  580      if (!
info.isScalar()) {
 
  583               info.copyAfterEvaluation = 
true;
 
 
  596      _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, 
opMode));
 
 
  600void Evaluator::print(std::ostream &os)
 
  602   std::cout << 
"--- RooFit BatchMode evaluation ---\n";
 
  604   std::vector<int> 
widths{9, 37, 20, 9, 10, 20};
 
  607      const char separator = 
' ';
 
  608      os << separator << std::left << std::setw(
widths[
iCol]) << std::setfill(separator) << t;
 
  617      for (
int i = 0; i < 
n; i++) {
 
  636   for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
 
  640      auto span = _evalContextCPU.at(node);
 
 
  666      if (
nodeInfo.absArg->isFundamental()) {
 
 
  686   if (
mode == _evalContextCPU._offsetMode)
 
  689   _evalContextCPU._offsetMode = 
mode;
 
  690   _evalContextCUDA._offsetMode = 
mode;
 
  693      if (
nodeInfo.absArg->isReducerNode()) {
 
 
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
 
Option_t Option_t TPoint TPoint const char mode
 
const_iterator begin() const
 
const_iterator end() const
 
Common abstract base class for objects that represent a value and a "shape" in RooFit.
 
OperMode operMode() const
Query the operation mode of this node.
 
A space to attach TBranches.
 
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
 
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
 
Abstract base class for objects that represent a real value and implements functionality common to all real-valued objects.
 
RooArgSet is a container object that can hold multiple RooAbsArg objects.
 
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library.
 
void setCudaStream(CudaInterface::CudaStream *cudaStream)
 
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
 
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
 
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
 
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
 
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
 
virtual CudaInterface::CudaStream * newCudaStream() const =0
 
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
 
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
 
void set(RooAbsArg const *arg, std::span< const double > const &span)
 
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
 
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
 
void resize(std::size_t n)
 
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list, we need to set their data tokens too.
 
std::vector< NodeInfo > _nodes
 
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
 
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
 
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
 
RooFit::EvalContext _evalContextCUDA
 
RooFit::EvalContext _evalContextCPU
 
static RooMsgService & instance()
Return reference to singleton instance.
 
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
 
Variable that can be changed from the outside.
 
const char * GetName() const override
Returns name of object.
 
virtual const char * ClassName() const
Returns name of class to which the object belongs.
 
R__EXTERN RooBatchComputeInterface * dispatchCUDA
 
std::string cpuArchitectureName()
 
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
 
Architecture cpuArchitecture()
 
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
 
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or other types of arguments).
 
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
 
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
 
RooBatchCompute::CudaInterface::CudaStream * stream
 
std::size_t lastSetValCount
 
RooBatchCompute::CudaInterface::CudaEvent * event
 
std::vector< NodeInfo * > serverInfos
 
RooAbsArg::OperMode originalOperMode
 
std::vector< NodeInfo * > clientInfos
 
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
 
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer needed.