Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Evaluator.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23 computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49#include <unordered_set>
50
51namespace RooFit {
52
53namespace {
54
55// To avoid deleted move assignment.
56template <class T>
57void assignSpan(std::span<T> &to, std::span<T> const &from)
58{
59 to = from;
60}
61
// Logs (once per useGPU setting) which RooBatchCompute backend/architecture is
// in use, via the RooFit message service at INFO level on the Fitting topic.
// NOTE(review): this listing came from generated documentation and the
// extraction dropped the function signature (original line 62). From the body
// it takes a single boolean (named `useGPU`) and returns void — confirm
// against the repository.
63{
64 // We have to exit early if the message stream is not active. Otherwise it's
65 // possible that this function skips logging because it thinks it has
66 // already logged, but actually it didn't.
67 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
68 return;
69 }
70
71 // Don't repeat logging architecture info if the useGPU option didn't change
72 {
73 // Second element of pair tracks whether this function has already been called
74 static std::pair<bool, bool> lastUseGPU;
75 if (lastUseGPU.second && lastUseGPU.first == useGPU)
76 return;
77 lastUseGPU = {useGPU, true};
78 }
79
80 auto log = [](std::string_view message) {
81 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
82 };
83
// NOTE(review): the opening `if (...) {` of this if/else (original line 84)
// was lost in extraction; it presumably tests whether the loaded CPU compute
// library is the generic non-vectorized one — confirm against the repository.
85 log("using generic CPU library compiled with no vectorizations");
86 } else {
87 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
88 }
89 if (useGPU) {
90 log("using CUDA computation library");
91 }
92}
93
94} // namespace
95
96/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
// NOTE(review): the extraction dropped several member declarations (original
// lines 103, 109, 124-125), the decrementRemainingClients() signature (line
// 129), and trailing lines 136-142 (likely a destructor). The page's member
// listing and the code below reference members `originalOperMode`,
// `copyAfterEvaluation`, and CUDA `event`/`stream` pointers, which presumably
// live on the dropped lines — confirm against the repository.
97/// the computation graph.
98struct NodeInfo {
99
// True if this node produces a single value rather than a batch.
100 bool isScalar() const { return outputSize == 1; }
101
// The RooFit object this bookkeeping entry describes (non-owning).
102 RooAbsArg *absArg = nullptr;
104
// Owning handle to the output buffer for batch results (host or device).
105 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
106 std::size_t iNode = 0;
// Countdown of clients/servers not yet processed in the heterogeneous loop.
107 int remClients = 0;
108 int remServers = 0;
110 bool fromArrayInput = false;
111 bool isVariable = false;
112 bool isDirty = true;
113 bool isCategory = false;
114 bool hasLogged = false;
115 bool computeInGPU = false;
116 bool isValueServer = false; // if this node is a value server to the top node
117 std::size_t outputSize = 1;
// Sentinels: max() means "never seen yet", forcing the first comparison to
// register as a change (see processVariable()/processCategory()).
118 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
119 int lastCatVal = std::numeric_limits<int>::max();
// Storage for single-value outputs so no heap buffer is needed.
120 double scalarBuffer = 0.0;
121 std::vector<NodeInfo *> serverInfos;
122 std::vector<NodeInfo *> clientInfos;
123
126
127 /// Check the servers of a node that has been computed and release its
128 /// resources if they are no longer needed.
// NOTE(review): the method signature (original line 129) was dropped; per the
// page's member listing it is `void decrementRemainingClients()`.
130 {
131 if (--remClients == 0 && !fromArrayInput) {
132 buffer.reset();
133 }
134 }
135
143};
144
145/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph,
146/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
147/// there's also some CUDA-related initialization.
148///
149/// \param[in] absReal The RooAbsReal object that sits on top of the
150/// computation graph that we want to evaluate.
151/// \param[in] useGPU Whether the evaluation should be preferably done on the GPU.
// NOTE(review): extraction dropped the constructor signature (original line
// 152) and several statements (lines 155, 160-168 partially, 222, 227-229).
// The bare-number lines below mark dropped hyperlinked calls — presumably the
// CPU library initialization / architecture logging, and the call collecting
// the sorted computation graph into `serverSet` (used at line 176 but never
// declared in the visible text) — confirm against the repository.
153 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
154{
// Fail early if CUDA was requested but the CUDA backend cannot be loaded.
156 if (useGPU && RooBatchCompute::initCUDA() != 0) {
157 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
158 }
159 // Some checks and logging of used architectures
161
164

167

169 if (useGPU) {
171 }
172
// Temporary lookup from arg to its NodeInfo, used to wire up the
// server/client pointer lists below.
173 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
174
175 // Fill the ordered nodes list and initialize the node info structs.
176 _nodes.reserve(serverSet.size());
177 std::size_t iNode = 0;
178 for (RooAbsArg *arg : serverSet) {
179
180 _nodes.emplace_back();
181 auto &nodeInfo = _nodes.back();
182 _nodesMap[arg->namePtr()] = &nodeInfo;
183
184 nodeInfo.absArg = arg;
185 nodeInfo.originalOperMode = arg->operMode();
186 nodeInfo.iNode = iNode;
187 nodeInfos[arg] = &nodeInfo;
188
// Variables get their data token only when actual array input is set via
// setInput(); all other nodes are tokenized right away.
189 if (dynamic_cast<RooRealVar const *>(arg)) {
190 nodeInfo.isVariable = true;
191 } else {
192 arg->setDataToken(iNode);
193 }
194 if (dynamic_cast<RooAbsCategory const *>(arg)) {
195 nodeInfo.isCategory = true;
196 }
197
198 ++iNode;
199 }
200
// Wire up the bidirectional server/client NodeInfo pointer lists, keeping
// only value-server relationships.
201 for (NodeInfo &info : _nodes) {
202 info.serverInfos.reserve(info.absArg->servers().size());
203 for (RooAbsArg *server : info.absArg->servers()) {
204 if (server->isValueServer(*info.absArg)) {
205 auto *serverInfo = nodeInfos.at(server);
206 info.serverInfos.emplace_back(serverInfo);
207 serverInfo->clientInfos.emplace_back(&info);
208 }
209 }
210 }
211
212 // Figure out which nodes are value servers to the top node
// The nodes list is topologically sorted with the top node last, so a single
// reverse sweep propagates the flag to all transitive value servers.
213 _nodes.back().isValueServer = true; // the top node itself
214 for (auto iter = _nodes.rbegin(); iter != _nodes.rend(); ++iter) {
215 if (!iter->isValueServer)
216 continue;
217 for (auto &serverInfo : iter->serverInfos) {
218 serverInfo->isValueServer = true;
219 }
220 }
221

223
224 if (_useGPU) {
225 // create events and streams for every node
// NOTE(review): the statements creating the per-node CUDA event/stream and
// the local `cfg` object (original lines 227-229) were dropped here.
226 for (auto &info : _nodes) {
230 cfg.setCudaStream(info.stream);
231 _evalContextCUDA.setConfig(info.absArg, cfg);
232 }
233 }
234}
235
236/// If there are servers with the same name that got de-duplicated in the
237/// `_nodes` list, we need to set their data tokens too. We find such nodes by
238/// visiting the servers of every known node.
// NOTE(review): the function signature (original line 239) was dropped by the
// extraction; per the page's member listing it is `void
// Evaluator::syncDataTokens()`.
240{
241 for (NodeInfo &info : _nodes) {
// Walk servers in the same order used when serverInfos was built, so the
// running index lines up with the de-duplicated entries.
242 std::size_t iValueServer = 0;
243 for (RooAbsArg *server : info.absArg->servers()) {
244 if (server->isValueServer(*info.absArg)) {
245 auto *knownServer = info.serverInfos[iValueServer]->absArg;
246 if (knownServer->hasDataToken()) {
247 server->setDataToken(knownServer->dataToken());
248 }
249 ++iValueServer;
250 }
251 }
252 }
253}
254
// Registers an external data array as the output of the graph node called
// `name`. In CUDA mode the data is mirrored on both host and device.
// NOTE(review): extraction dropped original lines 269, 278, 285-286, 294 and
// 300 — presumably bookkeeping (e.g. flagging output sizes for update) and the
// `_evalContextCPU`/`_evalContextCUDA` span registrations for the non-GPU and
// scalar cases — confirm against the repository.
255void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
256{
257 if (isOnDevice && !_useGPU) {
258 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
259 }
260
261 // Check if "name" is used in the computation graph. If yes, add the span to
262 // the data map and set the node info accordingly.
263
264 auto found = _nodesMap.find(RooNameReg::ptr(name.c_str()));
265
// Silently ignore inputs that don't correspond to any graph node.
266 if (found == _nodesMap.end())
267 return;
268
270
271 NodeInfo &info = *found->second;
272
273 info.fromArrayInput = true;
274 info.absArg->setDataToken(info.iNode);
275 info.outputSize = inputArray.size();
276
277 if (!_useGPU) {
279 return;
280 }
281
282 if (info.outputSize <= 1) {
283 // Empty or scalar observables from the data don't need to be
284 // copied to the GPU.
287 return;
288 }
289
290 // For simplicity, we put the data on both host and device for
291 // now. This could be optimized by inspecting the clients of the
292 // variable.
293 if (isOnDevice) {
// Data arrived on the device: make a host mirror.
295 auto gpuSpan = _evalContextCUDA.at(info.absArg);
296 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
297 info.buffer->assignFromDevice(gpuSpan);
298 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
299 } else {
// Data arrived on the host: make a device mirror.
301 auto cpuSpan = _evalContextCPU.at(info.absArg);
302 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
303 info.buffer->assignFromHost(cpuSpan);
304 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
305 }
306}
307
// Recomputes the per-node output sizes after the array inputs changed, resets
// stale temporary buffers, marks all nodes dirty, and (in CUDA mode) redoes
// the GPU/CPU node assignment.
// NOTE(review): the extraction dropped the function signature (original line
// 308; presumably `void Evaluator::updateOutputSizes()`) and line 335
// (presumably clearing the "sizes need updating" flag) — confirm against the
// repository.
309{
// Seed the size determination with the known sizes of the array inputs only.
310 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
311 for (auto &info : _nodes) {
312 if (info.fromArrayInput) {
313 sizeMap[info.absArg] = info.outputSize;
314 } else {
315 // any buffer for temporary results is invalidated by resetting the output sizes
316 info.buffer.reset();
317 }
318 }
319
// -1 signals "size unknown" to the helper for nodes without array input.
320 auto outputSizeMap =
321 RooFit::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
322 auto found = sizeMap.find(key);
323 return found != sizeMap.end() ? found->second : -1;
324 });
325
326 for (auto &info : _nodes) {
327 info.outputSize = outputSizeMap.at(info.absArg);
328 info.isDirty = true;
329 }
330
331 if (_useGPU) {
332 markGPUNodes();
333 }
334
336}
337
// Destructor body: releases the data tokens of all non-variable nodes so the
// RooAbsArgs are left in their pre-Evaluator state (variables only get a
// token when array input is set, hence the exclusion).
// NOTE(review): the signature line (original line 338, presumably
// `Evaluator::~Evaluator()`) was dropped by the extraction.
339{
340 for (auto &info : _nodes) {
341 if (!info.isVariable) {
342 info.absArg->resetDataToken();
343 }
344 }
345}
346
// Evaluates one node on the CPU, writing its result either into the inline
// scalar buffer (size 1) or into a host buffer, and registers the result span
// with the evaluation context(s).
// NOTE(review): the extraction dropped the signature (original line 347; per
// the page's member listing `void Evaluator::computeCPUNode(const RooAbsArg
// *node, NodeInfo &info)`) plus lines 374, 377, 388 (presumably the actual
// `doEval` call on the CPU context), 390-391 and 395 (presumably the CUDA
// event record) — confirm against the repository.
348{
349 using namespace Detail;
350
351 const std::size_t nOut = info.outputSize;
352
353 double *buffer = nullptr;
354 if (nOut == 1) {
// Scalar result: no heap buffer needed; mirror the pointer on the CUDA
// context so GPU clients can read it.
355 buffer = &info.scalarBuffer;
356 if (_useGPU) {
357 _evalContextCUDA.set(node, {buffer, nOut});
358 }
359 } else {
// Emit a one-time hint that this class forces a CPU round-trip in GPU mode.
360 if (!info.hasLogged && _useGPU) {
361 RooAbsArg const &arg = *info.absArg;
362 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
363 << " could not be evaluated on the GPU because the class doesn't support it. "
364 "Consider requesting or implementing it to benefit from a speed up."
365 << std::endl;
366 info.hasLogged = true;
367 }
// Pinned host memory if the result must later be copied to the device.
368 if (!info.buffer) {
369 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
370 : _bufferManager->makeCpuBuffer(nOut);
371 }
372 buffer = info.buffer->hostWritePtr();
373 }
375 _evalContextCPU.set(node, {buffer, nOut});
376 if (nOut > 1) {
378 }
379 if (info.isCategory) {
380 auto nodeAbsCategory = static_cast<RooAbsCategory const *>(node);
381 if (nOut == 1) {
// Category nodes are stored as their (integer) current index.
382 buffer[0] = nodeAbsCategory->getCurrentIndex();
383 } else {
384 throw std::runtime_error("RooFit::Evaluator - non-scalar category values are not supported!");
385 }
386 } else {
387 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
389 }
// Publish the device-side view of the result for GPU clients.
392 if (info.copyAfterEvaluation) {
393 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
394 if (info.event) {
396 }
397 }
398}
399
400/// Process a variable in the computation graph. This is a separate non-inlined
401/// function such that we can see in performance profiles how long this takes.
// NOTE(review): extraction dropped the signature (original line 402; per the
// member listing `void Evaluator::processVariable(NodeInfo &nodeInfo)`) and
// line 411 (presumably the re-evaluation of the variable's output, e.g. a
// computeCPUNode call) — confirm against the repository.
403{
404 RooAbsArg *node = nodeInfo.absArg;
405 auto *var = static_cast<RooRealVar const *>(node);
// Only react when the variable's value was actually set since last time,
// detected via its monotonically increasing reset counter.
406 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
407 nodeInfo.lastSetValCount = var->valueResetCounter();
408 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
409 clientInfo->isDirty = true;
410 }
412 nodeInfo.isDirty = false;
413 }
414}
415
416/// Process a category in the computation graph. This is a separate non-inlined
417/// function such that we can see in performance profiles how long this takes.
// NOTE(review): extraction dropped the signature (original line 418; per the
// member listing `void Evaluator::processCategory(NodeInfo &nodeInfo)`) and
// line 427 (presumably the re-evaluation of the category's output) — confirm
// against the repository.
419{
420 RooAbsArg *node = nodeInfo.absArg;
421 auto *cat = static_cast<RooAbsCategory const *>(node);
// Only react when the category index actually changed since the last run.
422 if (nodeInfo.lastCatVal != cat->getCurrentIndex()) {
423 nodeInfo.lastCatVal = cat->getCurrentIndex();
424 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
425 clientInfo->isDirty = true;
426 }
428 nodeInfo.isDirty = false;
429 }
430}
431
432/// Flags all the clients of a given node dirty. This is a separate non-inlined
433/// function such that we can see in performance profiles how long this takes.
435{
436 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
437 clientInfo->isDirty = true;
438 }
439}
440
441/// Returns the value of the top node in the computation graph
// NOTE(review): extraction dropped several statements: original lines 444-445
// and 447 (presumably output-size update / bookkeeping on entry), 456 and 458
// (presumably the processVariable/processCategory calls matching the
// branches), and 461-462 (presumably the node's re-evaluation, e.g. a
// computeCPUNode call) — confirm against the repository.
442std::span<const double> Evaluator::run()
443{
446
448
// CUDA mode delegates to the heterogeneous CPU/GPU scheduler.
449 if (_useGPU) {
450 return getValHeterogeneous();
451 }
452
// CPU-only path: walk the topologically sorted node list and recompute only
// what changed (variables/categories propagate dirtiness to their clients).
453 for (auto &nodeInfo : _nodes) {
454 if (!nodeInfo.fromArrayInput) {
455 if (nodeInfo.isVariable) {
457 } else if (nodeInfo.isCategory) {
459 } else {
460 if (nodeInfo.isDirty) {
463 nodeInfo.isDirty = false;
464 }
465 }
466 }
467 }
468
469 // return the final output
470 return _evalContextCPU.at(&_topNode);
471}
472
473/// Returns the value of the top node in the computation graph
// Heterogeneous scheduler: repeatedly launches ready GPU nodes asynchronously
// and computes ready CPU nodes in between, tracking readiness via remServers
// (0 = ready, -1 = running on GPU, -2 = done).
// NOTE(review): extraction dropped original lines 487, 501 and 535 (presumably
// `assignToGPU(...)` calls at the spots where a GPU node becomes ready) and
// line 544 (presumably `return _evalContextCPU.at(&_topNode);`, mirroring
// run()) — confirm against the repository.
474std::span<const double> Evaluator::getValHeterogeneous()
475{
// Reset the per-run counters and drop stale temporary buffers.
476 for (auto &info : _nodes) {
477 info.remClients = info.clientInfos.size();
478 info.remServers = info.serverInfos.size();
479 if (info.buffer && !info.fromArrayInput) {
480 info.buffer.reset();
481 }
482 }
483
484 // find initial GPU nodes and assign them to GPU
485 for (auto &info : _nodes) {
486 if (info.remServers == 0 && info.computeInGPU) {
488 }
489 }
490
// Loop until the top node itself has been fully computed (-2 = done).
491 NodeInfo const &topNodeInfo = _nodes.back();
492 while (topNodeInfo.remServers != -2) {
493 // find finished GPU nodes
494 for (auto &info : _nodes) {
495 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
496 info.remServers = -2;
497 // Decrement number of remaining servers for clients and start GPU computations
498 for (auto *infoClient : info.clientInfos) {
499 --infoClient->remServers;
500 if (infoClient->computeInGPU && infoClient->remServers == 0) {
502 }
503 }
504 for (auto *serverInfo : info.serverInfos) {
505 serverInfo->decrementRemainingClients();
506 }
507 }
508 }
509
510 // find next CPU node
511 auto it = _nodes.begin();
512 for (; it != _nodes.end(); it++) {
513 if (it->remServers == 0 && !it->computeInGPU)
514 break;
515 }
516
517 // if no CPU node available sleep for a while to save CPU usage
518 if (it == _nodes.end()) {
519 std::this_thread::sleep_for(std::chrono::milliseconds(1));
520 continue;
521 }
522
523 // compute next CPU node
524 NodeInfo &info = *it;
525 RooAbsArg const *node = info.absArg;
526 info.remServers = -2; // so that it doesn't get picked again
527
// Array-input nodes already hold their data; nothing to compute.
528 if (!info.fromArrayInput) {
529 computeCPUNode(node, info);
530 }
531
532 // Assign the clients that are computed on the GPU
533 for (auto *infoClient : info.clientInfos) {
534 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
536 }
537 }
// Release server buffers that no longer have pending clients.
538 for (auto *serverInfo : info.serverInfos) {
539 serverInfo->decrementRemainingClients();
540 }
541 }
542
543 // return the final value
545}
546
547/// Assign a node to be computed in the GPU. Scan its clients and also assign them
548/// in case they only depend on GPU nodes.
// NOTE(review): extraction dropped the signature (original line 549; per the
// member listing `void Evaluator::assignToGPU(NodeInfo &info)`), line 560
// (presumably the stream-waits-for-event call matching the `if` above it),
// and lines 574/577 (presumably stream config / event record around the
// launch) — confirm against the repository.
550{
551 using namespace Detail;
552
// -1 marks the node as "currently running on the GPU" for the scheduler.
553 info.remServers = -1;
554
555 auto node = static_cast<RooAbsReal const *>(info.absArg);
556
557 // wait for every server to finish
558 for (auto *infoServer : info.serverInfos) {
559 if (infoServer->event)
561 }
562
563 const std::size_t nOut = info.outputSize;
564
565 double *buffer = nullptr;
566 if (nOut == 1) {
// Scalar results live in the inline buffer; also visible to CPU clients.
567 buffer = &info.scalarBuffer;
568 _evalContextCPU.set(node, {buffer, nOut});
569 } else {
// Pinned memory if the result must be copied back to the host afterwards.
570 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
571 : _bufferManager->makeGpuBuffer(nOut);
572 buffer = info.buffer->deviceWritePtr();
573 }
575 _evalContextCUDA.set(node, {buffer, nOut});
576 node->doEval(_evalContextCUDA);
// Publish the host-side view for CPU clients of this GPU node.
578 if (info.copyAfterEvaluation) {
579 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
580 }
581}
582
583/// Decides which nodes are assigned to the GPU in a CUDA fit.
585{
586 // Decide which nodes get evaluated on the GPU: we select nodes that support
587 // CUDA evaluation and have at least one input of size greater than one.
588 for (auto &info : _nodes) {
589 info.computeInGPU = false;
590 if (!info.absArg->canComputeBatchWithCuda()) {
591 continue;
592 }
593 for (NodeInfo const *serverInfo : info.serverInfos) {
594 if (serverInfo->outputSize > 1) {
595 info.computeInGPU = true;
596 break;
597 }
598 }
599 }
600
601 // In a second pass, figure out which nodes need to copy over their results.
602 for (auto &info : _nodes) {
603 info.copyAfterEvaluation = false;
604 // scalar nodes don't need copying
605 if (!info.isScalar()) {
606 for (auto *clientInfo : info.clientInfos) {
607 if (info.computeInGPU != clientInfo->computeInGPU) {
608 info.copyAfterEvaluation = true;
609 break;
610 }
611 }
612 }
613 }
614}
615
616/// Temporarily change the operation mode of a RooAbsArg until the
617/// Evaluator gets deleted.
619{
620 if (!_operModeChanges)
621 _operModeChanges = std::make_unique<ChangeOperModeRAII>();
622 _operModeChanges->change(arg, opMode);
623}
624
625// Change the operation modes of all RooAbsArgs in the computation graph.
626// The changes are reset when the returned RAII object goes out of scope.
627//
628// We also walk transitively through value clients of the nodes to cover any
629// node that RooAbsReal::doEval (the fallback scalar implementation) might
630// inadvertently propagate the ADirty mode to via its recursive restore: that
631// helper sets servers temporarily to AClean and then calls
632// setOperMode(oldOperMode) to restore, which recurses to value clients when
633// oldOperMode is ADirty. If we did not protect those clients here, any node
634// outside the computation graph that shares a fundamental (e.g. a parameter
635// like a RooRealVar) would be left permanently in ADirty after the first
636// minimization, dramatically slowing down later scalar evaluations (for
637// example on pdfs held by the legacy test statistics' internal cache).
638std::unique_ptr<ChangeOperModeRAII> Evaluator::setOperModes(RooAbsArg::OperMode opMode)
639{
640 auto out = std::make_unique<ChangeOperModeRAII>();
641 std::unordered_set<RooAbsArg *> visited;
642
643 std::vector<RooAbsArg *> queue;
644 queue.reserve(_nodes.size());
645 for (auto &info : _nodes) {
646 queue.push_back(info.absArg);
647 }
648
649 while (!queue.empty()) {
650 RooAbsArg *node = queue.back();
651 queue.pop_back();
652 if (!visited.insert(node).second)
653 continue;
654
655 out->change(node, opMode);
656
657 // Only follow value-client links: that is exactly the propagation path
658 // used by RooAbsArg::setOperMode with mode==ADirty.
659 if (opMode == RooAbsArg::ADirty) {
660 for (auto *client : node->valueClients()) {
661 queue.push_back(client);
662 }
663 }
664 }
665 return out;
666}
667
668void Evaluator::print(std::ostream &os)
669{
670 std::cout << "--- RooFit BatchMode evaluation ---\n";
671
672 std::vector<int> widths{9, 37, 20, 9, 10, 20};
673
674 auto printElement = [&](int iCol, auto const &t) {
675 const char separator = ' ';
676 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
677 os << "|";
678 };
679
680 auto printHorizontalRow = [&]() {
681 int n = 0;
682 for (int w : widths) {
683 n += w + 2;
684 }
685 for (int i = 0; i < n; i++) {
686 os << '-';
687 }
688 os << "|\n";
689 };
690
692
693 os << "|";
694 printElement(0, "Index");
695 printElement(1, "Name");
696 printElement(2, "Class");
697 printElement(3, "Size");
698 printElement(4, "From Data");
699 printElement(5, "1st value");
700 std::cout << "\n";
701
703
704 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
705 auto &nodeInfo = _nodes[iNode];
706 RooAbsArg *node = nodeInfo.absArg;
707
708 auto span = _evalContextCPU.at(node);
709
710 os << "|";
711 printElement(0, iNode);
712 printElement(1, node->GetName());
713 printElement(2, node->ClassName());
714 printElement(3, nodeInfo.outputSize);
715 printElement(4, nodeInfo.fromArrayInput);
716 printElement(5, span[0]);
717
718 std::cout << "\n";
719 }
720
722}
723
724/// Gets all the parameters of the RooAbsReal. This is in principle not
725/// necessary, because we can always ask the RooAbsReal itself, but the
726/// Evaluator has the cached information to get the answer quicker.
727/// Therefore, this is not meant to be used in general, just where it matters.
728/// \warning If we find another solution to get the parameters efficiently,
729/// this function might be removed without notice.
731{
732 RooArgSet parameters;
733 for (auto &nodeInfo : _nodes) {
734 if (nodeInfo.isValueServer && nodeInfo.absArg->isFundamental()) {
735 parameters.add(*nodeInfo.absArg);
736 }
737 }
738 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
739 parameters.sort();
740 return parameters;
741}
742
743/// \brief Sets the offset mode for evaluation.
744///
745/// This function sets the offset mode for evaluation to the specified mode.
746/// It updates the offset mode for both CPU and CUDA evaluation contexts.
747///
748/// \param mode The offset mode to be set.
749///
750/// \note This function marks reducer nodes as dirty if the offset mode is
751/// changed, because only reducer nodes can use offsetting.
// NOTE(review): the extraction dropped the signature (original line 752; per
// the member listing `void Evaluator::setOffsetMode(RooFit::EvalContext::
// OffsetMode)`), the early-return condition at line 754 (presumably a
// no-change check against the current mode), and lines 757-758 (presumably
// the assignments updating the CPU and CUDA contexts) — confirm against the
// repository.
753{
755 return;
756
759
// Only reducer nodes can apply an offset, so only they need re-evaluation.
760 for (auto &nodeInfo : _nodes) {
761 if (nodeInfo.absArg->isReducerNode()) {
762 nodeInfo.isDirty = true;
763 }
764 }
765}
766
767} // namespace RooFit
#define oocoutI(o, a)
#define oocxcoutI(o, a)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
char name[80]
Definition TGX11.cxx:145
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
Definition RooAbsArg.h:76
const RefCountList_t & valueClients() const
List of all value clients of this object. Value clients receive value updates.
Definition RooAbsArg.h:139
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
Definition RooAbsReal.h:63
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:24
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
Definition EvalContext.h:91
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void enableVectorBuffers(bool enable)
OffsetMode _offsetMode
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
std::span< double > _currentOutput
void resize(std::size_t n)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
std::unique_ptr< ChangeOperModeRAII > setOperModes(RooAbsArg::OperMode opMode)
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
const bool _useGPU
Definition Evaluator.h:63
std::unordered_map< TNamed const *, NodeInfo * > _nodesMap
Definition Evaluator.h:69
std::unique_ptr< ChangeOperModeRAII > _operModeChanges
Definition Evaluator.h:70
std::vector< NodeInfo > _nodes
Definition Evaluator.h:68
bool _needToUpdateOutputSizes
Definition Evaluator.h:65
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void processCategory(NodeInfo &nodeInfo)
Process a category in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
Definition Evaluator.h:61
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
Definition Evaluator.h:67
RooFit::EvalContext _evalContextCPU
Definition Evaluator.h:66
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
RooAbsReal & _topNode
Definition Evaluator.h:62
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
Definition RooRealVar.h:37
const char * GetName() const override
Returns name of object.
Definition TNamed.h:49
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:224
RVec< PromoteType< T > > log(const RVec< T > &v)
Definition RVec.hxx:1836
const Int_t n
Definition legend1.C:16
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition CodegenImpl.h:72
@ FastEvaluations
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
Definition Evaluator.cxx:98
RooBatchCompute::CudaInterface::CudaStream * stream
RooAbsArg * absArg
bool isScalar() const
std::size_t iNode
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::size_t outputSize
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...