doc/v626/RooFitDriver_8cxx_source.html

/*

 * Project: RooFit

 * Authors:

 *   Jonas Rembser, CERN 2021

 *   Emmanouil Michalainas, CERN 2021

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


/**

\file RooFitDriver.cxx

\class RooFitDriver

\ingroup Roofitcore


This class can evaluate a RooAbsReal object in other ways than recursive graph

traversal. Currently, it is being used for evaluating a RooAbsReal object and

supplying the value to the minimizer, during a fit. The class scans the

dependencies and schedules the computations in a secure and efficient way. The

computations take place in the RooBatchCompute library and can be carried out

by either the CPU or a CUDA-supporting GPU. The RooFitDriver class takes care

of data transfers. An instance of this class is created every time

RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.

**/


#include <RooFitDriver.h>


#include <RooAbsCategory.h>

#include <RooAbsData.h>

#include <RooAbsReal.h>

#include <RooRealVar.h>

#include <RooArgList.h>

#include <RooBatchCompute.h>

#include <RooMsgService.h>

#include <RooBatchCompute/Initialisation.h>

#include <RooFit/BatchModeDataHelpers.h>

#include <RooFit/BatchModeHelpers.h>

#include <RooFit/CUDAHelpers.h>


#include "NormalizationHelpers.h"


#include <iomanip>

#include <numeric>

#include <thread>


namespace ROOT {

namespace Experimental {


/// A struct used by the RooFitDriver to store information on the RooAbsArgs in
/// the computation graph.
///
/// Each NodeInfo owns its output `buffer` as well as the CUDA events and the
/// CUDA stream that the driver creates for it, and releases all of them when
/// it is destroyed.
struct NodeInfo {
   /// Register that one more client of this node has finished its
   /// computation, and release the node's output buffer once no client is
   /// left that still needs it.
   void decrementRemainingClients()
   {
      if (--remClients == 0) {
         delete buffer;
         buffer = nullptr;
      }
   }

   /// The node of the computation graph described by this record.
   RooAbsArg *absArg = nullptr;

   /// Output buffer of the node for non-scalar outputs (owned, may be null).
   Detail::AbsBuffer *buffer = nullptr;

   /// CUDA event recorded after the node's output was produced (owned, may be null).
   cudaEvent_t *event = nullptr;
   /// CUDA event recorded before the node is computed on the GPU in the timing pass (owned, may be null).
   cudaEvent_t *eventStart = nullptr;
   /// CUDA stream used for this node (owned, may be null).
   cudaStream_t *stream = nullptr;
   /// CPU computation time, measured by computeCPUNode() in the first evaluation.
   std::chrono::microseconds cpuTime{0};
   /// GPU computation time; stays at the maximum value until it is measured.
   std::chrono::microseconds cudaTime{std::chrono::microseconds::max()};
   /// Launch time of the node in simulateFit(); negative means "not launched yet".
   std::chrono::microseconds timeLaunched{-1};
   /// Number of clients that still have to be evaluated in the current heterogeneous evaluation.
   int remClients = 0;
   /// Number of servers still to be evaluated; -1 marks a node launched on
   /// the GPU and -2 a finished node.
   int remServers = 0;
   bool isScalar = false;            ///< The output has size 1.
   bool computeInGPU = false;        ///< The node is evaluated on the GPU.
   bool copyAfterEvaluation = false; ///< A client runs on the other device, so the output must be visible on both.
   bool fromDataset = false;         ///< The values are taken from the dataset instead of being computed.
   bool isVariable = false;          ///< The node is a RooRealVar.
   bool isDirty = true;              ///< The node has to be re-evaluated in CPU mode.
   bool isCategory = false;          ///< The node is a RooAbsCategory.
   std::size_t outputSize = 1;
   /// Value reset counter of the variable at the time of its last evaluation.
   std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
   /// Data token of the RooAbsArg before the driver replaced it; restored by ~RooFitDriver().
   std::size_t originalDataToken = 0;
   /// Storage for the output of scalar nodes. Zero-initialized so that no
   /// indeterminate value can be read before the first evaluation.
   double scalarBuffer = 0.0;
   std::vector<NodeInfo *> serverInfos;
   std::vector<NodeInfo *> clientInfos;

   ~NodeInfo()
   {
      // The buffer is owned by this struct but is only released early by
      // decrementRemainingClients() or when the data is replaced. Whatever is
      // still held at this point would otherwise be leaked.
      // NOTE(review): this relies on the nodes being destroyed before the
      // buffer manager that created the buffers - confirm the member order in
      // RooFitDriver.h.
      delete buffer;
      buffer = nullptr;

      if (event)
         RooBatchCompute::dispatchCUDA->deleteCudaEvent(event);
      if (eventStart)
         RooBatchCompute::dispatchCUDA->deleteCudaEvent(eventStart);
      if (stream)
         RooBatchCompute::dispatchCUDA->deleteCudaStream(stream);
   }
};


/// Build a RooFitDriver for the computation graph below `absReal`. The
/// constructor collects the nodes of the graph in topological order, creates
/// one NodeInfo per node, links every node to its value servers and clients and
/// gives every node a data token that equals its position in the node list.
/// In CUDA mode, the CUDA events and the stream of every node are created too.
///
/// \param[in] absReal The RooAbsReal object that sits on top of the
///            computation graph that we want to evaluate.
/// \param[in] normSet Normalization set for the evaluation
/// \param[in] batchMode The computation mode, accepted values are
///            `RooBatchCompute::Cpu` and `RooBatchCompute::Cuda`.
RooFitDriver::RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode)
   : _batchMode{batchMode}
{
   _integralUnfolder = std::make_unique<RooFit::NormalizationIntegralUnfolder>(absReal, normSet);

   // Set up RooBatchCompute and report which architecture is going to be used.
   RooBatchCompute::init();
   RooFit::BatchModeHelpers::logArchitectureInfo(_batchMode);

   // Collect all nodes of the graph. They are gathered in a RooArgList first,
   // which avoids the deduplication that a RooArgSet does on every insertion.
   RooArgList allNodes;
   topNode().treeNodeServerList(&allNodes, nullptr, true, true, false, true);

   // Adding the nodes in reverse order gives an approximately topological
   // order already, which leaves less work for sortTopologically(). After the
   // sort, the servers of any node come before the node itself.
   RooArgSet orderedNodes;
   orderedNodes.add(allNodes.rbegin(), allNodes.rend(), /*silent=*/true);
   orderedNodes.sortTopologically();

   const std::size_t nNodes = orderedNodes.size();
   _dataMapCPU.resize(nNodes);
   _dataMapCUDA.resize(nNodes);
   _nodes.resize(nNodes);

   // First pass: create the node infos and hand out the data tokens.
   std::unordered_map<TNamed const *, std::size_t> tokenByName;
   std::map<RooFit::Detail::DataKey, NodeInfo *> infoByKey;
   std::size_t token = 0;
   for (RooAbsArg *arg : orderedNodes) {
      NodeInfo &entry = _nodes[token];
      entry.absArg = arg;
      entry.isVariable = dynamic_cast<RooRealVar const *>(arg) != nullptr;
      entry.isCategory = dynamic_cast<RooAbsCategory const *>(arg) != nullptr;

      tokenByName[arg->namePtr()] = token;
      infoByKey[arg] = &entry;

      // Remember the previous token so that the destructor can restore it.
      entry.originalDataToken = arg->dataToken();
      arg->setDataToken(token);

      ++token;
   }

   // Second pass: connect each node with its value servers, and make sure that
   // every server object carries the token registered for its name.
   for (NodeInfo &client : _nodes) {
      auto const &servers = client.absArg->servers();
      client.serverInfos.reserve(servers.size());
      for (RooAbsArg *server : servers) {
         if (server->isValueServer(*client.absArg)) {
            NodeInfo *serverEntry = infoByKey.at(server);
            client.serverInfos.emplace_back(serverEntry);
            serverEntry->clientInfos.emplace_back(&client);
         }
         server->setDataToken(tokenByName.at(server->namePtr()));
      }
   }

   if (_batchMode != RooFit::BatchModeOption::Cuda) {
      return;
   }

   // CUDA mode: every node gets two events and a stream of its own.
   for (NodeInfo &entry : _nodes) {
      entry.event = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
      entry.eventStart = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
      entry.stream = RooBatchCompute::dispatchCUDA->newCudaStream();
   }
}


/// Extract the data spans from a RooAbsData and pass them on to
/// setData(DataSpansMap const &). Buffers that back the spans are kept in
/// `_vectorBuffers`, which is emptied first.
///
/// \param[in] data The dataset to take the values from.
/// \param[in] rangeName Forwarded to RooFit::BatchModeDataHelpers::getDataSpans().
/// \param[in] indexCatForSplitting Forwarded to getDataSpans().
/// \param[in] skipZeroWeights Forwarded to getDataSpans().
/// \param[in] takeGlobalObservablesFromData If true and the dataset has global
///            observables, their current values are added as spans of size one.
void RooFitDriver::setData(RooAbsData const &data, std::string_view rangeName,
                           RooAbsCategory const *indexCatForSplitting, bool skipZeroWeights,
                           bool takeGlobalObservablesFromData)
{
   // Start from an empty stack of buffers.
   std::stack<std::vector<double>>{}.swap(_vectorBuffers);

   DataSpansMap spans = RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting,
                                                                   _vectorBuffers, skipZeroWeights);

   // The dataset is only asked for its global observables if they are wanted.
   auto const *globalObservables = takeGlobalObservablesFromData ? data.getGlobalObservables() : nullptr;
   if (globalObservables) {
      _vectorBuffers.emplace();
      std::vector<double> &values = _vectorBuffers.top();
      // All the capacity is reserved up front, so the element addresses taken
      // below are not invalidated by the following push_back() calls.
      values.reserve(globalObservables->size());
      for (auto *var : static_range_cast<RooRealVar const *>(*globalObservables)) {
         values.push_back(var->getVal());
         spans[var] = RooSpan<const double>{&values.back(), 1};
      }
   }

   setData(spans);
}


/// Attach data spans to the nodes of the computation graph. Nodes for which a
/// span is given are marked as coming from the dataset, all other nodes are
/// marked dirty so that they get evaluated. Afterwards the output sizes of all
/// nodes are determined, and in CUDA mode the dataset is copied to the GPU.
///
/// \param[in] dataSpans Map from the nodes to the spans with their values.
void RooFitDriver::setData(DataSpansMap const &dataSpans)
{
   // Iterate over the nodes and check if a data span was given for them. If
   // yes, add the span to the data map and set the node info accordingly.
   std::size_t totalSize = 0;
   for (auto &info : _nodes) {
      // Buffers from evaluations with the previous data are not valid anymore.
      if (info.buffer) {
         delete info.buffer;
         info.buffer = nullptr;
      }
      auto found = dataSpans.find(info.absArg->namePtr());
      if (found != dataSpans.end()) {
         _dataMapCPU.at(info.absArg) = found->second;
         info.outputSize = found->second.size();
         info.fromDataset = true;
         info.isDirty = false;
         totalSize += info.outputSize;
      } else {
         info.outputSize = 1;
         info.fromDataset = false;
         info.isDirty = true;
      }
   }

   determineOutputSizes();

   for (auto &info : _nodes) {
      // If the node has an output of size 1
      info.isScalar = info.outputSize == 1;

      // In principle we don't need dirty flag propagation because the driver
      // takes care of deciding which node needs to be re-evaluated. However,
      // disabling it also for scalar mode results in very long fitting times
      // for specific models (test 14 in stressRooFit), which still needs to be
      // understood. TODO.
      if (!info.isScalar) {
         setOperMode(info.absArg, RooAbsArg::ADirty);
      }
   }

   // Extra steps for initializing in cuda mode
   if (_batchMode != RooFit::BatchModeOption::Cuda)
      return;

   // Release the device copy of the previously set dataset. Without this,
   // every further call of setData() would leak the old allocation.
   // NOTE(review): assumes that _cudaMemDataset is initialized with nullptr in
   // the class definition, as the unconditional cudaFree() in the destructor
   // already requires - confirm in RooFitDriver.h.
   if (_cudaMemDataset) {
      RooBatchCompute::dispatchCUDA->cudaFree(_cudaMemDataset);
      _cudaMemDataset = nullptr;
   }

   // copy observable data to the GPU
   // TODO: use separate buffers here
   _cudaMemDataset = static_cast<double *>(RooBatchCompute::dispatchCUDA->cudaMalloc(totalSize * sizeof(double)));
   size_t idx = 0;
   for (auto &info : _nodes) {
      if (!info.fromDataset)
         continue;
      std::size_t size = info.outputSize;
      // All dataset spans are stored one after another in a single allocation.
      _dataMapCUDA.at(info.absArg) = RooSpan<double>(_cudaMemDataset + idx, size);
      RooBatchCompute::dispatchCUDA->memcpyToCUDA(_cudaMemDataset + idx, _dataMapCPU.at(info.absArg).data(),
                                                  size * sizeof(double));
      idx += size;
   }
}


/// Give every node its original data token back and, in CUDA mode, free the
/// device memory that holds the copy of the dataset.
RooFitDriver::~RooFitDriver()
{
   for (NodeInfo &node : _nodes) {
      node.absArg->setDataToken(node.originalDataToken);
   }

   if (_batchMode != RooFit::BatchModeOption::Cuda) {
      return;
   }

   RooBatchCompute::dispatchCUDA->cudaFree(_cudaMemDataset);
}


std::vector<double> RooFitDriver::getValues()

{

   getVal();

   NodeInfo const &nodeInfo = _nodes.back();

   if (nodeInfo.computeInGPU) {

      std::size_t nOut = nodeInfo.outputSize;

      std::vector<double> out(nOut);

      RooBatchCompute::dispatchCUDA->memcpyToCPU(out.data(), _dataMapCPU.at(&topNode()).data(), nOut * sizeof(double));

      _dataMapCPU.at(&topNode()) = RooSpan<const double>(out.data(), nOut);

      return out;

   }

   // We copy the data to the output vector

   auto dataSpan = _dataMapCPU.at(&topNode());

   std::vector<double> out;

   out.reserve(dataSpan.size());

   for (auto const &x : dataSpan) {

      out.push_back(x);

   }

   return out;

}


/// Evaluate a node on the CPU and register its output in the data map(s).
/// Scalar outputs go to the scalar buffer of the node info, other outputs to a
/// buffer obtained from the buffer manager. For non-scalar outputs, the
/// computation is timed in the first getVal() invocation.
///
/// \param[in] node The node to evaluate.
/// \param[in] info The node info that belongs to `node`.
void RooFitDriver::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
{
   using namespace Detail;

   auto const *real = static_cast<RooAbsReal const *>(node);
   const std::size_t nOut = info.outputSize;

   // Scalar case: the result lives in the node info itself.
   if (nOut == 1) {
      double *scalarOut = &info.scalarBuffer;
      _dataMapCPU.at(node) = RooSpan<const double>(scalarOut, nOut);
      if (_batchMode == RooFit::BatchModeOption::Cuda) {
         _dataMapCUDA.at(node) = RooSpan<const double>(scalarOut, nOut);
      }
      real->computeBatch(nullptr, scalarOut, nOut, _dataMapCPU);
      return;
   }

   // Vector case: get a buffer if the node has none. A pinned buffer is used
   // if the result has to be copied after the evaluation.
   if (info.buffer == nullptr) {
      if (info.copyAfterEvaluation) {
         info.buffer = _bufferManager.makePinnedBuffer(nOut, info.stream);
      } else {
         info.buffer = _bufferManager.makeCpuBuffer(nOut);
      }
   }
   double *out = info.buffer->cpuWritePtr();
   _dataMapCPU.at(node) = RooSpan<const double>(out, nOut);

   // The computing time is only measured in the first invocation.
   const bool measureTime = _getValInvocations == 1;
   std::chrono::steady_clock::time_point begin;
   if (measureTime) {
      begin = std::chrono::steady_clock::now();
   }
   real->computeBatch(nullptr, out, nOut, _dataMapCPU);
   if (measureTime) {
      info.cpuTime = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - begin);
   }

   if (!info.copyAfterEvaluation) {
      return;
   }

   // Publish the GPU-side pointer of the buffer and record the event of the
   // node (if it has one) on its stream.
   _dataMapCUDA.at(node) = RooSpan<const double>(info.buffer->gpuReadPtr(), nOut);
   if (info.event) {
      RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
   }
}


/// Returns the value of the top node in the computation graph

double RooFitDriver::getVal()

{

   ++_getValInvocations;


   if (_batchMode == RooFit::BatchModeOption::Cuda) {

      return getValHeterogeneous();

   }


   for (auto &nodeInfo : _nodes) {

      RooAbsArg *node = nodeInfo.absArg;

      if (!nodeInfo.fromDataset) {

         if (nodeInfo.isVariable) {

            auto *var = static_cast<RooRealVar const *>(node);

            if (nodeInfo.lastSetValCount != var->valueResetCounter()) {

               nodeInfo.lastSetValCount = var->valueResetCounter();

               for (NodeInfo *clientInfo : nodeInfo.clientInfos) {

                  clientInfo->isDirty = true;

               }

               computeCPUNode(node, nodeInfo);

               nodeInfo.isDirty = false;

            }

         } else {

            if (nodeInfo.isDirty) {

               for (NodeInfo *clientInfo : nodeInfo.clientInfos) {

                  clientInfo->isDirty = true;

               }

               computeCPUNode(node, nodeInfo);

               nodeInfo.isDirty = false;

            }

         }

      }

   }


   // return the final value

   return _dataMapCPU.at(&topNode())[0];

}


/// Returns the value of the top node in the computation graph, evaluating the
/// nodes on the CPU or the GPU as decided by markGPUNodes().
///
/// During the evaluation, `remServers` of a node serves as its state:
///  - a positive value is the number of servers that are not finished yet,
///  - 0 means that the node is ready to be computed,
///  - -1 means that the node was launched on the GPU (set by assignToGPU()),
///  - -2 means that the node is finished.
double RooFitDriver::getValHeterogeneous()
{
   // Reset the counters of remaining clients and servers for this evaluation.
   for (auto &info : _nodes) {
      info.remClients = info.clientInfos.size();
      info.remServers = info.serverInfos.size();
   }

   // In a cuda fit, use first 3 fits to determine the execution times
   // and the hardware that computes each part of the graph
   if (_batchMode == RooFit::BatchModeOption::Cuda && _getValInvocations <= 3)
      markGPUNodes();

   // find initial gpu nodes and assign them to gpu
   for (auto &info : _nodes) {
      if (info.remServers == 0 && info.computeInGPU) {
         assignToGPU(info);
      }
   }

   // The last node in the list is treated as the top node: the loop runs
   // until it is marked as finished.
   NodeInfo const &topNodeInfo = _nodes.back();
   while (topNodeInfo.remServers != -2) {
      // find finished gpu nodes
      for (auto &info : _nodes) {
         if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->streamIsActive(info.stream)) {
            // In the second invocation, the time between the two events of
            // the node is added to its cuda time (converted from ms to us).
            if (_getValInvocations == 2) {
               float ms = RooBatchCompute::dispatchCUDA->cudaEventElapsedTime(info.eventStart, info.event);
               info.cudaTime += std::chrono::microseconds{int(1000.0 * ms)};
            }
            info.remServers = -2;
            // Decrement number of remaining servers for clients and start GPU computations
            for (auto *infoClient : info.clientInfos) {
               --infoClient->remServers;
               if (infoClient->computeInGPU && infoClient->remServers == 0) {
                  assignToGPU(*infoClient);
               }
            }
            // This node doesn't need its inputs anymore, so the servers can
            // release their buffers if this was their last client.
            for (auto *serverInfo : info.serverInfos) {
               serverInfo->decrementRemainingClients();
            }
         }
      }

      // find next CPU node
      auto it = _nodes.begin();
      for (; it != _nodes.end(); it++) {
         if (it->remServers == 0 && !it->computeInGPU)
            break;
      }

      // if no CPU node available sleep for a while to save CPU usage
      if (it == _nodes.end()) {
         std::this_thread::sleep_for(std::chrono::milliseconds(1));
         continue;
      }

      // compute next CPU node
      NodeInfo &info = *it;
      RooAbsArg const *node = info.absArg;
      info.remServers = -2; // so that it doesn't get picked again

      // Nodes that take their values from the dataset need no computation,
      // but they still go through the bookkeeping below.
      if (!info.fromDataset) {
         computeCPUNode(node, info);
      }

      // Assign the clients that are computed on the GPU
      for (auto *infoClient : info.clientInfos) {
         if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
            assignToGPU(*infoClient);
         }
      }
      for (auto *serverInfo : info.serverInfos) {
         serverInfo->decrementRemainingClients();
      }
   }

   // return the final value
   return _dataMapCPU.at(&topNode())[0];
}


/// Launch the computation of a node on the GPU. The stream of the node is
/// made to wait for the events of all its servers, an output buffer is
/// created, and the computation is submitted to the stream of the node. The
/// node is flagged as launched by setting `remServers` to -1.
///
/// \param[in] info The node info of the node to compute.
void RooFitDriver::assignToGPU(NodeInfo &info)
{
   using namespace Detail;

   auto node = static_cast<RooAbsReal const *>(info.absArg);

   const std::size_t nOut = info.outputSize;

   info.remServers = -1;
   // wait for every server to finish
   for (auto *infoServer : info.serverInfos) {
      if (infoServer->event)
         RooBatchCompute::dispatchCUDA->cudaStreamWaitEvent(info.stream, infoServer->event);
   }

   // A node without clients (like the top node) never gets its buffer released
   // by decrementRemainingClients(), so the buffer of the previous evaluation
   // may still be attached here. It has to be deleted before the pointer is
   // overwritten, otherwise it would be leaked in every evaluation. For all
   // other nodes the pointer is null at this point and this does nothing.
   delete info.buffer;
   info.buffer = info.copyAfterEvaluation ? _bufferManager.makePinnedBuffer(nOut, info.stream)
                                          : _bufferManager.makeGpuBuffer(nOut);
   double *buffer = info.buffer->gpuWritePtr();
   _dataMapCUDA.at(node) = RooSpan<const double>(buffer, nOut);
   // measure launching overhead (add computation time later)
   if (_getValInvocations == 2) {
      using namespace std::chrono;
      RooBatchCompute::dispatchCUDA->cudaEventRecord(info.eventStart, info.stream);
      auto start = steady_clock::now();
      node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
      info.cudaTime = duration_cast<microseconds>(steady_clock::now() - start);
   } else {
      node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
   }
   // The clients of this node wait for this event before they start.
   RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
   if (info.copyAfterEvaluation) {
      _dataMapCPU.at(node) = RooSpan<const double>(info.buffer->cpuReadPtr(), nOut);
   }
}


/// This method simulates the computation of the whole graph and the time it takes
/// and decides what to compute in gpu. The decision is made on the basis of avoiding
/// leaving either the gpu or the cpu idle at any time, if possible, and on assigning
/// to each piece of hardware a computation that is significantly slower on the other part.
/// The nodes may be assigned to the non-efficient side (cpu or gpu) to prevent idleness
/// only if the absolute difference cpuTime-cudaTime does not exceed the diffThreshold.
///
/// As a side effect, `computeInGPU` and `timeLaunched` of the non-scalar nodes
/// are updated, and the buffer of a node is deleted if its device changes.
///
/// \param[in] h2dTime Transfer time that is added when a GPU node has to wait
///            for a server that was computed on the CPU.
/// \param[in] d2hTime Transfer time that is added when a CPU node has to wait
///            for a server that was computed on the GPU.
/// \param[in] diffThreshold Maximum time difference up to which a node may be
///            assigned to the device on which it is slower.
/// \return The simulated time at the end of the simulation.
std::chrono::microseconds RooFitDriver::simulateFit(std::chrono::microseconds h2dTime,
                                                    std::chrono::microseconds d2hTime,
                                                    std::chrono::microseconds diffThreshold)
{
   using namespace std::chrono;

   // Number of nodes that still have to be launched.
   std::size_t nNodes = _nodes.size();
   // launch scalar nodes (assume they are computed in 0 time)
   for (auto &info : _nodes) {
      if (info.isScalar) {
         nNodes--;
         info.timeLaunched = microseconds{0};
      } else
         info.timeLaunched = microseconds{-1};
   }

   // The nodes that currently occupy the CPU and the GPU in the simulation
   // (null if the device is idle).
   NodeInfo *cpuNode = nullptr;
   NodeInfo *cudaNode = nullptr;
   microseconds simulatedTime{0};
   while (nNodes) {
      microseconds minDiff = microseconds::max(), maxDiff = -minDiff; // diff = cpuTime - cudaTime
      NodeInfo *cpuCandidate = nullptr;
      NodeInfo *cudaCandidate = nullptr;
      microseconds cpuDelay{};
      microseconds cudaDelay{};
      // Find the best candidate for each device: the node with the smallest
      // diff for the CPU and the node with the largest diff for the GPU.
      for (auto &info : _nodes) {
         RooAbsArg const *absArg = info.absArg;
         if (info.timeLaunched >= microseconds{0})
            continue; // already launched
         microseconds diff{info.cpuTime - info.cudaTime}, cpuWait{0}, cudaWait{0};

         bool goToNextCandidate = false;

         for (auto *serverInfo : info.serverInfos) {
            if (serverInfo->isScalar)
               continue;

            // dependencies not computed yet
            if (serverInfo->timeLaunched < microseconds{0}) {
               goToNextCandidate = true;
               break;
            }
            // A server on the other device delays the start of this node until
            // the server is finished and its output is transferred.
            if (serverInfo->computeInGPU)
               cpuWait = std::max(cpuWait, serverInfo->timeLaunched + serverInfo->cudaTime + d2hTime - simulatedTime);
            else
               cudaWait = std::max(cudaWait, serverInfo->timeLaunched + serverInfo->cpuTime + h2dTime - simulatedTime);
         }

         if (goToNextCandidate) {
            continue;
         }

         // The waiting times are taken into account in the comparison.
         diff += cpuWait - cudaWait;
         if (diff < minDiff) {
            minDiff = diff;
            cpuDelay = cpuWait;
            cpuCandidate = &info;
         }
         if (diff > maxDiff && absArg->canComputeBatchWithCuda()) {
            maxDiff = diff;
            cudaDelay = cudaWait;
            cudaCandidate = &info;
         }
      }

      // Drop a candidate if it is slower on its device than on the other one
      // by more than the threshold (waiting times not included here).
      auto calcDiff = [](const NodeInfo *nodeInfo) { return nodeInfo->cpuTime - nodeInfo->cudaTime; };
      if (cpuCandidate && calcDiff(cpuCandidate) > diffThreshold)
         cpuCandidate = nullptr;
      if (cudaCandidate && -calcDiff(cudaCandidate) > diffThreshold)
         cudaCandidate = nullptr;
      // don't compute same node twice
      if (cpuCandidate == cudaCandidate && !cpuNode && !cudaNode) {
         if (minDiff < microseconds{0})
            cudaCandidate = nullptr;
         else
            cpuCandidate = nullptr;
      }
      // Launch the candidates on the devices that are idle.
      if (cpuCandidate && !cpuNode) {
         cpuNode = cpuCandidate;
         cpuNode->timeLaunched = simulatedTime + cpuDelay;
         // If the compute mode is changed, the current buffer might not be appropriate anymore
         if (cpuNode->computeInGPU) {
            delete cpuNode->buffer;
            cpuNode->buffer = nullptr;
         }
         cpuNode->computeInGPU = false;
         nNodes--;
      }
      if (cudaCandidate && !cudaNode) {
         cudaNode = cudaCandidate;
         cudaNode->timeLaunched = simulatedTime + cudaDelay;
         // If the compute mode is changed, the current buffer might not be appropriate anymore
         if (!cudaNode->computeInGPU) {
            delete cudaNode->buffer;
            cudaNode->buffer = nullptr;
         }
         cudaNode->computeInGPU = true;
         nNodes--;
      }

      // Advance the simulated time to the moment when the first of the two
      // devices finishes its node, and mark that device as idle again.
      microseconds etaCPU{microseconds::max()}, etaCUDA{microseconds::max()};
      if (cpuNode) {
         etaCPU = cpuNode->timeLaunched + cpuNode->cpuTime;
      }
      if (cudaNode) {
         etaCUDA = cudaNode->timeLaunched + cudaNode->cudaTime;
      }
      simulatedTime = std::min(etaCPU, etaCUDA);
      if (etaCPU < etaCUDA)
         cpuNode = nullptr;
      else
         cudaNode = nullptr;
   } // while(nNodes)
   return simulatedTime;
}


/// Decides which nodes are assigned to the gpu in a cuda fit. In the 1st iteration,
/// everything is computed in cpu for measuring the cpu time. In the 2nd iteration,
/// everything is computed in gpu (if possible) to measure the gpu time.
/// In the 3rd iteration, simulate the computation of the graph by calling simulateFit
/// with every distinct threshold found as timeDiff within the nodes of the graph and select
/// the best configuration. In the end, mark the nodes and handle the details accordingly.
void RooFitDriver::markGPUNodes()
{
   using namespace std::chrono;

   if (_getValInvocations == 1) {
      // leave everything to be computed (and timed) in cpu
      return;
   } else if (_getValInvocations == 2) {
      // compute (and time) as much as possible in gpu
      for (auto &info : _nodes) {
         info.computeInGPU = !info.isScalar && info.absArg->canComputeBatchWithCuda();
      }
   } else {
      // Assign nodes to gpu using a greedy algorithm: for the number of bytes
      // in this benchmark we take the maximum size of spans in the dataset.
      std::size_t nBytes = 1;
      for (auto const &item : _dataMapCUDA) {
         nBytes = std::max(nBytes, item.size() * sizeof(double));
      }
      auto transferTimes = RooFit::CUDAHelpers::memcpyBenchmark(nBytes);

      // The first element is used as host-to-device time, the second one as
      // device-to-host time.
      microseconds h2dTime = transferTimes.first;
      microseconds d2hTime = transferTimes.second;
      ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "------Copying times------\n";
      ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "h2dTime=" << h2dTime.count() << "us\td2hTime=" << d2hTime.count()
                                         << "us\n";

      // The time differences of the non-scalar nodes are the thresholds that
      // get tried out in the simulation.
      std::vector<microseconds> diffTimes;
      for (auto &info : _nodes) {
         if (!info.isScalar)
            diffTimes.push_back(info.cpuTime - info.cudaTime);
      }
      microseconds bestTime = microseconds::max();
      microseconds bestThreshold{};
      microseconds ret;
      for (auto &threshold : diffTimes) {
         if ((ret = simulateFit(h2dTime, d2hTime, microseconds{std::abs(threshold.count())})) < bestTime) {
            bestTime = ret;
            bestThreshold = threshold;
         }
      }
      // finalize the marking of the best configuration
      // (simulateFit() sets the computeInGPU flags as a side effect, so it is
      // run once more with the best threshold)
      simulateFit(h2dTime, d2hTime, microseconds{std::abs(bestThreshold.count())});
      ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "Best threshold=" << bestThreshold.count() << "us" << std::endl;

      // deletion of the timing events (to be replaced later by non-timing events)
      for (auto &info : _nodes) {
         // If the copy mode is changed, the current buffer might not be appropriate anymore
         if (info.copyAfterEvaluation) {
            delete info.buffer;
            info.buffer = nullptr;
         }
         info.copyAfterEvaluation = false;
         RooBatchCompute::dispatchCUDA->deleteCudaEvent(info.event);
         RooBatchCompute::dispatchCUDA->deleteCudaEvent(info.eventStart);
         info.event = info.eventStart = nullptr;
      }
   } // else (_getValInvocations > 2)

   // The output of a node has to be copied after the evaluation if at least
   // one of its clients is computed on the other device.
   for (auto &info : _nodes) {
      // scalar nodes don't need copying
      if (!info.isScalar) {
         for (auto *clientInfo : info.clientInfos) {
            if (info.computeInGPU != clientInfo->computeInGPU) {
               // If the copy mode is changed, the current buffer might not be appropriate anymore
               if (!info.copyAfterEvaluation) {
                  delete info.buffer;
                  info.buffer = nullptr;
               }
               info.copyAfterEvaluation = true;
               break;
            }
         }
      }
   }

   // restore a cudaEventDisableTiming event when necessary
   if (_getValInvocations == 3) {
      for (auto &info : _nodes) {
         if (info.computeInGPU || info.copyAfterEvaluation)
            info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);
      }

      // Log the final assignment and the measured times of all nodes.
      ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "------Nodes------\t\t\t\tCpu time: \t Cuda time\n";
      for (auto &info : _nodes) {
         ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations)
                                            << std::setw(20) << info.absArg->GetName() << "\t" << info.absArg << "\t"
                                            << (info.computeInGPU ? "CUDA" : "CPU") << "\t" << info.cpuTime.count()
                                            << "us\t" << info.cudaTime.count() << "us\n";
      }
   }
}


void RooFitDriver::determineOutputSizes()

{

   for (auto &argInfo : _nodes) {

      for (auto *serverInfo : argInfo.serverInfos) {

         if (!argInfo.absArg->isReducerNode()) {

            argInfo.outputSize = std::max(serverInfo->outputSize, argInfo.outputSize);

         }

      }

   }

}


/// Temporarily change the operation mode of a RooAbsArg until the
/// RooFitDriver gets deleted. If the RooAbsArg is in the requested mode
/// already, nothing is recorded.
void RooFitDriver::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
{
   if (arg->operMode() == opMode) {
      return;
   }
   _changeOperModeRAIIs.emplace(arg, opMode);
}


/// Returns the top node of the computation graph, which is the argument held
/// by the normalization integral unfolder, as a RooAbsReal.
RooAbsReal &RooFitDriver::topNode() const
{
   auto &unfoldedArg = _integralUnfolder->arg();
   return static_cast<RooAbsReal &>(unfoldedArg);
}


} // namespace Experimental

} // namespace ROOT

BatchModeDataHelpers.h

BatchModeHelpers.h

CUDAHelpers.h

Initialisation.h

NormalizationHelpers.h

RooAbsCategory.h

RooAbsData.h

RooAbsReal.h

RooArgList.h

RooBatchCompute.h

RooFitDriver.h

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

RooMsgService.h

ooccoutD
#define ooccoutD(o, a)
Definition RooMsgService.h:52

RooRealVar.h

ROOT::Experimental::Detail::AbsBuffer
Definition Buffers.h:22

ROOT::Experimental::Detail::AbsBuffer::gpuWritePtr
virtual double * gpuWritePtr()=0

ROOT::Experimental::Detail::AbsBuffer::cpuWritePtr
virtual double * cpuWritePtr()=0

ROOT::Experimental::Detail::AbsBuffer::gpuReadPtr
virtual double const * gpuReadPtr() const =0

ROOT::Experimental::Detail::AbsBuffer::cpuReadPtr
virtual double const * cpuReadPtr() const =0

ROOT::Experimental::Detail::BufferManager::makeCpuBuffer
AbsBuffer * makeCpuBuffer(std::size_t size)
Definition Buffers.cxx:240

ROOT::Experimental::Detail::BufferManager::makePinnedBuffer
AbsBuffer * makePinnedBuffer(std::size_t size, cudaStream_t *stream=nullptr)
Definition Buffers.cxx:248

ROOT::Experimental::Detail::BufferManager::makeGpuBuffer
AbsBuffer * makeGpuBuffer(std::size_t size)
Definition Buffers.cxx:244

ROOT::Experimental::RooFitDriver::DataSpansMap
std::map< RooFit::Detail::DataKey, RooSpan< const double > > DataSpansMap
Definition RooFitDriver.h:44

ROOT::Experimental::RooFitDriver::~RooFitDriver
~RooFitDriver()
Definition RooFitDriver.cxx:266

ROOT::Experimental::RooFitDriver::_batchMode
const RooFit::BatchModeOption _batchMode
Definition RooFitDriver.h:80

ROOT::Experimental::RooFitDriver::setOperMode
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the RooFitDriver gets deleted.
Definition RooFitDriver.cxx:725

ROOT::Experimental::RooFitDriver::_getValInvocations
int _getValInvocations
Definition RooFitDriver.h:81

ROOT::Experimental::RooFitDriver::RooFitDriver
RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode=RooFit::BatchModeOption::Cpu)
Construct a new RooFitDriver.
Definition RooFitDriver.cxx:111

ROOT::Experimental::RooFitDriver::_dataMapCPU
RooFit::Detail::DataMap _dataMapCPU
Definition RooFitDriver.h:85

ROOT::Experimental::RooFitDriver::getValHeterogeneous
double getValHeterogeneous()
Returns the value of the top node in the computation graph.
Definition RooFitDriver.cxx:376

ROOT::Experimental::RooFitDriver::getVal
double getVal()
Returns the value of the top node in the computation graph.
Definition RooFitDriver.cxx:338

ROOT::Experimental::RooFitDriver::_dataMapCUDA
RooFit::Detail::DataMap _dataMapCUDA
Definition RooFitDriver.h:86

ROOT::Experimental::RooFitDriver::_integralUnfolder
std::unique_ptr< RooFit::NormalizationIntegralUnfolder > _integralUnfolder
Definition RooFitDriver.h:93

ROOT::Experimental::RooFitDriver::determineOutputSizes
void determineOutputSizes()
Definition RooFitDriver.cxx:712

ROOT::Experimental::RooFitDriver::_nodes
std::vector< NodeInfo > _nodes
Definition RooFitDriver.h:89

ROOT::Experimental::RooFitDriver::getValues
std::vector< double > getValues()
Definition RooFitDriver.cxx:277

ROOT::Experimental::RooFitDriver::assignToGPU
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
Definition RooFitDriver.cxx:457

ROOT::Experimental::RooFitDriver::_cudaMemDataset
double * _cudaMemDataset
Definition RooFitDriver.h:82

ROOT::Experimental::RooFitDriver::computeCPUNode
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
Definition RooFitDriver.cxx:298

ROOT::Experimental::RooFitDriver::_bufferManager
Detail::BufferManager _bufferManager
Definition RooFitDriver.h:78

ROOT::Experimental::RooFitDriver::_vectorBuffers
std::stack< std::vector< double > > _vectorBuffers
Definition RooFitDriver.h:92

ROOT::Experimental::RooFitDriver::markGPUNodes
void markGPUNodes()
Decides which nodes are assigned to the gpu in a cuda fit.
Definition RooFitDriver.cxx:619

ROOT::Experimental::RooFitDriver::setData
void setData(RooAbsData const &data, std::string_view rangeName="", RooAbsCategory const *indexCatForSplitting=nullptr, bool skipZeroWeights=false, bool takeGlobalObservablesFromData=true)
Definition RooFitDriver.cxx:186

ROOT::Experimental::RooFitDriver::topNode
RooAbsReal & topNode() const
Definition RooFitDriver.cxx:732

ROOT::Experimental::RooFitDriver::simulateFit
std::chrono::microseconds simulateFit(std::chrono::microseconds h2dTime, std::chrono::microseconds d2hTime, std::chrono::microseconds diffThreshold)
This method simulates the computation of the whole graph and the time it takes and decides what to c...
Definition RooFitDriver.cxx:497

RooAbsArg
RooAbsArg is the common abstract base class for objects that represent a value and a "shape" in RooFi...
Definition RooAbsArg.h:69

RooAbsArg::OperMode
OperMode
Definition RooAbsArg.h:405

RooAbsArg::ADirty
@ ADirty
Definition RooAbsArg.h:405

RooAbsArg::treeNodeServerList
void treeNodeServerList(RooAbsCollection *list, const RooAbsArg *arg=0, Bool_t doBranch=kTRUE, Bool_t doLeaf=kTRUE, Bool_t valueOnly=kFALSE, Bool_t recurseNonDerived=kFALSE) const
Fill supplied list with nodes of the arg tree, following all server links, starting with ourself as t...
Definition RooAbsArg.cxx:525

RooAbsArg::operMode
OperMode operMode() const
Query the operation mode of this node.
Definition RooAbsArg.h:499

RooAbsCategory
RooAbsCategory is the base class for objects that represent a discrete value with a finite number of ...
Definition RooAbsCategory.h:37

RooAbsCollection::sortTopologically
void sortTopologically()
Sort collection topologically: the servers of any RooAbsArg will be before that RooAbsArg in the coll...
Definition RooAbsCollection.cxx:1622

RooAbsCollection::rend
Storage_t::const_reverse_iterator rend() const
Definition RooAbsCollection.h:245

RooAbsCollection::rbegin
Storage_t::const_reverse_iterator rbegin() const
Definition RooAbsCollection.h:241

RooAbsCollection::add
virtual Bool_t add(const RooAbsArg &var, Bool_t silent=kFALSE)
Add the specified argument to list.
Definition RooAbsCollection.cxx:521

RooAbsCollection::size
Storage_t::size_type size() const
Definition RooAbsCollection.h:249

RooAbsData
RooAbsData is the common abstract base class for binned and unbinned datasets.
Definition RooAbsData.h:82

RooAbsData::getGlobalObservables
RooArgSet const * getGlobalObservables() const
Returns snapshot of global observables stored in this data.
Definition RooAbsData.h:315

RooAbsReal
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
Definition RooAbsReal.h:64

RooArgList
RooArgList is a container object that can hold multiple RooAbsArg objects.
Definition RooArgList.h:22

RooArgSet
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:35

RooBatchCompute::RooBatchComputeInterface::cudaEventRecord
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
Definition RooBatchCompute.h:79

RooBatchCompute::RooBatchComputeInterface::newCudaEvent
virtual cudaEvent_t * newCudaEvent(bool)
Definition RooBatchCompute.h:74

RooBatchCompute::RooBatchComputeInterface::cudaEventElapsedTime
virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *)
Definition RooBatchCompute.h:81

RooBatchCompute::RooBatchComputeInterface::cudaMalloc
virtual void * cudaMalloc(size_t)
Definition RooBatchCompute.h:70

RooBatchCompute::RooBatchComputeInterface::memcpyToCPU
virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t *=nullptr)
Definition RooBatchCompute.h:83

RooBatchCompute::RooBatchComputeInterface::newCudaStream
virtual cudaStream_t * newCudaStream()
Definition RooBatchCompute.h:76

RooBatchCompute::RooBatchComputeInterface::cudaStreamWaitEvent
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
Definition RooBatchCompute.h:80

RooBatchCompute::RooBatchComputeInterface::streamIsActive
virtual bool streamIsActive(cudaStream_t *)
Definition RooBatchCompute.h:78

RooBatchCompute::RooBatchComputeInterface::cudaFree
virtual void cudaFree(void *)
Definition RooBatchCompute.h:71

RooBatchCompute::RooBatchComputeInterface::deleteCudaStream
virtual void deleteCudaStream(cudaStream_t *)
Definition RooBatchCompute.h:77

RooBatchCompute::RooBatchComputeInterface::deleteCudaEvent
virtual void deleteCudaEvent(cudaEvent_t *)
Definition RooBatchCompute.h:75

RooBatchCompute::RooBatchComputeInterface::memcpyToCUDA
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
Definition RooBatchCompute.h:82

RooFit::Detail::DataMap::at
auto & at(RooAbsArg const *arg, RooAbsArg const *=nullptr)
Definition DataMap.h:88

RooFit::Detail::DataMap::resize
auto resize(std::size_t n)
Definition DataMap.h:86

RooRealVar
RooRealVar represents a variable that can be changed from the outside.
Definition RooRealVar.h:39

RooRealVar::valueResetCounter
std::size_t valueResetCounter() const
Returns how many times the value of this RooRealVar was reset.
Definition RooRealVar.h:59

RooSpan
A simple container to hold a batch of data values.
Definition RooSpan.h:34

TObject
Mother of all ROOT objects.
Definition TObject.h:41

int

x
Double_t x[n]
Definition legend1.C:17

ROOT
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Definition EExecutionPolicy.hxx:4

RooBatchCompute::dispatchCUDA
R__EXTERN RooBatchComputeInterface * dispatchCUDA
Definition RooBatchCompute.h:93

RooBatchCompute::init
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
Definition Initialisation.cxx:43

RooFit::BatchModeDataHelpers::getDataSpans
std::map< RooFit::Detail::DataKey, RooSpan< const double > > getDataSpans(RooAbsData const &data, std::string_view rangeName, RooAbsCategory const *indexCat, std::stack< std::vector< double > > &buffers, bool skipZeroWeights)
Extract all content from a RooFit dataset as a map of spans.
Definition BatchModeDataHelpers.cxx:98

RooFit::BatchModeHelpers::logArchitectureInfo
void logArchitectureInfo(RooFit::BatchModeOption batchMode)
Definition BatchModeHelpers.cxx:230

RooFit::CUDAHelpers::memcpyBenchmark
std::pair< std::chrono::microseconds, std::chrono::microseconds > memcpyBenchmark(std::size_t nBytes)
Measure the time for transferring data from host to device and vice versa.
Definition CUDAHelpers.cxx:18

RooFit::BatchModeOption
BatchModeOption
For setting the batch mode flag with the BatchMode() command argument to RooAbsPdf::fitTo().
Definition RooGlobalFunc.h:76

RooFit::BatchModeOption::Cuda
@ Cuda

ROOT::Experimental::NodeInfo
A struct used by the RooFitDriver to store information on the RooAbsArgs in the computation graph.
Definition RooFitDriver.cxx:54

ROOT::Experimental::NodeInfo::absArg
RooAbsArg * absArg
Definition RooFitDriver.cxx:65

ROOT::Experimental::NodeInfo::serverInfos
std::vector< NodeInfo * > serverInfos
Definition RooFitDriver.cxx:88

ROOT::Experimental::NodeInfo::remClients
int remClients
Definition RooFitDriver.cxx:75

ROOT::Experimental::NodeInfo::timeLaunched
std::chrono::microseconds timeLaunched
Definition RooFitDriver.cxx:74

ROOT::Experimental::NodeInfo::remServers
int remServers
Definition RooFitDriver.cxx:76

ROOT::Experimental::NodeInfo::copyAfterEvaluation
bool copyAfterEvaluation
Definition RooFitDriver.cxx:79

ROOT::Experimental::NodeInfo::scalarBuffer
double scalarBuffer
Definition RooFitDriver.cxx:87

ROOT::Experimental::NodeInfo::lastSetValCount
std::size_t lastSetValCount
Definition RooFitDriver.cxx:85

ROOT::Experimental::NodeInfo::cudaTime
std::chrono::microseconds cudaTime
Definition RooFitDriver.cxx:73

ROOT::Experimental::NodeInfo::isDirty
bool isDirty
Definition RooFitDriver.cxx:82

ROOT::Experimental::NodeInfo::isCategory
bool isCategory
Definition RooFitDriver.cxx:83

ROOT::Experimental::NodeInfo::originalDataToken
std::size_t originalDataToken
Definition RooFitDriver.cxx:86

ROOT::Experimental::NodeInfo::outputSize
std::size_t outputSize
Definition RooFitDriver.cxx:84

ROOT::Experimental::NodeInfo::eventStart
cudaEvent_t * eventStart
Definition RooFitDriver.cxx:70

ROOT::Experimental::NodeInfo::isScalar
bool isScalar
Definition RooFitDriver.cxx:77

ROOT::Experimental::NodeInfo::event
cudaEvent_t * event
Definition RooFitDriver.cxx:69

ROOT::Experimental::NodeInfo::isVariable
bool isVariable
Definition RooFitDriver.cxx:81

ROOT::Experimental::NodeInfo::clientInfos
std::vector< NodeInfo * > clientInfos
Definition RooFitDriver.cxx:89

ROOT::Experimental::NodeInfo::cpuTime
std::chrono::microseconds cpuTime
Definition RooFitDriver.cxx:72

ROOT::Experimental::NodeInfo::decrementRemainingClients
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer n...
Definition RooFitDriver.cxx:57

ROOT::Experimental::NodeInfo::~NodeInfo
~NodeInfo()
Definition RooFitDriver.cxx:91

ROOT::Experimental::NodeInfo::computeInGPU
bool computeInGPU
Definition RooFitDriver.cxx:78

ROOT::Experimental::NodeInfo::fromDataset
bool fromDataset
Definition RooFitDriver.cxx:80

ROOT::Experimental::NodeInfo::stream
cudaStream_t * stream
Definition RooFitDriver.cxx:71

ROOT::Experimental::NodeInfo::buffer
Detail::AbsBuffer * buffer
Definition RooFitDriver.cxx:67

event
Definition triangle.c:553