50namespace Experimental {
69 cudaEvent_t *
event =
nullptr;
73 std::chrono::microseconds
cudaTime{std::chrono::microseconds::max()};
112 : _batchMode{batchMode}
114 _integralUnfolder = std::make_unique<RooFit::NormalizationIntegralUnfolder>(absReal, normSet);
137 std::unordered_map<TNamed const *, std::size_t> tokens;
138 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
142 std::size_t iNode = 0;
145 tokens[arg->namePtr()] = iNode;
147 auto &nodeInfo =
_nodes[iNode];
148 nodeInfo.absArg = arg;
149 nodeInfos[arg] = &nodeInfo;
151 nodeInfo.originalDataToken = arg->dataToken();
152 arg->setDataToken(iNode);
155 nodeInfo.isVariable =
true;
158 nodeInfo.isCategory =
true;
165 info.serverInfos.reserve(info.absArg->servers().size());
166 for (
RooAbsArg *server : info.absArg->servers()) {
167 if (server->isValueServer(*info.absArg)) {
168 auto *serverInfo = nodeInfos.at(server);
169 info.serverInfos.emplace_back(serverInfo);
170 serverInfo->clientInfos.emplace_back(&info);
172 server->setDataToken(tokens.at(server->namePtr()));
178 for (
auto &info :
_nodes) {
188 bool takeGlobalObservablesFromData)
199 buffer.push_back(arg->getVal());
211 std::size_t totalSize = 0;
212 for (
auto &info :
_nodes) {
215 info.buffer =
nullptr;
217 auto found = dataSpans.find(info.absArg->namePtr());
218 if (found != dataSpans.end()) {
220 info.outputSize = found->second.size();
221 info.fromDataset =
true;
222 info.isDirty =
false;
223 totalSize += info.outputSize;
226 info.fromDataset =
false;
233 for (
auto &info :
_nodes) {
235 info.isScalar = info.outputSize == 1;
242 if (!info.isScalar) {
255 for (
auto &info :
_nodes) {
256 if (!info.fromDataset)
258 std::size_t
size = info.outputSize;
261 size *
sizeof(
double));
268 for (
auto &info :
_nodes) {
269 info.absArg->setDataToken(info.originalDataToken);
283 std::vector<double> out(nOut);
290 std::vector<double> out;
291 out.reserve(dataSpan.size());
292 for (
auto const &
x : dataSpan) {
300 using namespace Detail;
302 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
321 using namespace std::chrono;
322 auto start = steady_clock::now();
323 nodeAbsReal->computeBatch(
nullptr, buffer, nOut,
_dataMapCPU);
324 info.
cpuTime = duration_cast<microseconds>(steady_clock::now() - start);
326 nodeAbsReal->computeBatch(
nullptr, buffer, nOut,
_dataMapCPU);
346 for (
auto &nodeInfo :
_nodes) {
348 if (!nodeInfo.fromDataset) {
349 if (nodeInfo.isVariable) {
350 auto *var =
static_cast<RooRealVar const *
>(node);
351 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
353 for (
NodeInfo *clientInfo : nodeInfo.clientInfos) {
354 clientInfo->isDirty =
true;
357 nodeInfo.isDirty =
false;
360 if (nodeInfo.isDirty) {
361 for (
NodeInfo *clientInfo : nodeInfo.clientInfos) {
362 clientInfo->isDirty =
true;
365 nodeInfo.isDirty =
false;
378 for (
auto &info :
_nodes) {
379 info.remClients = info.clientInfos.size();
380 info.remServers = info.serverInfos.size();
389 for (
auto &info :
_nodes) {
390 if (info.remServers == 0 && info.computeInGPU) {
398 for (
auto &info :
_nodes) {
402 info.cudaTime += std::chrono::microseconds{
int(1000.0 * ms)};
404 info.remServers = -2;
406 for (
auto *infoClient : info.clientInfos) {
407 --infoClient->remServers;
408 if (infoClient->computeInGPU && infoClient->remServers == 0) {
412 for (
auto *serverInfo : info.serverInfos) {
413 serverInfo->decrementRemainingClients();
420 for (; it !=
_nodes.end(); it++) {
421 if (it->remServers == 0 && !it->computeInGPU)
427 std::this_thread::sleep_for(std::chrono::milliseconds(1));
442 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
447 serverInfo->decrementRemainingClients();
459 using namespace Detail;
468 if (infoServer->event)
478 using namespace std::chrono;
480 auto start = steady_clock::now();
482 info.
cudaTime = duration_cast<microseconds>(steady_clock::now() - start);
498 std::chrono::microseconds d2hTime,
499 std::chrono::microseconds diffThreshold)
501 using namespace std::chrono;
503 std::size_t nNodes =
_nodes.size();
505 for (
auto &info :
_nodes) {
508 info.timeLaunched = microseconds{0};
510 info.timeLaunched = microseconds{-1};
515 microseconds simulatedTime{0};
517 microseconds minDiff = microseconds::max(), maxDiff = -minDiff;
520 microseconds cpuDelay{};
521 microseconds cudaDelay{};
522 for (
auto &info :
_nodes) {
524 if (info.timeLaunched >= microseconds{0})
526 microseconds diff{info.cpuTime - info.cudaTime}, cpuWait{0}, cudaWait{0};
528 bool goToNextCandidate =
false;
530 for (
auto *serverInfo : info.serverInfos) {
531 if (serverInfo->isScalar)
535 if (serverInfo->timeLaunched < microseconds{0}) {
536 goToNextCandidate =
true;
539 if (serverInfo->computeInGPU)
540 cpuWait = std::max(cpuWait, serverInfo->timeLaunched + serverInfo->cudaTime + d2hTime - simulatedTime);
542 cudaWait = std::max(cudaWait, serverInfo->timeLaunched + serverInfo->cpuTime + h2dTime - simulatedTime);
545 if (goToNextCandidate) {
549 diff += cpuWait - cudaWait;
550 if (diff < minDiff) {
553 cpuCandidate = &info;
555 if (diff > maxDiff && absArg->canComputeBatchWithCuda()) {
557 cudaDelay = cudaWait;
558 cudaCandidate = &info;
562 auto calcDiff = [](
const NodeInfo *nodeInfo) {
return nodeInfo->cpuTime - nodeInfo->cudaTime; };
563 if (cpuCandidate && calcDiff(cpuCandidate) > diffThreshold)
564 cpuCandidate =
nullptr;
565 if (cudaCandidate && -calcDiff(cudaCandidate) > diffThreshold)
566 cudaCandidate =
nullptr;
568 if (cpuCandidate == cudaCandidate && !cpuNode && !cudaNode) {
569 if (minDiff < microseconds{0})
570 cudaCandidate =
nullptr;
572 cpuCandidate =
nullptr;
574 if (cpuCandidate && !cpuNode) {
575 cpuNode = cpuCandidate;
576 cpuNode->timeLaunched = simulatedTime + cpuDelay;
578 if (cpuNode->computeInGPU) {
579 delete cpuNode->buffer;
580 cpuNode->buffer =
nullptr;
582 cpuNode->computeInGPU =
false;
585 if (cudaCandidate && !cudaNode) {
586 cudaNode = cudaCandidate;
587 cudaNode->timeLaunched = simulatedTime + cudaDelay;
589 if (!cudaNode->computeInGPU) {
590 delete cudaNode->buffer;
591 cudaNode->buffer =
nullptr;
593 cudaNode->computeInGPU =
true;
597 microseconds etaCPU{microseconds::max()}, etaCUDA{microseconds::max()};
599 etaCPU = cpuNode->timeLaunched + cpuNode->cpuTime;
602 etaCUDA = cudaNode->timeLaunched + cudaNode->cudaTime;
604 simulatedTime = std::min(etaCPU, etaCUDA);
605 if (etaCPU < etaCUDA)
610 return simulatedTime;
619void RooFitDriver::markGPUNodes()
621 using namespace std::chrono;
623 if (_getValInvocations == 1) {
626 }
else if (_getValInvocations == 2) {
628 for (
auto &info : _nodes) {
629 info.computeInGPU = !info.isScalar && info.absArg->canComputeBatchWithCuda();
634 std::size_t nBytes = 1;
635 for (
auto const &item : _dataMapCUDA) {
636 nBytes = std::max(nBytes, item.size() *
sizeof(
double));
640 microseconds h2dTime = transferTimes.first;
641 microseconds d2hTime = transferTimes.second;
642 ooccoutD(
static_cast<TObject*
>(
nullptr), FastEvaluations) <<
"------Copying times------\n";
643 ooccoutD(
static_cast<TObject*
>(
nullptr), FastEvaluations) <<
"h2dTime=" << h2dTime.count() <<
"us\td2hTime=" << d2hTime.count()
646 std::vector<microseconds> diffTimes;
647 for (
auto &info : _nodes) {
649 diffTimes.push_back(info.cpuTime - info.cudaTime);
651 microseconds bestTime = microseconds::max();
652 microseconds bestThreshold{};
654 for (
auto &threshold : diffTimes) {
655 if ((ret = simulateFit(h2dTime, d2hTime, microseconds{std::abs(threshold.count())})) < bestTime) {
657 bestThreshold = threshold;
661 simulateFit(h2dTime, d2hTime, microseconds{std::abs(bestThreshold.count())});
662 ooccoutD(
static_cast<TObject*
>(
nullptr), FastEvaluations) <<
"Best threshold=" << bestThreshold.count() <<
"us" << std::endl;
665 for (
auto &info : _nodes) {
667 if (info.copyAfterEvaluation) {
669 info.buffer =
nullptr;
671 info.copyAfterEvaluation =
false;
674 info.event = info.eventStart =
nullptr;
678 for (
auto &info : _nodes) {
680 if (!info.isScalar) {
681 for (
auto *clientInfo : info.clientInfos) {
682 if (info.computeInGPU != clientInfo->computeInGPU) {
684 if (!info.copyAfterEvaluation) {
686 info.buffer =
nullptr;
688 info.copyAfterEvaluation =
true;
696 if (_getValInvocations == 3) {
697 for (
auto &info : _nodes) {
698 if (info.computeInGPU || info.copyAfterEvaluation)
702 ooccoutD(
static_cast<TObject*
>(
nullptr), FastEvaluations) <<
"------Nodes------\t\t\t\tCpu time: \t Cuda time\n";
703 for (
auto &info : _nodes) {
705 << std::setw(20) << info.absArg->GetName() <<
"\t" << info.absArg <<
"\t"
706 << (info.computeInGPU ?
"CUDA" :
"CPU") <<
"\t" << info.cpuTime.count()
707 <<
"us\t" << info.cudaTime.count() <<
"us\n";
712void RooFitDriver::determineOutputSizes()
714 for (
auto &argInfo : _nodes) {
715 for (
auto *serverInfo : argInfo.serverInfos) {
716 if (!argInfo.absArg->isReducerNode()) {
717 argInfo.outputSize = std::max(serverInfo->outputSize, argInfo.outputSize);
728 _changeOperModeRAIIs.emplace(arg, opMode);
734 return static_cast<RooAbsReal &
>(_integralUnfolder->arg());
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
virtual double * gpuWritePtr()=0
virtual double * cpuWritePtr()=0
virtual double const * gpuReadPtr() const =0
virtual double const * cpuReadPtr() const =0
AbsBuffer * makeCpuBuffer(std::size_t size)
AbsBuffer * makePinnedBuffer(std::size_t size, cudaStream_t *stream=nullptr)
AbsBuffer * makeGpuBuffer(std::size_t size)
std::map< RooFit::Detail::DataKey, RooSpan< const double > > DataSpansMap
const RooFit::BatchModeOption _batchMode
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the RooFitDriver gets deleted.
RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode=RooFit::BatchModeOption::Cpu)
Construct a new RooFitDriver.
RooFit::Detail::DataMap _dataMapCPU
double getValHeterogeneous()
Returns the value of the top node in the computation graph.
double getVal()
Returns the value of the top node in the computation graph.
RooFit::Detail::DataMap _dataMapCUDA
std::unique_ptr< RooFit::NormalizationIntegralUnfolder > _integralUnfolder
void determineOutputSizes()
std::vector< NodeInfo > _nodes
std::vector< double > getValues()
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
Detail::BufferManager _bufferManager
std::stack< std::vector< double > > _vectorBuffers
void markGPUNodes()
Decides which nodes are assigned to the gpu in a cuda fit.
void setData(RooAbsData const &data, std::string_view rangeName="", RooAbsCategory const *indexCatForSplitting=nullptr, bool skipZeroWeights=false, bool takeGlobalObservablesFromData=true)
RooAbsReal & topNode() const
std::chrono::microseconds simulateFit(std::chrono::microseconds h2dTime, std::chrono::microseconds d2hTime, std::chrono::microseconds diffThreshold)
This methods simulates the computation of the whole graph and the time it takes and decides what to c...
RooAbsArg is the common abstract base class for objects that represent a value and a "shape" in RooFi...
void treeNodeServerList(RooAbsCollection *list, const RooAbsArg *arg=0, Bool_t doBranch=kTRUE, Bool_t doLeaf=kTRUE, Bool_t valueOnly=kFALSE, Bool_t recurseNonDerived=kFALSE) const
Fill supplied list with nodes of the arg tree, following all server links, starting with ourself as t...
OperMode operMode() const
Query the operation mode of this node.
RooAbsCategory is the base class for objects that represent a discrete value with a finite number of ...
void sortTopologically()
Sort collection topologically: the servers of any RooAbsArg will be before that RooAbsArg in the coll...
Storage_t::const_reverse_iterator rend() const
Storage_t::const_reverse_iterator rbegin() const
virtual Bool_t add(const RooAbsArg &var, Bool_t silent=kFALSE)
Add the specified argument to list.
Storage_t::size_type size() const
RooAbsData is the common abstract base class for binned and unbinned datasets.
RooArgSet const * getGlobalObservables() const
Returns snapshot of global observables stored in this data.
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
RooArgList is a container object that can hold multiple RooAbsArg objects.
RooArgSet is a container object that can hold multiple RooAbsArg objects.
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
virtual cudaEvent_t * newCudaEvent(bool)
virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *)
virtual void * cudaMalloc(size_t)
virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t *=nullptr)
virtual cudaStream_t * newCudaStream()
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
virtual bool streamIsActive(cudaStream_t *)
virtual void cudaFree(void *)
virtual void deleteCudaStream(cudaStream_t *)
virtual void deleteCudaEvent(cudaEvent_t *)
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
auto & at(RooAbsArg const *arg, RooAbsArg const *=nullptr)
auto resize(std::size_t n)
RooRealVar represents a variable that can be changed from the outside.
std::size_t valueResetCounter() const
Returns how many times the value of this RooRealVar was reset.
A simple container to hold a batch of data values.
Mother of all ROOT objects.
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
R__EXTERN RooBatchComputeInterface * dispatchCUDA
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
std::map< RooFit::Detail::DataKey, RooSpan< const double > > getDataSpans(RooAbsData const &data, std::string_view rangeName, RooAbsCategory const *indexCat, std::stack< std::vector< double > > &buffers, bool skipZeroWeights)
Extract all content from a RooFit datasets as a map of spans.
void logArchitectureInfo(RooFit::BatchModeOption batchMode)
std::pair< std::chrono::microseconds, std::chrono::microseconds > memcpyBenchmark(std::size_t nBytes)
Measure the time for transfering data from host to device and vice-versa.
BatchModeOption
For setting the batch mode flag with the BatchMode() command argument to RooAbsPdf::fitTo();.
A struct used by the RooFitDriver to store information on the RooAbsArgs in the computation graph.
std::vector< NodeInfo * > serverInfos
std::chrono::microseconds timeLaunched
std::size_t lastSetValCount
std::chrono::microseconds cudaTime
std::size_t originalDataToken
std::vector< NodeInfo * > clientInfos
std::chrono::microseconds cpuTime
void decrementRemainingClients()
Check the servers of a node that has been computed and release it's resources if they are no longer n...
Detail::AbsBuffer * buffer