Logo ROOT  
Reference Guide
RooFitDriver.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file RooFitDriver.cxx
16\class RooFitDriver
17\ingroup Roofitcore
18
19This class can evaluate a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23 * computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The RooFitDriver class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFitDriver.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooArgList.h>
36#include <RooBatchCompute.h>
37#include <RooMsgService.h>
41#include <RooFit/CUDAHelpers.h>
42
44
45#include <iomanip>
46#include <numeric>
47#include <thread>
48
49#define COUT_DEBUG ooccoutD(nullptr, FastEvaluations)
50
namespace {

// The first three getVal() invocations of a CUDA fit are used for profiling:
// invocation 1 (GPUOnly) times as much as possible on the GPU, invocation 2
// (CPUOnly) times everything on the CPU, and invocation 3 (Both) runs the
// scheduling simulation with both timings available (see markGPUNodes()).
// NOTE(review): "Heterogeneos" is a typo for "Heterogeneous"; kept as-is
// because the name may be referenced elsewhere.
enum HeterogeneosIterations { CPUOnly = 2, GPUOnly = 1, Both = 3 };

}
56
57namespace ROOT {
58namespace Experimental {
59
/// A struct used by the RooFitDriver to store information on the RooAbsArgs in
/// the computation graph.
struct NodeInfo {
   /// Check the servers of a node that has been computed and release its resources
   /// if they are no longer needed.
   // NOTE(review): this member's signature line was lost in extraction
   // (presumably `void decrementRemainingClients()` — confirm against the
   // original source). The body frees the buffer once no client needs it.
   {
      if (--remClients == 0) {
         delete buffer;
         buffer = nullptr;
      }
   }

   RooAbsArg *absArg = nullptr; // the graph node this info describes (non-owning)

   // NOTE(review): a member declaration was lost in extraction here
   // (presumably `Detail::AbsBuffer *buffer = nullptr;`, deleted above).
   std::size_t iNode = 0;                // index of this node in RooFitDriver::_nodes
   cudaEvent_t *event = nullptr;         // CUDA event signaled when this node's GPU work is done
   cudaEvent_t *eventStart = nullptr;    // CUDA event recorded before launching, used for timing
   cudaStream_t *stream = nullptr;       // CUDA stream this node's kernels are launched on
   std::chrono::microseconds cpuTime{0}; // measured CPU evaluation time (CPUOnly invocation)
   std::chrono::microseconds cudaTime{std::chrono::microseconds::max()}; // measured GPU time (GPUOnly invocation)
   int remClients = 0; // clients that still need this node's result in the current evaluation
   int remServers = 0; // servers not yet computed; -1 means running on GPU, -2 means done
   bool isScalar = false;            // output size is 1
   bool computeInGPU = false;        // node assigned to the GPU by markGPUNodes()
   bool copyAfterEvaluation = false; // result must be copied because a client runs on the other device
   bool fromDataset = false;         // value comes directly from the dataset spans
   bool isVariable = false;          // node is a RooRealVar
   bool isDirty = true;              // needs re-evaluation
   bool isCategory = false;          // node is a RooAbsCategory
   std::size_t outputSize = 1;       // number of output values this node produces
   std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max(); // last seen RooRealVar::valueResetCounter()
   std::size_t originalDataToken = 0; // data token to restore when the driver is destroyed
   double scalarBuffer = 0.0;         // storage for scalar (size-1) outputs
   std::vector<NodeInfo *> serverInfos; // value servers (inputs) of this node
   std::vector<NodeInfo *> clientInfos; // value clients (consumers) of this node

   // NOTE(review): destructor; the deletion calls for the CUDA events and the
   // stream were lost in extraction — only the null checks remain visible.
   {
      if (event)
      if (eventStart)
      if (stream)
   }
};
108
/// Construct a new RooFitDriver. The constructor analyzes and saves metadata about the graph,
/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
/// there's also some CUDA-related initialization.
///
/// \param[in] absReal The RooAbsReal object that sits on top of the
/// computation graph that we want to evaluate.
/// \param[in] normSet Normalization set for the evaluation
/// \param[in] batchMode The computation mode, accepted values are
/// `RooBatchCompute::Cpu` and `RooBatchCompute::Cuda`.
// NOTE(review): the constructor signature line was lost in extraction —
// confirm against the header declaration.
   : _batchMode{batchMode}
{
   // Wrap the top-level object so that normalization integrals are unfolded
   // into the computation graph (topNode() returns the unfolded arg).
   _integralUnfolder = std::make_unique<RooFit::NormalizationIntegralUnfolder>(absReal, normSet);

   // Initialize RooBatchCompute
   // NOTE(review): a statement was lost in extraction here (presumably
   // RooBatchCompute::init() — confirm against the original source).

   // Some checks and logging of used architectures
   // NOTE(review): a statement was lost in extraction here (presumably
   // logArchitectureInfo(_batchMode) — confirm against the original source).

   // Get the set of nodes in the computation graph. Do the detour via
   // RooArgList to avoid deduplication done after adding each element.
   RooArgList serverList;
   topNode().treeNodeServerList(&serverList, nullptr, true, true, false, true);
   // If we fill the servers in reverse order, they are approximately in
   // topological order so we save a bit of work in sortTopologically().
   RooArgSet serverSet;
   serverSet.add(serverList.rbegin(), serverList.rend(), /*silent=*/true);
   // Sort nodes topologically: the servers of any node will be before that
   // node in the collection.
   serverSet.sortTopologically();

   // One data-map slot per node; each node addresses its slot via its token.
   _dataMapCPU.resize(serverSet.size());
   _dataMapCUDA.resize(serverSet.size());

   std::unordered_map<TNamed const *, std::size_t> tokens;
   std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;

   // Fill the ordered nodes list and initialize the node info structs.
   _nodes.resize(serverSet.size());
   std::size_t iNode = 0;
   for (RooAbsArg *arg : serverSet) {

      tokens[arg->namePtr()] = iNode;

      auto &nodeInfo = _nodes[iNode];
      nodeInfo.absArg = arg;
      nodeInfo.iNode = iNode;
      nodeInfos[arg] = &nodeInfo;

      // Remember the original token so the destructor can restore it.
      nodeInfo.originalDataToken = arg->dataToken();
      arg->setDataToken(iNode);

      if (dynamic_cast<RooRealVar const *>(arg)) {
         nodeInfo.isVariable = true;
      }
      if (dynamic_cast<RooAbsCategory const *>(arg)) {
         nodeInfo.isCategory = true;
      }

      ++iNode;
   }

   // Wire up the client/server relations between the NodeInfo objects,
   // considering value servers only.
   for (NodeInfo &info : _nodes) {
      info.serverInfos.reserve(info.absArg->servers().size());
      for (RooAbsArg *server : info.absArg->servers()) {
         if (server->isValueServer(*info.absArg)) {
            auto *serverInfo = nodeInfos.at(server);
            info.serverInfos.emplace_back(serverInfo);
            serverInfo->clientInfos.emplace_back(&info);
         }
         server->setDataToken(tokens.at(server->namePtr()));
      }
   }

   // NOTE(review): the condition guarding this CUDA-only setup was lost in
   // extraction (presumably a check that _batchMode is Cuda).
      // create events and streams for every node
      for (auto &info : _nodes) {
         // NOTE(review): statements creating info.event and info.stream were
         // lost in extraction around the following line.
         info.eventStart = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
      }
   }
}
193
// NOTE(review): the first line of this setData() overload's signature was
// lost in extraction; the visible parameters continue below (the header
// declares data, rangeName, indexCatForSplitting, skipZeroWeights,
// takeGlobalObservablesFromData).
                            RooAbsCategory const *indexCatForSplitting, bool skipZeroWeights,
                            bool takeGlobalObservablesFromData)
{
   // Reset the owned vector buffers by swapping with an empty stack.
   std::stack<std::vector<double>>{}.swap(_vectorBuffers);
   DataSpansMap dataSpans = RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting,
                                                                      _vectorBuffers, skipZeroWeights);
   if (takeGlobalObservablesFromData && data.getGlobalObservables()) {
      // Snapshot the current values of the global observables into an owned
      // buffer and expose each one as a size-1 span.
      _vectorBuffers.emplace();
      auto &buffer = _vectorBuffers.top();
      // Reserve up front: the spans point into the buffer, so it must not
      // reallocate while being filled.
      buffer.reserve(data.getGlobalObservables()->size());
      for (auto *arg : static_range_cast<RooRealVar const *>(*data.getGlobalObservables())) {
         buffer.push_back(arg->getVal());
         dataSpans[arg] = RooSpan<const double>{&buffer.back(), 1};
      }
   }
   // Delegate to the span-based overload.
   setData(dataSpans);
}
213
// NOTE(review): the signature line of this overload (taking a DataSpansMap)
// was lost in extraction.
{
   // Iterate over the given data spans and add them to the data map. Check if
   // they are used in the computation graph. If yes, add the span to the data
   // map and set the node info accordingly.
   std::size_t totalSize = 0;
   for (auto &info : _nodes) {
      // Drop any buffer left over from a previous dataset.
      if (info.buffer) {
         delete info.buffer;
         info.buffer = nullptr;
      }
      auto found = dataSpans.find(info.absArg->namePtr());
      if (found != dataSpans.end()) {
         _dataMapCPU.at(info.absArg) = found->second;
         info.outputSize = found->second.size();
         info.fromDataset = true;
         info.isDirty = false;
         totalSize += info.outputSize;
      } else {
         info.outputSize = 1;
         info.fromDataset = false;
         info.isDirty = true;
      }
   }

   // NOTE(review): a statement was lost in extraction here (presumably a call
   // to determineOutputSizes(), which propagates sizes up to the clients).

   for (auto &info : _nodes) {
      // If the node has an output of size 1
      info.isScalar = info.outputSize == 1;

      // In principle we don't need dirty flag propagation because the driver
      // takes care of deciding which node needs to be re-evaluated. However,
      // disabling it also for scalar mode results in very long fitting times
      // for specific models (test 14 in stressRooFit), which still needs to be
      // understood. TODO.
      if (!info.isScalar) {
         setOperMode(info.absArg, RooAbsArg::ADirty);
      }
   }

   // Extra steps for initializing in cuda mode
   // NOTE(review): the condition guarding this early return was lost in
   // extraction (presumably: return unless _batchMode is Cuda).
      return;

   // copy observable data to the GPU
   // TODO: use separate buffers here
   _cudaMemDataset = static_cast<double *>(RooBatchCompute::dispatchCUDA->cudaMalloc(totalSize * sizeof(double)));
   size_t idx = 0;
   for (auto &info : _nodes) {
      if (!info.fromDataset)
         continue;
      std::size_t size = info.outputSize;
      _dataMapCUDA.at(info.absArg) = RooSpan<double>(_cudaMemDataset + idx, size);
      // NOTE(review): the start of the host-to-device memcpy call was lost in
      // extraction; only its trailing argument survives below.
                                                size * sizeof(double));
      idx += size;
   }
}
273
// NOTE(review): the destructor signature line (RooFitDriver::~RooFitDriver())
// was lost in extraction.
{
   // Restore the data tokens the args carried before this driver took over.
   for (auto &info : _nodes) {
      info.absArg->setDataToken(info.originalDataToken);
   }

   // NOTE(review): lines freeing CUDA resources (e.g. _cudaMemDataset) were
   // lost in extraction here; only the closing brace of that block remains.
   }
}
284
285std::vector<double> RooFitDriver::getValues()
286{
287 getVal();
288 NodeInfo const &nodeInfo = _nodes.back();
289 if (nodeInfo.computeInGPU) {
290 std::size_t nOut = nodeInfo.outputSize;
291 std::vector<double> out(nOut);
292 RooBatchCompute::dispatchCUDA->memcpyToCPU(out.data(), _dataMapCPU.at(&topNode()).data(), nOut * sizeof(double));
293 _dataMapCPU.at(&topNode()) = RooSpan<const double>(out.data(), nOut);
294 return out;
295 }
296 // We copy the data to the output vector
297 auto dataSpan = _dataMapCPU.at(&topNode());
298 std::vector<double> out;
299 out.reserve(dataSpan.size());
300 for (auto const &x : dataSpan) {
301 out.push_back(x);
302 }
303 return out;
304}
305
// NOTE(review): the signature line was lost in extraction; per the class
// declaration this is computeCPUNode(const RooAbsArg *node, NodeInfo &info).
{
   using namespace Detail;

   auto nodeAbsReal = static_cast<RooAbsReal const *>(node);

   const std::size_t nOut = info.outputSize;

   if (nOut == 1) {
      // NOTE(review): lines lost in extraction here — an inner block whose
      // closing brace survives below, likely registering info.scalarBuffer in
      // the CPU data map before evaluating into it.
      }
      nodeAbsReal->computeBatch(nullptr, &info.scalarBuffer, nOut, _dataMapCPU);
   } else {
      if (!info.buffer) {
         // NOTE(review): the buffer-allocation lines were lost in extraction
         // (likely makeCpuBuffer / makePinnedBuffer depending on whether the
         // result must be copied to the GPU afterwards — confirm).
      }
      double *buffer = info.buffer->cpuWritePtr();
      _dataMapCPU.at(node) = RooSpan<const double>(buffer, nOut);
      // compute node and measure the time the first time
      if (_getValInvocations == CPUOnly) {
         using namespace std::chrono;
         auto start = steady_clock::now();
         nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
         info.cpuTime = duration_cast<microseconds>(steady_clock::now() - start);
      } else {
         nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
      }
      if (info.copyAfterEvaluation) {
         // NOTE(review): a statement was lost here (likely publishing the
         // GPU-side span), and the body of the following if (likely a
         // cudaEventRecord call) was lost as well.
         if (info.event) {
         }
      }
   }
}
344
/// Returns the value of the top node in the computation graph
// NOTE(review): the signature line (double RooFitDriver::getVal()), the line
// advancing the invocation counter, and the CUDA-mode condition guarding the
// dispatch below were lost in extraction.
{

      return getValHeterogeneous();
   }

   // CPU path: walk the topologically sorted nodes and recompute only what
   // changed since the previous call.
   for (auto &nodeInfo : _nodes) {
      RooAbsArg *node = nodeInfo.absArg;
      if (!nodeInfo.fromDataset) {
         if (nodeInfo.isVariable) {
            auto *var = static_cast<RooRealVar const *>(node);
            // A variable is re-evaluated only when its value-reset counter
            // advanced; its direct clients are then flagged dirty.
            if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
               nodeInfo.lastSetValCount = var->valueResetCounter();
               for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
                  clientInfo->isDirty = true;
               }
               computeCPUNode(node, nodeInfo);
               nodeInfo.isDirty = false;
            }
         } else {
            // Non-variable nodes are recomputed when flagged dirty by a server.
            if (nodeInfo.isDirty) {
               for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
                  clientInfo->isDirty = true;
               }
               computeCPUNode(node, nodeInfo);
               nodeInfo.isDirty = false;
            }
         }
      }
   }

   // return the final value
   return _dataMapCPU.at(&topNode())[0];
}
382
/// Returns the value of the top node in the computation graph
// NOTE(review): the signature line (double RooFitDriver::getValHeterogeneous())
// was lost in extraction.
{
   // Reset per-evaluation bookkeeping and drop buffers from the previous call.
   for (auto &info : _nodes) {
      info.remClients = info.clientInfos.size();
      info.remServers = info.serverInfos.size();
      if (info.buffer)
         delete info.buffer;
      info.buffer = nullptr;
   }

   // In a cuda fit, use first 3 fits to determine the execution times
   // and the hardware that computes each part of the graph
   if (_getValInvocations <= Both) {
      // leave everything to be computed (and timed) in CPU in the 1st
      // invocation, and after the 3rd the GPU nodes are already marked.
      markGPUNodes();
   }

   // find initial GPU nodes and assign them to GPU
   for (auto &info : _nodes) {
      if (info.remServers == 0 && info.computeInGPU) {
         assignToGPU(info);
      }
   }

   // Event loop; remServers acts as a per-node state machine:
   // >0 waiting for servers, 0 ready, -1 running on the GPU, -2 done.
   NodeInfo const &topNodeInfo = _nodes.back();
   while (topNodeInfo.remServers != -2) {
      // find finished GPU nodes
      for (auto &info : _nodes) {
         if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->streamIsActive(info.stream)) {
            // On the GPU-timing invocation, add the measured kernel time.
            if (_getValInvocations == GPUOnly) {
               float ms = RooBatchCompute::dispatchCUDA->cudaEventElapsedTime(info.eventStart, info.event);
               info.cudaTime += std::chrono::microseconds{int(1000.0 * ms)};
            }
            info.remServers = -2;
            // Decrement number of remaining servers for clients and start GPU computations
            for (auto *infoClient : info.clientInfos) {
               --infoClient->remServers;
               if (infoClient->computeInGPU && infoClient->remServers == 0) {
                  assignToGPU(*infoClient);
               }
            }
            for (auto *serverInfo : info.serverInfos) {
               serverInfo->decrementRemainingClients();
            }
         }
      }

      // find next CPU node
      auto it = _nodes.begin();
      for (; it != _nodes.end(); it++) {
         if (it->remServers == 0 && !it->computeInGPU)
            break;
      }

      // if no CPU node available sleep for a while to save CPU usage
      if (it == _nodes.end()) {
         std::this_thread::sleep_for(std::chrono::milliseconds(1));
         continue;
      }

      // compute next CPU node
      NodeInfo &info = *it;
      RooAbsArg const *node = info.absArg;
      info.remServers = -2; // so that it doesn't get picked again

      if (!info.fromDataset) {
         computeCPUNode(node, info);
      }

      // Assign the clients that are computed on the GPU
      for (auto *infoClient : info.clientInfos) {
         if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
            assignToGPU(*infoClient);
         }
      }
      for (auto *serverInfo : info.serverInfos) {
         serverInfo->decrementRemainingClients();
      }
   }

   // return the final value
   return _dataMapCPU.at(&topNode())[0];
}
468
/// Assign a node to be computed in the GPU. Scan its clients and also assign them
/// in case they only depend on GPU nodes.
// NOTE(review): the signature line (void RooFitDriver::assignToGPU(NodeInfo &))
// was lost in extraction.
{
   using namespace Detail;

   auto node = static_cast<RooAbsReal const *>(info.absArg);

   const std::size_t nOut = info.outputSize;

   // Mark as "running on the GPU" for the scheduler in getValHeterogeneous().
   info.remServers = -1;
   // wait for every server to finish
   for (auto *infoServer : info.serverInfos) {
      if (infoServer->event)
      // NOTE(review): the statement executed for servers with an event was
      // lost in extraction (presumably cudaStreamWaitEvent on info.stream).
   }

   // NOTE(review): lines lost in extraction here — likely the GPU buffer
   // allocation and the recording of info.eventStart for timing.
   double *buffer = info.buffer->gpuWritePtr();
   _dataMapCUDA.at(node) = RooSpan<const double>(buffer, nOut);
   // measure launching overhead (add computation time later)
   if (_getValInvocations == GPUOnly) {
      using namespace std::chrono;
      // Only the host-side launch overhead is captured here; the kernel time
      // itself is added from the CUDA events in getValHeterogeneous().
      auto start = steady_clock::now();
      node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
      info.cudaTime = duration_cast<microseconds>(steady_clock::now() - start);
   } else {
      node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
   }
   // NOTE(review): a statement recording info.event was lost in extraction.
   if (info.copyAfterEvaluation) {
      // NOTE(review): the device-to-host copy statement was lost in extraction.
   }
}
505
506} // namespace Experimental
507} // namespace ROOT
508
509namespace {
510
/// This method simulates the computation of the whole graph and the time it takes
/// and decides what to compute in GPU. The decision is made on the basis of avoiding
/// leaving either the GPU or the CPU idle at any time, if possible, and on assigning
/// to each piece of hardware a computation that is significantly slower on the other part.
/// The nodes may be assigned to the non-efficient side (CPU or GPU) to prevent idleness
/// only if the absolute difference cpuTime-cudaTime does not exceed the diffThreshold.
std::chrono::microseconds simulateFit(std::vector<bool> &computeInGPU,
                                      std::vector<ROOT::Experimental::NodeInfo> const &nodes,
                                      std::chrono::microseconds h2dTime, std::chrono::microseconds d2hTime,
                                      std::chrono::microseconds diffThreshold)
{
   using namespace std::chrono;
   // NOTE(review): a line was lost in extraction here — most likely
   // `using ROOT::Experimental::NodeInfo;`, since NodeInfo is used
   // unqualified below. Confirm against the original source.

   // Start from the "everything that can go to the GPU goes there"
   // assignment; the simulation below moves nodes back to the CPU.
   for (std::size_t iNode = 0; iNode < nodes.size(); ++iNode) {
      auto const &info = nodes[iNode];
      computeInGPU[iNode] = !info.isScalar && info.absArg->canComputeBatchWithCuda();
   }

   // timeLaunched[i] is the simulated time at which node i was started;
   // a negative value means "not launched yet".
   std::vector<std::chrono::microseconds> timeLaunched;
   timeLaunched.reserve(nodes.size());

   std::size_t nNodes = nodes.size();
   // launch scalar nodes (assume they are computed in 0 time)
   for (auto &info : nodes) {
      if (info.isScalar) {
         nNodes--;
         timeLaunched.emplace_back(0);
      } else
         timeLaunched.emplace_back(-1);
   }

   NodeInfo const *cpuNode = nullptr;  // node currently running on the CPU in the simulation
   NodeInfo const *cudaNode = nullptr; // node currently running on the GPU in the simulation
   microseconds simulatedTime{0};
   while (nNodes) {
      // Among the schedulable nodes, the most negative diff (cpu faster) is
      // the best CPU candidate, the most positive the best GPU candidate.
      microseconds minDiff = microseconds::max(), maxDiff = -minDiff; // diff = cpuTime - cudaTime
      NodeInfo const *cpuCandidate = nullptr;
      NodeInfo const *cudaCandidate = nullptr;
      microseconds cpuDelay{};
      microseconds cudaDelay{};
      for (std::size_t iNode = 0; iNode < nodes.size(); ++iNode) {
         auto &info = nodes[iNode];
         if (timeLaunched[iNode] >= microseconds{0}) {
            // already launched
            continue;
         }
         microseconds diff{info.cpuTime - info.cudaTime}, cpuWait{0}, cudaWait{0};

         bool goToNextCandidate = false;

         for (auto *serverInfo : info.serverInfos) {
            if (serverInfo->isScalar)
               continue;

            // dependencies not computed yet
            if (timeLaunched[serverInfo->iNode] < microseconds{0}) {
               goToNextCandidate = true;
               break;
            }
            // Account for the transfer delay when a server ran on the other
            // piece of hardware than the one being considered.
            if (computeInGPU[serverInfo->iNode]) {
               cpuWait =
                  std::max(cpuWait, timeLaunched[serverInfo->iNode] + serverInfo->cudaTime + d2hTime - simulatedTime);
            } else {
               cudaWait =
                  std::max(cudaWait, timeLaunched[serverInfo->iNode] + serverInfo->cpuTime + h2dTime - simulatedTime);
            }
         }

         if (goToNextCandidate) {
            continue;
         }

         diff += cpuWait - cudaWait;
         if (diff < minDiff) {
            minDiff = diff;
            cpuDelay = cpuWait;
            cpuCandidate = &info;
         }
         if (diff > maxDiff && info.absArg->canComputeBatchWithCuda()) {
            maxDiff = diff;
            cudaDelay = cudaWait;
            cudaCandidate = &info;
         }
      }

      // Reject a candidate whose own timing strongly favors the other device
      // (beyond the idleness threshold).
      auto calcDiff = [](const NodeInfo *nodeInfo) { return nodeInfo->cpuTime - nodeInfo->cudaTime; };
      if (cpuCandidate && calcDiff(cpuCandidate) > diffThreshold)
         cpuCandidate = nullptr;
      if (cudaCandidate && -calcDiff(cudaCandidate) > diffThreshold)
         cudaCandidate = nullptr;
      // don't compute same node twice
      if (cpuCandidate == cudaCandidate && !cpuNode && !cudaNode) {
         if (minDiff < microseconds{0})
            cudaCandidate = nullptr;
         else
            cpuCandidate = nullptr;
      }
      if (cpuCandidate && !cpuNode) {
         cpuNode = cpuCandidate;
         timeLaunched[cpuNode->iNode] = simulatedTime + cpuDelay;
         computeInGPU[cpuNode->iNode] = false;
         nNodes--;
      }
      if (cudaCandidate && !cudaNode) {
         cudaNode = cudaCandidate;
         timeLaunched[cudaNode->iNode] = simulatedTime + cudaDelay;
         computeInGPU[cudaNode->iNode] = true;
         nNodes--;
      }

      // Advance simulated time to whichever device finishes first, then mark
      // that device free for the next round.
      microseconds etaCPU{microseconds::max()}, etaCUDA{microseconds::max()};
      if (cpuNode) {
         etaCPU = timeLaunched[cpuNode->iNode] + cpuNode->cpuTime;
      }
      if (cudaNode) {
         etaCUDA = timeLaunched[cudaNode->iNode] + cudaNode->cudaTime;
      }
      simulatedTime = std::min(etaCPU, etaCUDA);
      if (etaCPU < etaCUDA)
         cpuNode = nullptr;
      else
         cudaNode = nullptr;
   } // while(nNodes)
   return simulatedTime;
}
637
638std::vector<bool> selectNodesForGPU(std::vector<ROOT::Experimental::NodeInfo> const &nodes)
639{
640 using namespace std::chrono;
641
642 // Assign nodes to GPU using a greedy algorithm: for the number of bytes
643 // in this benchmark we take the maximum size of spans in the dataset.
644 std::size_t nEvents = 1;
645 for (auto const &node : nodes) {
646 nEvents = std::max(nEvents, node.outputSize);
647 }
648
649 auto transferTimes = RooFit::CUDAHelpers::memcpyBenchmark(nEvents * sizeof(double));
650
651 microseconds h2dTime = transferTimes.first;
652 microseconds d2hTime = transferTimes.second;
653 COUT_DEBUG << "------Copying times------\n";
654 COUT_DEBUG << "h2dTime=" << h2dTime.count() << "us\td2hTime=" << d2hTime.count() << "us\n";
655
656 std::vector<microseconds> diffTimes;
657 for (auto &info : nodes) {
658 if (!info.isScalar)
659 diffTimes.push_back(info.cpuTime - info.cudaTime);
660 }
661 microseconds bestTime = microseconds::max();
662 microseconds bestThreshold{};
663 microseconds ret;
664 std::vector<bool> computeInGPU(nodes.size());
665 for (auto &threshold : diffTimes) {
666 if ((ret = simulateFit(computeInGPU, nodes, h2dTime, d2hTime, microseconds{std::abs(threshold.count())})) <
667 bestTime) {
668 bestTime = ret;
669 bestThreshold = threshold;
670 }
671 }
672 // finalize the marking of the best configuration
673 simulateFit(computeInGPU, nodes, h2dTime, d2hTime, microseconds{std::abs(bestThreshold.count())});
674
675 COUT_DEBUG << "Best threshold = " << bestThreshold.count() << " us" << std::endl;
676
677 return computeInGPU;
678}
679
680} // namespace
681
682namespace ROOT {
683namespace Experimental {
684
/// Decides which nodes are assigned to the GPU in a cuda fit. In the 1st iteration,
/// everything is computed in CPU for measuring the CPU time. In the 2nd iteration,
/// everything is computed in GPU (if possible) to measure the GPU time.
/// In the 3rd iteration, simulate the computation of the graph by calling simulateFit
/// with every distinct threshold found as timeDiff within the nodes of the graph and select
/// the best configuration. In the end, mark the nodes and handle the details accordingly.
// NOTE(review): the signature line (void RooFitDriver::markGPUNodes()) was
// lost in extraction.
{
   if (_getValInvocations == CPUOnly) {
      // CPU-timing invocation: run everything on the CPU.
      for (auto &info : _nodes) {
         info.copyAfterEvaluation = false;
         info.computeInGPU = false;
      }
   } else if (_getValInvocations == Both) {
      // Both timings are now available: run the scheduling simulation.
      auto computeInGPU = selectNodesForGPU(_nodes);
      for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
         _nodes[iNode].copyAfterEvaluation = false;
         _nodes[iNode].computeInGPU = computeInGPU[iNode];
      }

      // deletion of the timing events (to be replaced later by non-timing events)
      for (auto &info : _nodes) {
         // NOTE(review): the deleteCudaEvent calls for info.event and
         // info.eventStart were lost in extraction before the next line.
         info.event = info.eventStart = nullptr;
      }
   } else {
      // compute (and time) as much as possible in GPU
      for (auto &info : _nodes) {
         info.copyAfterEvaluation = false;
         info.computeInGPU = !info.isScalar && info.absArg->canComputeBatchWithCuda();
      }
   }

   // A result must be copied whenever a client runs on different hardware
   // than the node that produced it.
   for (auto &info : _nodes) {
      // scalar nodes don't need copying
      if (!info.isScalar) {
         for (auto *clientInfo : info.clientInfos) {
            if (info.computeInGPU != clientInfo->computeInGPU) {
               info.copyAfterEvaluation = true;
               break;
            }
         }
      }
   }

   // restore a cudaEventDisableTiming event when necessary
   if (_getValInvocations == Both) {
      for (auto &info : _nodes) {
         if (info.computeInGPU || info.copyAfterEvaluation)
            info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);
      }

      COUT_DEBUG << "------Nodes------\t\t\t\tCpu time: \t Cuda time\n";
      for (auto &info : _nodes) {
         COUT_DEBUG << std::setw(20) << info.absArg->GetName() << "\t" << info.absArg << "\t"
                    << (info.computeInGPU ? "CUDA" : "CPU") << "\t" << info.cpuTime.count() << "us\t"
                    << info.cudaTime.count() << "us\n";
      }
   }
}
746
// NOTE(review): the signature line (presumably
// void RooFitDriver::determineOutputSizes()) was lost in extraction.
{
   // A node's output size is the maximum of its value servers' output sizes,
   // unless the node is a reducer, which collapses its inputs to size 1.
   for (auto &argInfo : _nodes) {
      for (auto *serverInfo : argInfo.serverInfos) {
         // NOTE(review): the isReducerNode() check is invariant over the inner
         // loop and could be hoisted for clarity/speed.
         if (!argInfo.absArg->isReducerNode()) {
            argInfo.outputSize = std::max(serverInfo->outputSize, argInfo.outputSize);
         }
      }
   }
}
757
/// Temporarily change the operation mode of a RooAbsArg until the
/// RooFitDriver gets deleted.
// NOTE(review): the signature line (void RooFitDriver::setOperMode(RooAbsArg
// *arg, RooAbsArg::OperMode opMode)) was lost in extraction.
{
   if (opMode != arg->operMode()) {
      // The RAII object stored here restores the original mode when the
      // driver (and hence the stack) is destroyed.
      _changeOperModeRAIIs.emplace(arg, opMode);
   }
}
766
// NOTE(review): the signature line was lost in extraction (presumably
// RooAbsReal &RooFitDriver::topNode() const — confirm against the header).
{
   // The unfolded top-level object heading the computation graph.
   return static_cast<RooAbsReal &>(_integralUnfolder->arg());
}
771
772} // namespace Experimental
773} // namespace ROOT
#define COUT_DEBUG
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
virtual double const * gpuReadPtr() const =0
virtual double const * cpuReadPtr() const =0
AbsBuffer * makeCpuBuffer(std::size_t size)
Definition: Buffers.cxx:240
AbsBuffer * makePinnedBuffer(std::size_t size, cudaStream_t *stream=nullptr)
Definition: Buffers.cxx:248
AbsBuffer * makeGpuBuffer(std::size_t size)
Definition: Buffers.cxx:244
std::map< RooFit::Detail::DataKey, RooSpan< const double > > DataSpansMap
Definition: RooFitDriver.h:44
const RooFit::BatchModeOption _batchMode
Definition: RooFitDriver.h:78
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the RooFitDriver gets deleted.
RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode=RooFit::BatchModeOption::Cpu)
Construct a new RooFitDriver.
RooFit::Detail::DataMap _dataMapCPU
Definition: RooFitDriver.h:83
double getValHeterogeneous()
Returns the value of the top node in the computation graph.
double getVal()
Returns the value of the top node in the computation graph.
RooFit::Detail::DataMap _dataMapCUDA
Definition: RooFitDriver.h:84
std::unique_ptr< RooFit::NormalizationIntegralUnfolder > _integralUnfolder
Definition: RooFitDriver.h:91
std::vector< NodeInfo > _nodes
Definition: RooFitDriver.h:87
std::vector< double > getValues()
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
Detail::BufferManager _bufferManager
Definition: RooFitDriver.h:76
std::stack< std::vector< double > > _vectorBuffers
Definition: RooFitDriver.h:90
void markGPUNodes()
Decides which nodes are assigned to the GPU in a cuda fit.
std::stack< RooHelpers::ChangeOperModeRAII > _changeOperModeRAIIs
Definition: RooFitDriver.h:94
void setData(RooAbsData const &data, std::string_view rangeName="", RooAbsCategory const *indexCatForSplitting=nullptr, bool skipZeroWeights=false, bool takeGlobalObservablesFromData=true)
RooAbsArg is the common abstract base class for objects that represent a value and a "shape" in RooFi...
Definition: RooAbsArg.h:71
const RefCountList_t & servers() const
List of all servers of this object.
Definition: RooAbsArg.h:198
void treeNodeServerList(RooAbsCollection *list, const RooAbsArg *arg=nullptr, bool doBranch=true, bool doLeaf=true, bool valueOnly=false, bool recurseNonDerived=false) const
Fill supplied list with nodes of the arg tree, following all server links, starting with ourself as t...
Definition: RooAbsArg.cxx:499
OperMode operMode() const
Query the operation mode of this node.
Definition: RooAbsArg.h:484
A space to attach TBranches.
void sortTopologically()
Sort collection topologically: the servers of any RooAbsArg will be before that RooAbsArg in the coll...
Storage_t::const_reverse_iterator rend() const
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::const_reverse_iterator rbegin() const
Storage_t::size_type size() const
RooAbsData is the common abstract base class for binned and unbinned datasets.
Definition: RooAbsData.h:62
RooAbsReal is the common abstract base class for objects that represent a real value and implements f...
Definition: RooAbsReal.h:62
RooArgList is a container object that can hold multiple RooAbsArg objects.
Definition: RooArgList.h:22
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition: RooArgSet.h:56
virtual void cudaEventRecord(cudaEvent_t *, cudaStream_t *)
virtual cudaEvent_t * newCudaEvent(bool)
virtual float cudaEventElapsedTime(cudaEvent_t *, cudaEvent_t *)
virtual void memcpyToCPU(void *, const void *, size_t, cudaStream_t *=nullptr)
virtual void cudaStreamWaitEvent(cudaStream_t *, cudaEvent_t *)
virtual bool streamIsActive(cudaStream_t *)
virtual void deleteCudaStream(cudaStream_t *)
virtual void deleteCudaEvent(cudaEvent_t *)
virtual void memcpyToCUDA(void *, const void *, size_t, cudaStream_t *=nullptr)
auto & at(RooAbsArg const *arg, RooAbsArg const *=nullptr)
Definition: DataMap.h:88
auto resize(std::size_t n)
Definition: DataMap.h:86
RooRealVar represents a variable that can be changed from the outside.
Definition: RooRealVar.h:40
std::size_t valueResetCounter() const
Returns how many times the value of this RooRealVar was reset.
Definition: RooRealVar.h:59
A simple container to hold a batch of data values.
Definition: RooSpan.h:34
RVec< PromoteType< T > > abs(const RVec< T > &v)
Definition: RVec.hxx:1756
Double_t x[n]
Definition: legend1.C:17
basic_string_view< char > string_view
This file contains a specialised ROOT message handler to test for diagnostic in unit tests.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
void init()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
std::map< RooFit::Detail::DataKey, RooSpan< const double > > getDataSpans(RooAbsData const &data, std::string_view rangeName, RooAbsCategory const *indexCat, std::stack< std::vector< double > > &buffers, bool skipZeroWeights)
Extract all content from a RooFit datasets as a map of spans.
void logArchitectureInfo(RooFit::BatchModeOption batchMode)
std::pair< std::chrono::microseconds, std::chrono::microseconds > memcpyBenchmark(std::size_t nBytes)
Measure the time for transfering data from host to device and vice-versa.
Definition: CUDAHelpers.cxx:18
BatchModeOption
For setting the batch mode flag with the BatchMode() command argument to RooAbsPdf::fitTo();.
Definition: RooGlobalFunc.h:68
static constexpr double ms
A struct used by the RooFitDriver to store information on the RooAbsArgs in the computation graph.
std::vector< NodeInfo * > serverInfos
std::chrono::microseconds cudaTime
std::vector< NodeInfo * > clientInfos
std::chrono::microseconds cpuTime
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer n...
Detail::AbsBuffer * buffer