RooFitDriver.cxx
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file RooFitDriver.cxx
16\class RooFitDriver
17\ingroup Roofitcore
18
19This class can evaluate a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer during a fit. The class scans the
22dependencies and schedules the computations in a safe and efficient way. The
23computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The RooFitDriver class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
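
A minimal usage sketch (the names `nll`, `normSet` and `data` are placeholders; in a
normal fit the driver is created internally by RooAbsPdf::fitTo() when a batch mode is
selected, rather than by the user):
\code{.cpp}
ROOT::Experimental::RooFitDriver driver{nll, normSet, RooFit::BatchModeOption::Cpu};
driver.setData(data);            // register the dataset with the computation graph
double value = driver.getVal();  // evaluate the top node
\endcode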
27**/
28
29#include <RooFitDriver.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooArgList.h>
36#include <RooBatchCompute.h>
37#include <RooMsgService.h>
41#include <RooFit/CUDAHelpers.h>
42
44
45#include <iomanip>
46#include <numeric>
47#include <thread>
48
49namespace ROOT {
50namespace Experimental {
51
52/// A struct used by the RooFitDriver to store information on the RooAbsArgs in
53/// the computation graph.
54struct NodeInfo {
55 /// Check the servers of a node that has been computed and release its resources
56 /// if they are no longer needed.
57 void decrementRemainingClients()
58 {
59 if (--remClients == 0) {
60 delete buffer;
61 buffer = nullptr;
62 }
63 }
64
65 RooAbsArg *absArg = nullptr;
66
67 Detail::AbsBuffer *buffer = nullptr;
68
69 cudaEvent_t *event = nullptr;
70 cudaEvent_t *eventStart = nullptr;
71 cudaStream_t *stream = nullptr;
72 std::chrono::microseconds cpuTime{0};
73 std::chrono::microseconds cudaTime{std::chrono::microseconds::max()};
74 std::chrono::microseconds timeLaunched{-1};
75 int remClients = 0;
76 int remServers = 0;
77 bool isScalar = false;
78 bool computeInGPU = false;
79 bool copyAfterEvaluation = false;
80 bool fromDataset = false;
81 bool isVariable = false;
82 bool isDirty = true;
83 bool isCategory = false;
84 std::size_t outputSize = 1;
85 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
86 std::size_t originalDataToken = 0;
87 double scalarBuffer = 0.;
88 std::vector<NodeInfo *> serverInfos;
89 std::vector<NodeInfo *> clientInfos;
90
91 ~NodeInfo()
92 {
93 if (event)
94 RooBatchCompute::dispatchCUDA->deleteCudaEvent(event);
95 if (eventStart)
96 RooBatchCompute::dispatchCUDA->deleteCudaEvent(eventStart);
97 if (stream)
98 RooBatchCompute::dispatchCUDA->deleteCudaStream(stream);
99 }
100};
101
102/// Construct a new RooFitDriver. The constructor analyzes and saves metadata about the graph,
103/// which is later used for its evaluation. In case the CUDA mode is selected,
104/// there's also some CUDA-related initialization.
105///
106/// \param[in] absReal The RooAbsReal object that sits on top of the
107/// computation graph that we want to evaluate.
108/// \param[in] normSet Normalization set for the evaluation
109/// \param[in] batchMode The computation mode, accepted values are
110/// `RooBatchCompute::Cpu` and `RooBatchCompute::Cuda`.
111RooFitDriver::RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, RooFit::BatchModeOption batchMode)
112 : _batchMode{batchMode}
113{
114 _integralUnfolder = std::make_unique<RooFit::NormalizationIntegralUnfolder>(absReal, normSet);
115
116 // Initialize RooBatchCompute
117 RooBatchCompute::init();
118
119 // Some checks and logging of used architectures
120 RooFit::BatchModeHelpers::logArchitectureInfo(_batchMode);
121
122 // Get the set of nodes in the computation graph. Do the detour via
123 // RooArgList to avoid deduplication done after adding each element.
124 RooArgList serverList;
125 topNode().treeNodeServerList(&serverList, nullptr, true, true, false, true);
126 // If we fill the servers in reverse order, they are approximately in
127 // topological order so we save a bit of work in sortTopologically().
128 RooArgSet serverSet;
129 serverSet.add(serverList.rbegin(), serverList.rend(), /*silent=*/true);
130 // Sort nodes topologically: the servers of any node will be before that
131 // node in the collection.
132 serverSet.sortTopologically();
133
134 _dataMapCPU.resize(serverSet.size());
135 _dataMapCUDA.resize(serverSet.size());
136
137 std::unordered_map<TNamed const *, std::size_t> tokens;
138 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
139
140 // Fill the ordered nodes list and initialize the node info structs.
141 _nodes.resize(serverSet.size());
142 std::size_t iNode = 0;
143 for (RooAbsArg *arg : serverSet) {
144
145 tokens[arg->namePtr()] = iNode;
146
147 auto &nodeInfo = _nodes[iNode];
148 nodeInfo.absArg = arg;
149 nodeInfos[arg] = &nodeInfo;
150
151 nodeInfo.originalDataToken = arg->dataToken();
152 arg->setDataToken(iNode);
153
154 if (dynamic_cast<RooRealVar const *>(arg)) {
155 nodeInfo.isVariable = true;
156 }
157 if (dynamic_cast<RooAbsCategory const *>(arg)) {
158 nodeInfo.isCategory = true;
159 }
160
161 ++iNode;
162 }
163
164 for (NodeInfo &info : _nodes) {
165 info.serverInfos.reserve(info.absArg->servers().size());
166 for (RooAbsArg *server : info.absArg->servers()) {
167 if (server->isValueServer(*info.absArg)) {
168 auto *serverInfo = nodeInfos.at(server);
169 info.serverInfos.emplace_back(serverInfo);
170 serverInfo->clientInfos.emplace_back(&info);
171 }
172 server->setDataToken(tokens.at(server->namePtr()));
173 }
174 }
175
176 if (_batchMode == RooFit::BatchModeOption::Cuda) {
177 // create events and streams for every node
178 for (auto &info : _nodes) {
179 info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
180 info.eventStart = RooBatchCompute::dispatchCUDA->newCudaEvent(true);
181 info.stream = RooBatchCompute::dispatchCUDA->newCudaStream();
182 }
183 }
184}
185
186void RooFitDriver::setData(RooAbsData const &data, std::string_view rangeName,
187 RooAbsCategory const *indexCatForSplitting, bool skipZeroWeights,
188 bool takeGlobalObservablesFromData)
189{
190
191 std::stack<std::vector<double>>{}.swap(_vectorBuffers);
192 DataSpansMap dataSpans = RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting,
193 _vectorBuffers, skipZeroWeights);
194 if (takeGlobalObservablesFromData && data.getGlobalObservables()) {
195 _vectorBuffers.emplace();
196 auto &buffer = _vectorBuffers.top();
197 buffer.reserve(data.getGlobalObservables()->size());
198 for (auto *arg : static_range_cast<RooRealVar const *>(*data.getGlobalObservables())) {
199 buffer.push_back(arg->getVal());
200 dataSpans[arg] = RooSpan<const double>{&buffer.back(), 1};
201 }
202 }
203 setData(dataSpans);
204}
205
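/// Set the input data directly from a map of spans. Spans that correspond to nodes of
/// the computation graph are registered in the CPU data map, the node output sizes are
/// updated, and in CUDA mode the observable data is also copied to the GPU.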
206void RooFitDriver::setData(DataSpansMap const &dataSpans)
207{
208 // Iterate over the given data spans and add them to the data map. Check if
209 // they are used in the computation graph. If yes, add the span to the data
210 // map and set the node info accordingly.
211 std::size_t totalSize = 0;
212 for (auto &info : _nodes) {
213 if (info.buffer) {
214 delete info.buffer;
215 info.buffer = nullptr;
216 }
217 auto found = dataSpans.find(info.absArg->namePtr());
218 if (found != dataSpans.end()) {
219 _dataMapCPU.at(info.absArg) = found->second;
220 info.outputSize = found->second.size();
221 info.fromDataset = true;
222 info.isDirty = false;
223 totalSize += info.outputSize;
224 } else {
225 info.outputSize = 1;
226 info.fromDataset = false;
227 info.isDirty = true;
228 }
229 }
230
231 determineOutputSizes();
232
233 for (auto &info : _nodes) {
234 // If the node has an output of size 1
235 info.isScalar = info.outputSize == 1;
236
237 // In principle we don't need dirty flag propagation because the driver
238 // takes care of deciding which node needs to be re-evaluated. However,
239 // disabling it also for scalar mode results in very long fitting times
240 // for specific models (test 14 in stressRooFit), which still needs to be
241 // understood. TODO.
242 if (!info.isScalar) {
243 setOperMode(info.absArg, RooAbsArg::ADirty);
244 }
245 }
246
247 // Extra steps for initializing in cuda mode
248 if (_batchMode != RooFit::BatchModeOption::Cuda)
249 return;
250
251 // copy observable data to the GPU
252 // TODO: use separate buffers here
253 _cudaMemDataset = static_cast<double *>(RooBatchCompute::dispatchCUDA->cudaMalloc(totalSize * sizeof(double)));
254 size_t idx = 0;
255 for (auto &info : _nodes) {
256 if (!info.fromDataset)
257 continue;
258 std::size_t size = info.outputSize;
259 _dataMapCUDA.at(info.absArg) = RooSpan<double>(_cudaMemDataset + idx, size);
260 RooBatchCompute::dispatchCUDA->memcpyToCUDA(_cudaMemDataset + idx, _dataMapCPU.at(info.absArg).data(),
261 size * sizeof(double));
262 idx += size;
263 }
264}
265
266RooFitDriver::~RooFitDriver()
267{
268 for (auto &info : _nodes) {
269 info.absArg->setDataToken(info.originalDataToken);
270 }
271
272 if (_batchMode == RooFit::BatchModeOption::Cuda) {
273 RooBatchCompute::dispatchCUDA->cudaFree(_cudaMemDataset);
274 }
275}
276
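/// Evaluate the computation graph and return the values of the top node as a vector.
/// If the top node was computed on the GPU, the results are copied back to the host first.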
277std::vector<double> RooFitDriver::getValues()
278{
279 getVal();
280 NodeInfo const &nodeInfo = _nodes.back();
281 if (nodeInfo.computeInGPU) {
282 std::size_t nOut = nodeInfo.outputSize;
283 std::vector<double> out(nOut);
284 RooBatchCompute::dispatchCUDA->memcpyToCPU(out.data(), _dataMapCPU.at(&topNode()).data(), nOut * sizeof(double));
285 _dataMapCPU.at(&topNode()) = RooSpan<const double>(out.data(), nOut);
286 return out;
287 }
288 // We copy the data to the output vector
289 auto dataSpan = _dataMapCPU.at(&topNode());
290 std::vector<double> out;
291 out.reserve(dataSpan.size());
292 for (auto const &x : dataSpan) {
293 out.push_back(x);
294 }
295 return out;
296}
297
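/// Compute the output of a single node with the CPU implementation of RooBatchCompute.
/// On the first invocation the evaluation is timed, so that markGPUNodes() can later
/// decide on which device the node should be scheduled.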
298void RooFitDriver::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
299{
300 using namespace Detail;
301
302 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
303
304 const std::size_t nOut = info.outputSize;
305
306 if (nOut == 1) {
307 _dataMapCPU.at(node) = RooSpan<const double>(&info.scalarBuffer, nOut);
308 if (_batchMode == RooFit::BatchModeOption::Cuda) {
309 _dataMapCUDA.at(node) = RooSpan<const double>(&info.scalarBuffer, nOut);
310 }
311 nodeAbsReal->computeBatch(nullptr, &info.scalarBuffer, nOut, _dataMapCPU);
312 } else {
313 if (!info.buffer) {
314 info.buffer = info.copyAfterEvaluation ? _bufferManager.makePinnedBuffer(nOut, info.stream)
315 : _bufferManager.makeCpuBuffer(nOut);
316 }
317 double *buffer = info.buffer->cpuWritePtr();
318 _dataMapCPU.at(node) = RooSpan<const double>(buffer, nOut);
319 // compute node and measure the time the first time
320 if (_getValInvocations == 1) {
321 using namespace std::chrono;
322 auto start = steady_clock::now();
323 nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
324 info.cpuTime = duration_cast<microseconds>(steady_clock::now() - start);
325 } else {
326 nodeAbsReal->computeBatch(nullptr, buffer, nOut, _dataMapCPU);
327 }
328 if (info.copyAfterEvaluation) {
329 _dataMapCUDA.at(node) = RooSpan<const double>(info.buffer->gpuReadPtr(), nOut);
330 if (info.event) {
331 RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
332 }
333 }
334 }
335}
336
337/// Returns the value of the top node in the computation graph
338double RooFitDriver::getVal()
339{
340 ++_getValInvocations;
341
342 if (_batchMode == RooFit::BatchModeOption::Cuda) {
343 return getValHeterogeneous();
344 }
345
346 for (auto &nodeInfo : _nodes) {
347 RooAbsArg *node = nodeInfo.absArg;
348 if (!nodeInfo.fromDataset) {
349 if (nodeInfo.isVariable) {
350 auto *var = static_cast<RooRealVar const *>(node);
351 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
352 nodeInfo.lastSetValCount = var->valueResetCounter();
353 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
354 clientInfo->isDirty = true;
355 }
356 computeCPUNode(node, nodeInfo);
357 nodeInfo.isDirty = false;
358 }
359 } else {
360 if (nodeInfo.isDirty) {
361 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
362 clientInfo->isDirty = true;
363 }
364 computeCPUNode(node, nodeInfo);
365 nodeInfo.isDirty = false;
366 }
367 }
368 }
369 }
370
371 // return the final value
372 return _dataMapCPU.at(&topNode())[0];
373}
374
375/// Returns the value of the top node in the computation graph
376double RooFitDriver::getValHeterogeneous()
377{
378 for (auto &info : _nodes) {
379 info.remClients = info.clientInfos.size();
380 info.remServers = info.serverInfos.size();
381 }
382
383 // In a cuda fit, use first 3 fits to determine the execution times
384 // and the hardware that computes each part of the graph
385 if (_getValInvocations <= 3)
386 markGPUNodes();
387
388 // find initial gpu nodes and assign them to gpu
389 for (auto &info : _nodes) {
390 if (info.remServers == 0 && info.computeInGPU) {
391 assignToGPU(info);
392 }
393 }
394
395 NodeInfo const &topNodeInfo = _nodes.back();
396 while (topNodeInfo.remServers != -2) {
397 // find finished gpu nodes
398 for (auto &info : _nodes) {
399 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->streamIsActive(info.stream)) {
400 if (_getValInvocations == 2) {
401 float ms = RooBatchCompute::dispatchCUDA->cudaEventElapsedTime(info.eventStart, info.event);
402 info.cudaTime += std::chrono::microseconds{int(1000.0 * ms)};
403 }
404 info.remServers = -2;
405 // Decrement number of remaining servers for clients and start GPU computations
406 for (auto *infoClient : info.clientInfos) {
407 --infoClient->remServers;
408 if (infoClient->computeInGPU && infoClient->remServers == 0) {
409 assignToGPU(*infoClient);
410 }
411 }
412 for (auto *serverInfo : info.serverInfos) {
413 serverInfo->decrementRemainingClients();
414 }
415 }
416 }
417
418 // find next CPU node
419 auto it = _nodes.begin();
420 for (; it != _nodes.end(); it++) {
421 if (it->remServers == 0 && !it->computeInGPU)
422 break;
423 }
424
425 // if no CPU node available sleep for a while to save CPU usage
426 if (it == _nodes.end()) {
427 std::this_thread::sleep_for(std::chrono::milliseconds(1));
428 continue;
429 }
430
431 // compute next CPU node
432 NodeInfo &info = *it;
433 RooAbsArg const *node = info.absArg;
434 info.remServers = -2; // so that it doesn't get picked again
435
436 if (!info.fromDataset) {
437 computeCPUNode(node, info);
438 }
439
440 // Assign the clients that are computed on the GPU
441 for (auto *infoClient : info.clientInfos) {
442 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
443 assignToGPU(*infoClient);
444 }
445 }
446 for (auto *serverInfo : info.serverInfos) {
447 serverInfo->decrementRemainingClients();
448 }
449 }
450
451 // return the final value
452 return _dataMapCPU.at(&topNode())[0];
453}
454
455/// Assign a node to be computed in the GPU. Scan its clients and also assign them
456/// in case they only depend on gpu nodes.
457void RooFitDriver::assignToGPU(NodeInfo &info)
458{
459 using namespace Detail;
460
461 auto node = static_cast<RooAbsReal const *>(info.absArg);
462
463 const std::size_t nOut = info.outputSize;
464
465 info.remServers = -1;
466 // wait for every server to finish
467 for (auto *infoServer : info.serverInfos) {
468 if (infoServer->event)
469 RooBatchCompute::dispatchCUDA->cudaStreamWaitEvent(info.stream, infoServer->event);
470 }
471
472 info.buffer = info.copyAfterEvaluation ? _bufferManager.makePinnedBuffer(nOut, info.stream)
473 : _bufferManager.makeGpuBuffer(nOut);
474 double *buffer = info.buffer->gpuWritePtr();
475 _dataMapCUDA.at(node) = RooSpan<const double>(buffer, nOut);
476 // measure launching overhead (add computation time later)
477 if (_getValInvocations == 2) {
478 using namespace std::chrono;
479 RooBatchCompute::dispatchCUDA->cudaEventRecord(info.eventStart, info.stream);
480 auto start = steady_clock::now();
481 node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
482 info.cudaTime = duration_cast<microseconds>(steady_clock::now() - start);
483 } else
484 node->computeBatch(info.stream, buffer, nOut, _dataMapCUDA);
485 RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
486 if (info.copyAfterEvaluation) {
487 _dataMapCPU.at(node) = RooSpan<const double>(info.buffer->cpuReadPtr(), nOut);
488 }
489}
490
491/// This method simulates the computation of the whole graph and the time it takes,
492/// and decides what to compute in gpu. The decision is made on the basis of avoiding
493/// leaving either the gpu or the cpu idle at any time, if possible, and on assigning
494/// to each piece of hardware a computation that is significantly slower on the other part.
495/// The nodes may be assigned to the non-efficient side (cpu or gpu) to prevent idleness
496/// only if the absolute difference cpuTime-cudaTime does not exceed the diffThreshold.
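/// For illustration (hypothetical timings): a node with cpuTime = 100us and cudaTime = 40us
/// has a difference of +60us; with diffThreshold = 50us it stays a GPU-only candidate even
/// if the CPU is idle, whereas a node with a difference of +30us may be scheduled on the
/// CPU to keep both devices busy.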
497std::chrono::microseconds RooFitDriver::simulateFit(std::chrono::microseconds h2dTime,
498 std::chrono::microseconds d2hTime,
499 std::chrono::microseconds diffThreshold)
500{
501 using namespace std::chrono;
502
503 std::size_t nNodes = _nodes.size();
504 // launch scalar nodes (assume they are computed in 0 time)
505 for (auto &info : _nodes) {
506 if (info.isScalar) {
507 nNodes--;
508 info.timeLaunched = microseconds{0};
509 } else
510 info.timeLaunched = microseconds{-1};
511 }
512
513 NodeInfo *cpuNode = nullptr;
514 NodeInfo *cudaNode = nullptr;
515 microseconds simulatedTime{0};
516 while (nNodes) {
517 microseconds minDiff = microseconds::max(), maxDiff = -minDiff; // diff = cpuTime - cudaTime
518 NodeInfo *cpuCandidate = nullptr;
519 NodeInfo *cudaCandidate = nullptr;
520 microseconds cpuDelay{};
521 microseconds cudaDelay{};
522 for (auto &info : _nodes) {
523 RooAbsArg const *absArg = info.absArg;
524 if (info.timeLaunched >= microseconds{0})
525 continue; // already launched
526 microseconds diff{info.cpuTime - info.cudaTime}, cpuWait{0}, cudaWait{0};
527
528 bool goToNextCandidate = false;
529
530 for (auto *serverInfo : info.serverInfos) {
531 if (serverInfo->isScalar)
532 continue;
533
534 // dependencies not computed yet
535 if (serverInfo->timeLaunched < microseconds{0}) {
536 goToNextCandidate = true;
537 break;
538 }
539 if (serverInfo->computeInGPU)
540 cpuWait = std::max(cpuWait, serverInfo->timeLaunched + serverInfo->cudaTime + d2hTime - simulatedTime);
541 else
542 cudaWait = std::max(cudaWait, serverInfo->timeLaunched + serverInfo->cpuTime + h2dTime - simulatedTime);
543 }
544
545 if (goToNextCandidate) {
546 continue;
547 }
548
549 diff += cpuWait - cudaWait;
550 if (diff < minDiff) {
551 minDiff = diff;
552 cpuDelay = cpuWait;
553 cpuCandidate = &info;
554 }
555 if (diff > maxDiff && absArg->canComputeBatchWithCuda()) {
556 maxDiff = diff;
557 cudaDelay = cudaWait;
558 cudaCandidate = &info;
559 }
560 }
561
562 auto calcDiff = [](const NodeInfo *nodeInfo) { return nodeInfo->cpuTime - nodeInfo->cudaTime; };
563 if (cpuCandidate && calcDiff(cpuCandidate) > diffThreshold)
564 cpuCandidate = nullptr;
565 if (cudaCandidate && -calcDiff(cudaCandidate) > diffThreshold)
566 cudaCandidate = nullptr;
567 // don't compute same node twice
568 if (cpuCandidate == cudaCandidate && !cpuNode && !cudaNode) {
569 if (minDiff < microseconds{0})
570 cudaCandidate = nullptr;
571 else
572 cpuCandidate = nullptr;
573 }
574 if (cpuCandidate && !cpuNode) {
575 cpuNode = cpuCandidate;
576 cpuNode->timeLaunched = simulatedTime + cpuDelay;
577 // If the compute mode is changed, the current buffer might not be appropriate anymore
578 if (cpuNode->computeInGPU) {
579 delete cpuNode->buffer;
580 cpuNode->buffer = nullptr;
581 }
582 cpuNode->computeInGPU = false;
583 nNodes--;
584 }
585 if (cudaCandidate && !cudaNode) {
586 cudaNode = cudaCandidate;
587 cudaNode->timeLaunched = simulatedTime + cudaDelay;
588 // If the compute mode is changed, the current buffer might not be appropriate anymore
589 if (!cudaNode->computeInGPU) {
590 delete cudaNode->buffer;
591 cudaNode->buffer = nullptr;
592 }
593 cudaNode->computeInGPU = true;
594 nNodes--;
595 }
596
597 microseconds etaCPU{microseconds::max()}, etaCUDA{microseconds::max()};
598 if (cpuNode) {
599 etaCPU = cpuNode->timeLaunched + cpuNode->cpuTime;
600 }
601 if (cudaNode) {
602 etaCUDA = cudaNode->timeLaunched + cudaNode->cudaTime;
603 }
604 simulatedTime = std::min(etaCPU, etaCUDA);
605 if (etaCPU < etaCUDA)
606 cpuNode = nullptr;
607 else
608 cudaNode = nullptr;
609 } // while(nNodes)
610 return simulatedTime;
611}
612
613/// Decides which nodes are assigned to the gpu in a cuda fit. In the 1st iteration,
614/// everything is computed in cpu for measuring the cpu time. In the 2nd iteration,
615/// everything is computed in gpu (if possible) to measure the gpu time.
616/// In the 3rd iteration, simulate the computation of the graph by calling simulateFit
617/// with every distinct threshold found as timeDiff within the nodes of the graph and select
618/// the best configuration. In the end, mark the nodes and handle the details accordingly.
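/// (Example with hypothetical numbers: if the non-scalar nodes have cpuTime-cudaTime
/// differences of +5us, -20us and +60us, simulateFit is run with thresholds of 5us, 20us
/// and 60us, and the configuration with the smallest simulated total time is kept.)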
619void RooFitDriver::markGPUNodes()
620{
621 using namespace std::chrono;
622
623 if (_getValInvocations == 1) {
624 // leave everything to be computed (and timed) in cpu
625 return;
626 } else if (_getValInvocations == 2) {
627 // compute (and time) as much as possible in gpu
628 for (auto &info : _nodes) {
629 info.computeInGPU = !info.isScalar && info.absArg->canComputeBatchWithCuda();
630 }
631 } else {
632 // Assign nodes to gpu using a greedy algorithm: for the number of bytes
633 // in this benchmark we take the maximum size of spans in the dataset.
634 std::size_t nBytes = 1;
635 for (auto const &item : _dataMapCUDA) {
636 nBytes = std::max(nBytes, item.size() * sizeof(double));
637 }
638 auto transferTimes = RooFit::CUDAHelpers::memcpyBenchmark(nBytes);
639
640 microseconds h2dTime = transferTimes.first;
641 microseconds d2hTime = transferTimes.second;
642 ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "------Copying times------\n";
643 ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "h2dTime=" << h2dTime.count() << "us\td2hTime=" << d2hTime.count()
644 << "us\n";
645
646 std::vector<microseconds> diffTimes;
647 for (auto &info : _nodes) {
648 if (!info.isScalar)
649 diffTimes.push_back(info.cpuTime - info.cudaTime);
650 }
651 microseconds bestTime = microseconds::max();
652 microseconds bestThreshold{};
653 microseconds ret;
654 for (auto &threshold : diffTimes) {
655 if ((ret = simulateFit(h2dTime, d2hTime, microseconds{std::abs(threshold.count())})) < bestTime) {
656 bestTime = ret;
657 bestThreshold = threshold;
658 }
659 }
660 // finalize the marking of the best configuration
661 simulateFit(h2dTime, d2hTime, microseconds{std::abs(bestThreshold.count())});
662 ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "Best threshold=" << bestThreshold.count() << "us" << std::endl;
663
664 // deletion of the timing events (to be replaced later by non-timing events)
665 for (auto &info : _nodes) {
666 // If the copy mode is changed, the current buffer might not be appropriate anymore
667 if (info.copyAfterEvaluation) {
668 delete info.buffer;
669 info.buffer = nullptr;
670 }
671 info.copyAfterEvaluation = false;
672 RooBatchCompute::dispatchCUDA->deleteCudaEvent(info.event);
673 RooBatchCompute::dispatchCUDA->deleteCudaEvent(info.eventStart);
674 info.event = info.eventStart = nullptr;
675 }
676 } // else (_getValInvocations > 2)
677
678 for (auto &info : _nodes) {
679 // scalar nodes don't need copying
680 if (!info.isScalar) {
681 for (auto *clientInfo : info.clientInfos) {
682 if (info.computeInGPU != clientInfo->computeInGPU) {
683 // If the copy mode is changed, the current buffer might not be appropriate anymore
684 if (!info.copyAfterEvaluation) {
685 delete info.buffer;
686 info.buffer = nullptr;
687 }
688 info.copyAfterEvaluation = true;
689 break;
690 }
691 }
692 }
693 }
694
695 // restore a cudaEventDisableTiming event when necessary
696 if (_getValInvocations == 3) {
697 for (auto &info : _nodes) {
698 if (info.computeInGPU || info.copyAfterEvaluation)
699 info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);
700 }
701
702 ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations) << "------Nodes------\t\t\t\tCpu time: \t Cuda time\n";
703 for (auto &info : _nodes) {
704 ooccoutD(static_cast<TObject*>(nullptr), FastEvaluations)
705 << std::setw(20) << info.absArg->GetName() << "\t" << info.absArg << "\t"
706 << (info.computeInGPU ? "CUDA" : "CPU") << "\t" << info.cpuTime.count()
707 << "us\t" << info.cudaTime.count() << "us\n";
708 }
709 }
710}
711
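/// Propagate the output sizes through the computation graph: every node that is not a
/// reducer node takes the maximum of its own output size and the output sizes of its servers.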
712void RooFitDriver::determineOutputSizes()
713{
714 for (auto &argInfo : _nodes) {
715 for (auto *serverInfo : argInfo.serverInfos) {
716 if (!argInfo.absArg->isReducerNode()) {
717 argInfo.outputSize = std::max(serverInfo->outputSize, argInfo.outputSize);
718 }
719 }
720 }
721}
722
723/// Temporarily change the operation mode of a RooAbsArg until the
724/// RooFitDriver gets deleted.
725void RooFitDriver::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
726{
727 if (opMode != arg->operMode()) {
728 _changeOperModeRAIIs.emplace(arg, opMode);
729 }
730}
731
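/// Return the top node of the computation graph, i.e. the RooAbsReal wrapped by the
/// normalization integral unfolder.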
732RooAbsReal &RooFitDriver::topNode() const
733{
734 return static_cast<RooAbsReal &>(_integralUnfolder->arg());
735}
736
737} // namespace Experimental
738} // namespace ROOT