Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Evaluator.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23 computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49#include <unordered_set>
50
51namespace RooFit {
52
53namespace {
54
55// To avoid deleted move assignment.
56template <class T>
57void assignSpan(std::span<T> &to, std::span<T> const &from)
58{
59 to = from;
60}
61
// Logs (once per useGPU setting) which RooBatchCompute backend/architecture is
// in use, via the RooFit message service at INFO level on the Fitting topic.
// NOTE(review): this listing came from generated documentation and the
// extraction dropped the function signature (original line 62). From the body
// it takes a single boolean (named `useGPU`) and returns void — confirm
// against the repository.
63{
64 // We have to exit early if the message stream is not active. Otherwise it's
65 // possible that this function skips logging because it thinks it has
66 // already logged, but actually it didn't.
67 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
68 return;
69 }
70
71 // Don't repeat logging architecture info if the useGPU option didn't change
72 {
73 // Second element of pair tracks whether this function has already been called
74 static std::pair<bool, bool> lastUseGPU;
75 if (lastUseGPU.second && lastUseGPU.first == useGPU)
76 return;
77 lastUseGPU = {useGPU, true};
78 }
79
80 auto log = [](std::string_view message) {
81 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
82 };
83
// NOTE(review): the opening `if (...) {` of this if/else (original line 84)
// was lost in extraction; it presumably tests whether the loaded CPU compute
// library is the generic non-vectorized one — confirm against the repository.
85 log("using generic CPU library compiled with no vectorizations");
86 } else {
87 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
88 }
89 if (useGPU) {
90 log("using CUDA computation library");
91 }
92}
93
94} // namespace
95
96/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
// NOTE(review): the extraction dropped several member declarations (original
// lines 103, 109, 124-125), the decrementRemainingClients() signature (line
// 129), and trailing lines 136-142 (likely a destructor). The page's member
// listing and the code below reference members `originalOperMode`,
// `copyAfterEvaluation`, and CUDA `event`/`stream` pointers, which presumably
// live on the dropped lines — confirm against the repository.
97/// the computation graph.
98struct NodeInfo {
99
// True if this node produces a single value rather than a batch.
100 bool isScalar() const { return outputSize == 1; }
101
// The RooFit object this bookkeeping entry describes (non-owning).
102 RooAbsArg *absArg = nullptr;
104
// Owning handle to the output buffer for batch results (host or device).
105 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
106 std::size_t iNode = 0;
// Countdown of clients/servers not yet processed in the heterogeneous loop.
107 int remClients = 0;
108 int remServers = 0;
110 bool fromArrayInput = false;
111 bool isVariable = false;
112 bool isDirty = true;
113 bool isCategory = false;
114 bool hasLogged = false;
115 bool computeInGPU = false;
116 bool isValueServer = false; // if this node is a value server to the top node
117 std::size_t outputSize = 1;
// Sentinels: max() means "never seen yet", forcing the first comparison to
// register as a change (see processVariable()/processCategory()).
118 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
119 int lastCatVal = std::numeric_limits<int>::max();
// Storage for single-value outputs so no heap buffer is needed.
120 double scalarBuffer = 0.0;
121 std::vector<NodeInfo *> serverInfos;
122 std::vector<NodeInfo *> clientInfos;
123
126
127 /// Check the servers of a node that has been computed and release its
128 /// resources if they are no longer needed.
// NOTE(review): the method signature (original line 129) was dropped; per the
// page's member listing it is `void decrementRemainingClients()`.
130 {
131 if (--remClients == 0 && !fromArrayInput) {
132 buffer.reset();
133 }
134 }
135
143};
144
145/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph,
146/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
147/// there's also some CUDA-related initialization.
148///
149/// \param[in] absReal The RooAbsReal object that sits on top of the
150/// computation graph that we want to evaluate.
151/// \param[in] useGPU Whether the evaluation should be preferably done on the GPU.
// NOTE(review): extraction dropped the constructor signature (original line
// 152) and several statements (lines 155, 160-168 partially, 222, 227-229).
// The bare-number lines below mark dropped hyperlinked calls — presumably the
// CPU library initialization / architecture logging, and the call collecting
// the sorted computation graph into `serverSet` (used at line 176 but never
// declared in the visible text) — confirm against the repository.
153 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
154{
// Fail early if CUDA was requested but the CUDA backend cannot be loaded.
156 if (useGPU && RooBatchCompute::initCUDA() != 0) {
157 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
158 }
159 // Some checks and logging of used architectures
161
164

167

169 if (useGPU) {
171 }
172
// Temporary lookup from arg to its NodeInfo, used to wire up the
// server/client pointer lists below.
173 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
174
175 // Fill the ordered nodes list and initialize the node info structs.
176 _nodes.reserve(serverSet.size());
177 std::size_t iNode = 0;
178 for (RooAbsArg *arg : serverSet) {
179
180 _nodes.emplace_back();
181 auto &nodeInfo = _nodes.back();
182 _nodesMap[arg->namePtr()] = &nodeInfo;
183
184 nodeInfo.absArg = arg;
185 nodeInfo.originalOperMode = arg->operMode();
186 nodeInfo.iNode = iNode;
187 nodeInfos[arg] = &nodeInfo;
188
// Variables get their data token only when actual array input is set via
// setInput(); all other nodes are tokenized right away.
189 if (dynamic_cast<RooRealVar const *>(arg)) {
190 nodeInfo.isVariable = true;
191 } else {
192 arg->setDataToken(iNode);
193 }
194 if (dynamic_cast<RooAbsCategory const *>(arg)) {
195 nodeInfo.isCategory = true;
196 }
197
198 ++iNode;
199 }
200
// Wire up the bidirectional server/client NodeInfo pointer lists, keeping
// only value-server relationships.
201 for (NodeInfo &info : _nodes) {
202 info.serverInfos.reserve(info.absArg->servers().size());
203 for (RooAbsArg *server : info.absArg->servers()) {
204 if (server->isValueServer(*info.absArg)) {
205 auto *serverInfo = nodeInfos.at(server);
206 info.serverInfos.emplace_back(serverInfo);
207 serverInfo->clientInfos.emplace_back(&info);
208 }
209 }
210 }
211
212 // Figure out which nodes are value servers to the top node
// The nodes list is topologically sorted with the top node last, so a single
// reverse sweep propagates the flag to all transitive value servers.
213 _nodes.back().isValueServer = true; // the top node itself
214 for (auto iter = _nodes.rbegin(); iter != _nodes.rend(); ++iter) {
215 if (!iter->isValueServer)
216 continue;
217 for (auto &serverInfo : iter->serverInfos) {
218 serverInfo->isValueServer = true;
219 }
220 }
221

223
224 if (_useGPU) {
225 // create events and streams for every node
// NOTE(review): the statements creating the per-node CUDA event/stream and
// the local `cfg` object (original lines 227-229) were dropped here.
226 for (auto &info : _nodes) {
230 cfg.setCudaStream(info.stream);
231 _evalContextCUDA.setConfig(info.absArg, cfg);
232 }
233 }
234}
235
236/// If there are servers with the same name that got de-duplicated in the
237/// `_nodes` list, we need to set their data tokens too. We find such nodes by
238/// visiting the servers of every known node.
// NOTE(review): the function signature (original line 239) was dropped by the
// extraction; per the page's member listing it is `void
// Evaluator::syncDataTokens()`.
240{
241 for (NodeInfo &info : _nodes) {
// Walk servers in the same order used when serverInfos was built, so the
// running index lines up with the de-duplicated entries.
242 std::size_t iValueServer = 0;
243 for (RooAbsArg *server : info.absArg->servers()) {
244 if (server->isValueServer(*info.absArg)) {
245 auto *knownServer = info.serverInfos[iValueServer]->absArg;
246 if (knownServer->hasDataToken()) {
247 server->setDataToken(knownServer->dataToken());
248 }
249 ++iValueServer;
250 }
251 }
252 }
253}
254
// Registers an external data array as the output of the graph node called
// `name`. In CUDA mode the data is mirrored on both host and device.
// NOTE(review): extraction dropped original lines 269, 278, 285-286, 294 and
// 300 — presumably bookkeeping (e.g. flagging output sizes for update) and the
// `_evalContextCPU`/`_evalContextCUDA` span registrations for the non-GPU and
// scalar cases — confirm against the repository.
255void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
256{
257 if (isOnDevice && !_useGPU) {
258 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
259 }
260
261 // Check if "name" is used in the computation graph. If yes, add the span to
262 // the data map and set the node info accordingly.
263
264 auto found = _nodesMap.find(RooNameReg::ptr(name.c_str()));
265
// Silently ignore inputs that don't correspond to any graph node.
266 if (found == _nodesMap.end())
267 return;
268
270
271 NodeInfo &info = *found->second;
272
273 info.fromArrayInput = true;
274 info.absArg->setDataToken(info.iNode);
275 info.outputSize = inputArray.size();
276
277 if (!_useGPU) {
279 return;
280 }
281
282 if (info.outputSize <= 1) {
283 // Empty or scalar observables from the data don't need to be
284 // copied to the GPU.
287 return;
288 }
289
290 // For simplicity, we put the data on both host and device for
291 // now. This could be optimized by inspecting the clients of the
292 // variable.
293 if (isOnDevice) {
// Data arrived on the device: make a host mirror.
295 auto gpuSpan = _evalContextCUDA.at(info.absArg);
296 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
297 info.buffer->assignFromDevice(gpuSpan);
298 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
299 } else {
// Data arrived on the host: make a device mirror.
301 auto cpuSpan = _evalContextCPU.at(info.absArg);
302 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
303 info.buffer->assignFromHost(cpuSpan);
304 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
305 }
306}
307
// Recomputes the per-node output sizes after the array inputs changed, resets
// stale temporary buffers, marks all nodes dirty, and (in CUDA mode) redoes
// the GPU/CPU node assignment.
// NOTE(review): the extraction dropped the function signature (original line
// 308; presumably `void Evaluator::updateOutputSizes()`) and line 335
// (presumably clearing the "sizes need updating" flag) — confirm against the
// repository.
309{
// Seed the size determination with the known sizes of the array inputs only.
310 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
311 for (auto &info : _nodes) {
312 if (info.fromArrayInput) {
313 sizeMap[info.absArg] = info.outputSize;
314 } else {
315 // any buffer for temporary results is invalidated by resetting the output sizes
316 info.buffer.reset();
317 }
318 }
319
// -1 signals "size unknown" to the helper for nodes without array input.
320 auto outputSizeMap =
321 RooFit::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
322 auto found = sizeMap.find(key);
323 return found != sizeMap.end() ? found->second : -1;
324 });
325
326 for (auto &info : _nodes) {
327 info.outputSize = outputSizeMap.at(info.absArg);
328 info.isDirty = true;
329 }
330
331 if (_useGPU) {
332 markGPUNodes();
333 }
334
336}
337
// Destructor body: releases the data tokens of all non-variable nodes so the
// RooAbsArgs are left in their pre-Evaluator state (variables only get a
// token when array input is set, hence the exclusion).
// NOTE(review): the signature line (original line 338, presumably
// `Evaluator::~Evaluator()`) was dropped by the extraction.
339{
340 for (auto &info : _nodes) {
341 if (!info.isVariable) {
342 info.absArg->resetDataToken();
343 }
344 }
345}
346
// Evaluates one node on the CPU, writing its result either into the inline
// scalar buffer (size 1) or into a host buffer, and registers the result span
// with the evaluation context(s).
// NOTE(review): the extraction dropped the signature (original line 347; per
// the page's member listing `void Evaluator::computeCPUNode(const RooAbsArg
// *node, NodeInfo &info)`) plus lines 374, 377, 388 (presumably the actual
// `doEval` call on the CPU context), 390-391 and 395 (presumably the CUDA
// event record) — confirm against the repository.
348{
349 using namespace Detail;
350
351 const std::size_t nOut = info.outputSize;
352
353 double *buffer = nullptr;
354 if (nOut == 1) {
// Scalar result: no heap buffer needed; mirror the pointer on the CUDA
// context so GPU clients can read it.
355 buffer = &info.scalarBuffer;
356 if (_useGPU) {
357 _evalContextCUDA.set(node, {buffer, nOut});
358 }
359 } else {
// Emit a one-time hint that this class forces a CPU round-trip in GPU mode.
360 if (!info.hasLogged && _useGPU) {
361 RooAbsArg const &arg = *info.absArg;
362 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
363 << " could not be evaluated on the GPU because the class doesn't support it. "
364 "Consider requesting or implementing it to benefit from a speed up."
365 << std::endl;
366 info.hasLogged = true;
367 }
// Pinned host memory if the result must later be copied to the device.
368 if (!info.buffer) {
369 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
370 : _bufferManager->makeCpuBuffer(nOut);
371 }
372 buffer = info.buffer->hostWritePtr();
373 }
375 _evalContextCPU.set(node, {buffer, nOut});
376 if (nOut > 1) {
378 }
379 if (info.isCategory) {
380 auto nodeAbsCategory = static_cast<RooAbsCategory const *>(node);
381 if (nOut == 1) {
// Category nodes are stored as their (integer) current index.
382 buffer[0] = nodeAbsCategory->getCurrentIndex();
383 } else {
384 throw std::runtime_error("RooFit::Evaluator - non-scalar category values are not supported!");
385 }
386 } else {
387 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
389 }
// Publish the device-side view of the result for GPU clients.
392 if (info.copyAfterEvaluation) {
393 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
394 if (info.event) {
396 }
397 }
398}
399
400/// Process a variable in the computation graph. This is a separate non-inlined
401/// function such that we can see in performance profiles how long this takes.
// NOTE(review): extraction dropped the signature (original line 402; per the
// member listing `void Evaluator::processVariable(NodeInfo &nodeInfo)`) and
// line 411 (presumably the re-evaluation of the variable's output, e.g. a
// computeCPUNode call) — confirm against the repository.
403{
404 RooAbsArg *node = nodeInfo.absArg;
405 auto *var = static_cast<RooRealVar const *>(node);
// Only react when the variable's value was actually set since last time,
// detected via its monotonically increasing reset counter.
406 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
407 nodeInfo.lastSetValCount = var->valueResetCounter();
408 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
409 clientInfo->isDirty = true;
410 }
412 nodeInfo.isDirty = false;
413 }
414}
415
416/// Process a category in the computation graph. This is a separate non-inlined
417/// function such that we can see in performance profiles how long this takes.
// NOTE(review): extraction dropped the signature (original line 418; per the
// member listing `void Evaluator::processCategory(NodeInfo &nodeInfo)`) and
// line 427 (presumably the re-evaluation of the category's output) — confirm
// against the repository.
419{
420 RooAbsArg *node = nodeInfo.absArg;
421 auto *cat = static_cast<RooAbsCategory const *>(node);
// Only react when the category index actually changed since the last run.
422 if (nodeInfo.lastCatVal != cat->getCurrentIndex()) {
423 nodeInfo.lastCatVal = cat->getCurrentIndex();
424 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
425 clientInfo->isDirty = true;
426 }
428 nodeInfo.isDirty = false;
429 }
430}
431
432/// Flags all the clients of a given node dirty. This is a separate non-inlined
433/// function such that we can see in performance profiles how long this takes.
435{
436 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
437 clientInfo->isDirty = true;
438 }
439}
440
441/// Returns the value of the top node in the computation graph
// NOTE(review): extraction dropped several statements: original lines 444-445
// and 447 (presumably output-size update / bookkeeping on entry), 456 and 458
// (presumably the processVariable/processCategory calls matching the
// branches), and 461-462 (presumably the node's re-evaluation, e.g. a
// computeCPUNode call) — confirm against the repository.
442std::span<const double> Evaluator::run()
443{
446
448
// CUDA mode delegates to the heterogeneous CPU/GPU scheduler.
449 if (_useGPU) {
450 return getValHeterogeneous();
451 }
452
// CPU-only path: walk the topologically sorted node list and recompute only
// what changed (variables/categories propagate dirtiness to their clients).
453 for (auto &nodeInfo : _nodes) {
454 if (!nodeInfo.fromArrayInput) {
455 if (nodeInfo.isVariable) {
457 } else if (nodeInfo.isCategory) {
459 } else {
460 if (nodeInfo.isDirty) {
463 nodeInfo.isDirty = false;
464 }
465 }
466 }
467 }
468
469 // return the final output
470 return _evalContextCPU.at(&_topNode);
471}
472
473/// Returns the value of the top node in the computation graph
// Heterogeneous scheduler: repeatedly launches ready GPU nodes asynchronously
// and computes ready CPU nodes in between, tracking readiness via remServers
// (0 = ready, -1 = running on GPU, -2 = done).
// NOTE(review): extraction dropped original lines 487, 501 and 535 (presumably
// `assignToGPU(...)` calls at the spots where a GPU node becomes ready) and
// line 544 (presumably `return _evalContextCPU.at(&_topNode);`, mirroring
// run()) — confirm against the repository.
474std::span<const double> Evaluator::getValHeterogeneous()
475{
// Reset the per-run counters and drop stale temporary buffers.
476 for (auto &info : _nodes) {
477 info.remClients = info.clientInfos.size();
478 info.remServers = info.serverInfos.size();
479 if (info.buffer && !info.fromArrayInput) {
480 info.buffer.reset();
481 }
482 }
483
484 // find initial GPU nodes and assign them to GPU
485 for (auto &info : _nodes) {
486 if (info.remServers == 0 && info.computeInGPU) {
488 }
489 }
490
// Loop until the top node itself has been fully computed (-2 = done).
491 NodeInfo const &topNodeInfo = _nodes.back();
492 while (topNodeInfo.remServers != -2) {
493 // find finished GPU nodes
494 for (auto &info : _nodes) {
495 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
496 info.remServers = -2;
497 // Decrement number of remaining servers for clients and start GPU computations
498 for (auto *infoClient : info.clientInfos) {
499 --infoClient->remServers;
500 if (infoClient->computeInGPU && infoClient->remServers == 0) {
502 }
503 }
504 for (auto *serverInfo : info.serverInfos) {
505 serverInfo->decrementRemainingClients();
506 }
507 }
508 }
509
510 // find next CPU node
511 auto it = _nodes.begin();
512 for (; it != _nodes.end(); it++) {
513 if (it->remServers == 0 && !it->computeInGPU)
514 break;
515 }
516
517 // if no CPU node available sleep for a while to save CPU usage
518 if (it == _nodes.end()) {
519 std::this_thread::sleep_for(std::chrono::milliseconds(1));
520 continue;
521 }
522
523 // compute next CPU node
524 NodeInfo &info = *it;
525 RooAbsArg const *node = info.absArg;
526 info.remServers = -2; // so that it doesn't get picked again
527
// Array-input nodes already hold their data; nothing to compute.
528 if (!info.fromArrayInput) {
529 computeCPUNode(node, info);
530 }
531
532 // Assign the clients that are computed on the GPU
533 for (auto *infoClient : info.clientInfos) {
534 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
536 }
537 }
// Release server buffers that no longer have pending clients.
538 for (auto *serverInfo : info.serverInfos) {
539 serverInfo->decrementRemainingClients();
540 }
541 }
542
543 // return the final value
545}
546
547/// Assign a node to be computed in the GPU. Scan its clients and also assign them
548/// in case they only depend on GPU nodes.
// NOTE(review): extraction dropped the signature (original line 549; per the
// member listing `void Evaluator::assignToGPU(NodeInfo &info)`), line 560
// (presumably the stream-waits-for-event call matching the `if` above it),
// and lines 574/577 (presumably stream config / event record around the
// launch) — confirm against the repository.
550{
551 using namespace Detail;
552
// -1 marks the node as "currently running on the GPU" for the scheduler.
553 info.remServers = -1;
554
555 auto node = static_cast<RooAbsReal const *>(info.absArg);
556
557 // wait for every server to finish
558 for (auto *infoServer : info.serverInfos) {
559 if (infoServer->event)
561 }
562
563 const std::size_t nOut = info.outputSize;
564
565 double *buffer = nullptr;
566 if (nOut == 1) {
// Scalar results live in the inline buffer; also visible to CPU clients.
567 buffer = &info.scalarBuffer;
568 _evalContextCPU.set(node, {buffer, nOut});
569 } else {
// Pinned memory if the result must be copied back to the host afterwards.
570 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
571 : _bufferManager->makeGpuBuffer(nOut);
572 buffer = info.buffer->deviceWritePtr();
573 }
575 _evalContextCUDA.set(node, {buffer, nOut});
576 node->doEval(_evalContextCUDA);
// Publish the host-side view for CPU clients of this GPU node.
578 if (info.copyAfterEvaluation) {
579 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
580 }
581}
582
583/// Decides which nodes are assigned to the GPU in a CUDA fit.
585{
586 // Decide which nodes get evaluated on the GPU: we select nodes that support
587 // CUDA evaluation and have at least one input of size greater than one.
588 for (auto &info : _nodes) {
589 info.computeInGPU = false;
590 if (!info.absArg->canComputeBatchWithCuda()) {
591 continue;
592 }
593 for (NodeInfo const *serverInfo : info.serverInfos) {
594 if (serverInfo->outputSize > 1) {
595 info.computeInGPU = true;
596 break;
597 }
598 }
599 }
600
601 // In a second pass, figure out which nodes need to copy over their results.
602 for (auto &info : _nodes) {
603 info.copyAfterEvaluation = false;
604 // scalar nodes don't need copying
605 if (!info.isScalar()) {
606 for (auto *clientInfo : info.clientInfos) {
607 if (info.computeInGPU != clientInfo->computeInGPU) {
608 info.copyAfterEvaluation = true;
609 break;
610 }
611 }
612 }
613 }
614}
615
616/// Temporarily change the operation mode of a RooAbsArg until the
617/// Evaluator gets deleted.
619{
620 if (!_operModeChanges)
621 _operModeChanges = std::make_unique<ChangeOperModeRAII>();
622 _operModeChanges->change(arg, opMode);
623}
624
625// Change the operation modes of all RooAbsArgs in the computation graph.
626// The changes are reset when the returned RAII object goes out of scope.
627//
628// We also walk transitively through value clients of the nodes to cover any
629// node that RooAbsReal::doEval (the fallback scalar implementation) might
630// inadvertently propagate the ADirty mode to via its recursive restore: that
631// helper sets servers temporarily to AClean and then calls
632// setOperMode(oldOperMode) to restore, which recurses to value clients when
633// oldOperMode is ADirty. If we did not protect those clients here, any node
634// outside the computation graph that shares a fundamental (e.g. a parameter
635// like a RooRealVar) would be left permanently in ADirty after the first
636// minimization, dramatically slowing down later scalar evaluations (for
637// example on pdfs held by the legacy test statistics' internal cache).
638std::unique_ptr<ChangeOperModeRAII> Evaluator::setOperModes(RooAbsArg::OperMode opMode)
639{
640 auto out = std::make_unique<ChangeOperModeRAII>();
641 std::unordered_set<RooAbsArg *> visited;
642
643 std::vector<RooAbsArg *> queue;
644 queue.reserve(_nodes.size());
645 for (auto &info : _nodes) {
646 queue.push_back(info.absArg);
647 }
648
649 while (!queue.empty()) {
650 RooAbsArg *node = queue.back();
651 queue.pop_back();
652 if (!visited.insert(node).second)
653 continue;
654
655 out->change(node, opMode);
656
657 // Only follow value-client links: that is exactly the propagation path
658 // used by RooAbsArg::setOperMode with mode==ADirty.
659 if (opMode == RooAbsArg::ADirty) {
660 for (auto *client : node->valueClients()) {
661 queue.push_back(client);
662 }
663 }
664 }
665 return out;
666}
667
668void Evaluator::print(std::ostream &os)
669{
670 std::cout << "--- RooFit BatchMode evaluation ---\n";
671
672 std::vector<int> widths{9, 37, 20, 9, 10, 20};
673
674 auto printElement = [&](int iCol, auto const &t) {
675 const char separator = ' ';
676 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
677 os << "|";
678 };
679
680 auto printHorizontalRow = [&]() {
681 int n = 0;
682 for (int w : widths) {
683 n += w + 2;
684 }
685 for (int i = 0; i < n; i++) {
686 os << '-';
687 }
688 os << "|\n";
689 };
690
692
693 os << "|";
694 printElement(0, "Index");
695 printElement(1, "Name");
696 printElement(2, "Class");
697 printElement(3, "Size");
698 printElement(4, "From Data");
699 printElement(5, "1st value");
700 std::cout << "\n";
701
703
704 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
705 auto &nodeInfo = _nodes[iNode];
706 RooAbsArg *node = nodeInfo.absArg;
707
708 auto span = _evalContextCPU.at(node);
709
710 os << "|";
711 printElement(0, iNode);
712 printElement(1, node->GetName());
713 printElement(2, node->ClassName());
714 printElement(3, nodeInfo.outputSize);
715 printElement(4, nodeInfo.fromArrayInput);
716 printElement(5, span[0]);
717
718 std::cout << "\n";
719 }
720
722}
723
724/// Gets all the parameters of the RooAbsReal. This is in principle not
725/// necessary, because we can always ask the RooAbsReal itself, but the
726/// Evaluator has the cached information to get the answer quicker.
727/// Therefore, this is not meant to be used in general, just where it matters.
728/// \warning If we find another solution to get the parameters efficiently,
729/// this function might be removed without notice.
731{
732 RooArgSet parameters;
733 for (auto &nodeInfo : _nodes) {
734 if (nodeInfo.isValueServer && nodeInfo.absArg->isFundamental()) {
735 parameters.add(*nodeInfo.absArg);
736 }
737 }
738 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
739 parameters.sort();
740 return parameters;
741}
742
743/// \brief Sets the offset mode for evaluation.
744///
745/// This function sets the offset mode for evaluation to the specified mode.
746/// It updates the offset mode for both CPU and CUDA evaluation contexts.
747///
748/// \param mode The offset mode to be set.
749///
750/// \note This function marks reducer nodes as dirty if the offset mode is
751/// changed, because only reducer nodes can use offsetting.
// NOTE(review): the extraction dropped the signature (original line 752; per
// the member listing `void Evaluator::setOffsetMode(RooFit::EvalContext::
// OffsetMode)`), the early-return condition at line 754 (presumably a
// no-change check against the current mode), and lines 757-758 (presumably
// the assignments updating the CPU and CUDA contexts) — confirm against the
// repository.
753{
755 return;
756
759
// Only reducer nodes can apply an offset, so only they need re-evaluation.
760 for (auto &nodeInfo : _nodes) {
761 if (nodeInfo.absArg->isReducerNode()) {
762 nodeInfo.isDirty = true;
763 }
764 }
765}
766
767} // namespace RooFit
#define oocoutI(o, a)
#define oocxcoutI(o, a)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
char name[80]
Definition TGX11.cxx:145
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
Definition RooAbsArg.h:76
const RefCountList_t & valueClients() const
List of all value clients of this object. Value clients receive value updates.
Definition RooAbsArg.h:139
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
Definition RooAbsReal.h:63
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:24
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
Definition EvalContext.h:91
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void enableVectorBuffers(bool enable)
OffsetMode _offsetMode
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
std::span< double > _currentOutput
void resize(std::size_t n)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
std::unique_ptr< ChangeOperModeRAII > setOperModes(RooAbsArg::OperMode opMode)
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
const bool _useGPU
Definition Evaluator.h:63
std::unordered_map< TNamed const *, NodeInfo * > _nodesMap
Definition Evaluator.h:69
std::unique_ptr< ChangeOperModeRAII > _operModeChanges
Definition Evaluator.h:70
std::vector< NodeInfo > _nodes
Definition Evaluator.h:68
bool _needToUpdateOutputSizes
Definition Evaluator.h:65
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void processCategory(NodeInfo &nodeInfo)
Process a category in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
Definition Evaluator.h:61
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
Definition Evaluator.h:67
RooFit::EvalContext _evalContextCPU
Definition Evaluator.h:66
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
RooAbsReal & _topNode
Definition Evaluator.h:62
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
Definition RooRealVar.h:37
const char * GetName() const override
Returns name of object.
Definition TNamed.h:49
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:224
RVec< PromoteType< T > > log(const RVec< T > &v)
Definition RVec.hxx:1836
const Int_t n
Definition legend1.C:16
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition CodegenImpl.h:72
@ FastEvaluations
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
Definition Evaluator.cxx:98
RooBatchCompute::CudaInterface::CudaStream * stream
RooAbsArg * absArg
bool isScalar() const
std::size_t iNode
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::size_t outputSize
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...