Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Evaluator.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object in other ways than recursive graph
20traversal. Currently, it is being used for evaluating a RooAbsReal object and
21supplying the value to the minimizer, during a fit. The class scans the
22dependencies and schedules the computations in a secure and efficient way. The
23 computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-supporting GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49
50namespace RooFit {
51
52namespace {
53
54// To avoid deleted move assignment.
55template <class T>
56void assignSpan(std::span<T> &to, std::span<T> const &from)
57{
58 to = from;
59}
60
// Logs (once per useGPU setting) which RooBatchCompute backend/architecture is
// in use. NOTE(review): the signature line (doc line 61) is missing from this
// extraction; from the body it takes a single `bool useGPU` parameter —
// confirm against the original Evaluator.cxx.
62{
63 // We have to exit early if the message stream is not active. Otherwise it's
64 // possible that this function skips logging because it thinks it has
65 // already logged, but actually it didn't.
66 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
67 return;
68 }
69
70 // Don't repeat logging architecture info if the useGPU option didn't change
71 {
72 // Second element of pair tracks whether this function has already been called
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
75 return;
76 lastUseGPU = {useGPU, true};
77 }
78
// All messages go through the RooFit INFO stream for the Fitting topic.
79 auto log = [](std::string_view message) {
80 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
81 };
82
// NOTE(review): doc line 83 is missing here — it must be the opening `if (...)`
// (presumably testing whether the loaded CPU library is the generic,
// non-vectorized build) that the `} else {` below pairs with. Confirm upstream.
84 log("using generic CPU library compiled with no vectorizations");
85 } else {
86 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
87 }
88 if (useGPU) {
89 log("using CUDA computation library");
90 }
91}
92
93} // namespace
94
95/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
97struct NodeInfo {

// True if this node produces a single scalar value.
99 bool isScalar() const { return outputSize == 1; }

// The graph node this info describes (non-owning).
101 RooAbsArg *absArg = nullptr;
// NOTE(review): doc line 102 is missing from this extraction; the constructor
// later assigns `nodeInfo.originalOperMode` and the member index lists
// `RooAbsArg::OperMode originalOperMode`, so that member presumably lives
// here — confirm upstream.
// Owns the (possibly shared) result buffer for this node's output.
104 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
105 std::size_t iNode = 0;
// Countdown of clients/servers still pending in the heterogeneous evaluation.
106 int remClients = 0;
107 int remServers = 0;
// NOTE(review): doc line 108 is missing; the code elsewhere reads/writes
// `info.copyAfterEvaluation`, so a `bool copyAfterEvaluation = false;` is
// presumably declared here — confirm upstream.
109 bool fromArrayInput = false;
110 bool isVariable = false;
111 bool isDirty = true;
112 bool isCategory = false;
113 bool hasLogged = false;
114 bool computeInGPU = false;
115 std::size_t outputSize = 1;
// Sentinel max() means "never seen", forcing the first evaluation.
116 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
117 int lastCatVal = std::numeric_limits<int>::max();
// Storage for scalar (outputSize == 1) results, avoiding a heap buffer.
118 double scalarBuffer = 0.0;
119 std::vector<NodeInfo *> serverInfos;
120 std::vector<NodeInfo *> clientInfos;

// NOTE(review): doc lines 122-123 are missing; the member index lists CUDA
// `event`/`stream` pointers for this struct, which are referenced as
// `info.event`/`info.stream` below — confirm upstream.

125 /// Check the servers of a node that has been computed and release its
126 /// resources if they are no longer needed.
// NOTE(review): doc line 127 (the signature, `void decrementRemainingClients()`
// per the member index) is missing from this extraction.
128 {
129 if (--remClients == 0 && !fromArrayInput) {
130 buffer.reset();
131 }
132 }

// NOTE(review): doc lines 134-140 are missing before the closing brace —
// presumably the destructor releasing the CUDA event/stream. Confirm upstream.
141};
142
143/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph,
144/// useful for the evaluation of it that will be done later. In case the CUDA mode is selected,
145/// there's also some CUDA-related initialization.
146///
147/// \param[in] absReal The RooAbsReal object that sits on top of the
148/// computation graph that we want to evaluate.
149/// \param[in] useGPU Whether the evaluation should be preferably done on the GPU.
// NOTE(review): doc line 150 (the constructor signature,
// `Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)` per the
// member index) is missing from this extraction.
151 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
152{
// NOTE(review): doc line 153 is missing here — likely CPU-library
// initialization; confirm upstream.
154 if (useGPU && RooBatchCompute::initCUDA() != 0) {
155 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
156 }
157 // Some checks and logging of used architectures
// NOTE(review): doc lines 158-169 are only partially present. Presumably they
// call the architecture-logging helper, create the buffer manager, and build
// the topologically sorted `serverSet` (used below) via
// `getSortedComputationGraph()` — confirm upstream.
159

162

165

167 if (useGPU) {
169 }

171 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;

173 // Fill the ordered nodes list and initialize the node info structs.
174 _nodes.reserve(serverSet.size());
175 std::size_t iNode = 0;
176 for (RooAbsArg *arg : serverSet) {

178 _nodes.emplace_back();
179 auto &nodeInfo = _nodes.back();
180 _nodesMap[arg->namePtr()] = &nodeInfo;

182 nodeInfo.absArg = arg;
183 nodeInfo.originalOperMode = arg->operMode();
184 nodeInfo.iNode = iNode;
185 nodeInfos[arg] = &nodeInfo;

// Variables keep their own token handling; all other nodes get a data token
// equal to their index in the sorted node list.
187 if (dynamic_cast<RooRealVar const *>(arg)) {
188 nodeInfo.isVariable = true;
189 } else {
190 arg->setDataToken(iNode);
191 }
192 if (dynamic_cast<RooAbsCategory const *>(arg)) {
193 nodeInfo.isCategory = true;
194 }

196 ++iNode;
197 }

// Wire up the client/server adjacency between the NodeInfo structs,
// considering only value servers.
199 for (NodeInfo &info : _nodes) {
200 info.serverInfos.reserve(info.absArg->servers().size());
201 for (RooAbsArg *server : info.absArg->servers()) {
202 if (server->isValueServer(*info.absArg)) {
203 auto *serverInfo = nodeInfos.at(server);
204 info.serverInfos.emplace_back(serverInfo);
205 serverInfo->clientInfos.emplace_back(&info);
206 }
207 }
208 }

// NOTE(review): doc line 210 is missing — presumably a `syncDataTokens();`
// call; confirm upstream.
211

212 if (_useGPU) {
213 // create events and streams for every node
214 for (auto &info : _nodes) {
// NOTE(review): doc lines 215-217 are missing — presumably creation of the
// per-node CUDA stream/event and of the `cfg` object used below.
218 cfg.setCudaStream(info.stream);
219 _evalContextCUDA.setConfig(info.absArg, cfg);
220 }
221 }
222}
223
224/// If there are servers with the same name that got de-duplicated in the
225/// `_nodes` list, we need to set their data tokens too. We find such nodes by
226/// visiting the servers of every known node.
228{
229 for (NodeInfo &info : _nodes) {
230 std::size_t iValueServer = 0;
231 for (RooAbsArg *server : info.absArg->servers()) {
232 if (server->isValueServer(*info.absArg)) {
233 auto *knownServer = info.serverInfos[iValueServer]->absArg;
234 if (knownServer->hasDataToken()) {
235 server->setDataToken(knownServer->dataToken());
236 }
237 ++iValueServer;
238 }
239 }
240 }
241}
242
// Register an external input array for the graph node called `name`. If the
// graph does not contain such a node, the call is a no-op. In CUDA mode the
// data ends up visible on both host and device.
243void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
244{
245 if (isOnDevice && !_useGPU) {
246 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
247 }

249 // Check if "name" is used in the computation graph. If yes, add the span to
250 // the data map and set the node info accordingly.

252 auto found = _nodesMap.find(RooNameReg::ptr(name.c_str()));

254 if (found == _nodesMap.end())
255 return;

// NOTE(review): doc line 257 is missing — presumably it sets
// `_needToUpdateOutputSizes = true;` (the member exists); confirm upstream.
258

259 NodeInfo &info = *found->second;

261 info.fromArrayInput = true;
262 info.absArg->setDataToken(info.iNode);
263 info.outputSize = inputArray.size();

265 if (!_useGPU) {
// NOTE(review): doc line 266 is missing — presumably
// `_evalContextCPU.set(info.absArg, inputArray);`; confirm upstream.
267 return;
268 }

270 if (info.outputSize <= 1) {
271 // Empty or scalar observables from the data don't need to be
272 // copied to the GPU.
// NOTE(review): doc lines 273-274 are missing — presumably they register the
// span with the CPU/CUDA evaluation contexts; confirm upstream.
275 return;
276 }

278 // For simplicity, we put the data on both host and device for
279 // now. This could be optimized by inspecting the clients of the
280 // variable.
281 if (isOnDevice) {
// NOTE(review): doc line 282 is missing — presumably it registers
// `inputArray` with `_evalContextCUDA` before mirroring it to the host.
283 auto gpuSpan = _evalContextCUDA.at(info.absArg);
284 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
285 info.buffer->assignFromDevice(gpuSpan);
286 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
287 } else {
// NOTE(review): doc line 288 is missing — presumably it registers
// `inputArray` with `_evalContextCPU` before mirroring it to the device.
289 auto cpuSpan = _evalContextCPU.at(info.absArg);
290 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
291 info.buffer->assignFromHost(cpuSpan);
292 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
293 }
294}
295
// Recompute every node's output size from the registered input arrays and
// refresh the dirty flags / GPU assignment accordingly.
// NOTE(review): doc line 296 (the signature, presumably
// `void Evaluator::updateOutputSizes()`) is missing from this extraction.
297{
298 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
299 for (auto &info : _nodes) {
300 if (info.fromArrayInput) {
301 sizeMap[info.absArg] = info.outputSize;
302 } else {
303 // any buffer for temporary results is invalidated by resetting the output sizes
304 info.buffer.reset();
305 }
306 }

// Sizes of non-input nodes are derived from the input sizes; -1 marks
// "unknown" for the helper.
308 auto outputSizeMap =
309 RooFit::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
310 auto found = sizeMap.find(key);
311 return found != sizeMap.end() ? found->second : -1;
312 });

314 for (auto &info : _nodes) {
315 info.outputSize = outputSizeMap.at(info.absArg);
316 info.isDirty = true;

318 // In principle we don't need dirty flag propagation because the driver
319 // takes care of deciding which node needs to be re-evaluated. However,
320 // disabling it also for scalar mode results in very long fitting times
321 // for specific models (test 14 in stressRooFit), which still needs to be
322 // understood. TODO.
323 if (!info.isScalar()) {
// NOTE(review): doc line 324 is missing — presumably a `setOperMode(...)`
// call that disables dirty propagation for vector nodes; confirm upstream.
325 } else {
326 setOperMode(info.absArg, info.originalOperMode);
327 }
328 }

330 if (_useGPU) {
331 markGPUNodes();
332 }

// NOTE(review): doc line 334 is missing — presumably
// `_needToUpdateOutputSizes = false;`; confirm upstream.
335}
336
// Restore the data tokens of all non-variable nodes when the Evaluator goes
// away. NOTE(review): doc line 337 (the signature, presumably
// `Evaluator::~Evaluator()`) is missing from this extraction.
338{
339 for (auto &info : _nodes) {
340 if (!info.isVariable) {
341 info.absArg->resetDataToken();
342 }
343 }
344}
345
// Evaluate a single node on the CPU, writing its result either into the
// scalar buffer or into a (possibly pinned) array buffer, and register the
// output span with the evaluation context(s).
// NOTE(review): doc line 346 (the signature, per the member index
// `void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)`) is
// missing from this extraction.
347{
348 using namespace Detail;

350 const std::size_t nOut = info.outputSize;

352 double *buffer = nullptr;
353 if (nOut == 1) {
// Scalar results live in the NodeInfo itself; no heap buffer needed.
354 buffer = &info.scalarBuffer;
355 if (_useGPU) {
356 _evalContextCUDA.set(node, {buffer, nOut});
357 }
358 } else {
// Warn once per node when a vector node falls back to the CPU in CUDA mode.
359 if (!info.hasLogged && _useGPU) {
360 RooAbsArg const &arg = *info.absArg;
361 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
362 << " could not be evaluated on the GPU because the class doesn't support it. "
363 "Consider requesting or implementing it to benefit from a speed up."
364 << std::endl;
365 info.hasLogged = true;
366 }
367 if (!info.buffer) {
// Pinned host memory is used when the result must later be copied to the GPU.
368 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
369 : _bufferManager->makeCpuBuffer(nOut);
370 }
371 buffer = info.buffer->hostWritePtr();
372 }
374 _evalContextCPU.set(node, {buffer, nOut});
375 if (nOut > 1) {
// NOTE(review): doc line 376 is missing — presumably
// `_evalContextCPU.enableVectorBuffers(true);` (the method exists on
// EvalContext); confirm upstream.
377 }
378 if (info.isCategory) {
379 auto nodeAbsCategory = static_cast<RooAbsCategory const *>(node);
380 if (nOut == 1) {
381 buffer[0] = nodeAbsCategory->getCurrentIndex();
382 } else {
383 throw std::runtime_error("RooFit::Evaluator - non-scalar category values are not supported!");
384 }
385 } else {
386 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
// NOTE(review): doc line 387 is missing — presumably
// `nodeAbsReal->doEval(_evalContextCPU);`, the actual evaluation; confirm
// upstream.
388 }
// NOTE(review): doc lines 389-390 are missing here.
391 if (info.copyAfterEvaluation) {
392 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
393 if (info.event) {
// NOTE(review): doc line 394 is missing — presumably a
// `cudaEventRecord(...)` call marking this node's result as ready.
395 }
396 }
397}
398
399/// Process a variable in the computation graph. This is a separate non-inlined
400/// function such that we can see in performance profiles how long this takes.
// NOTE(review): doc line 401 (the signature, per the member index
// `void Evaluator::processVariable(NodeInfo &nodeInfo)`) is missing from this
// extraction.
402{
403 RooAbsArg *node = nodeInfo.absArg;
404 auto *var = static_cast<RooRealVar const *>(node);
// Only act when the variable's value was set since the last evaluation.
405 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
406 nodeInfo.lastSetValCount = var->valueResetCounter();
407 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
408 clientInfo->isDirty = true;
409 }
// NOTE(review): doc line 410 is missing — presumably
// `computeCPUNode(node, nodeInfo);`, which would justify clearing the dirty
// flag below; confirm upstream.
411 nodeInfo.isDirty = false;
412 }
413}
414
415/// Process a category in the computation graph. This is a separate non-inlined
416/// function such that we can see in performance profiles how long this takes.
// NOTE(review): doc line 417 (the signature, per the member index
// `void Evaluator::processCategory(NodeInfo &nodeInfo)`) is missing from this
// extraction.
418{
419 RooAbsArg *node = nodeInfo.absArg;
420 auto *cat = static_cast<RooAbsCategory const *>(node);
// Only act when the category index changed since the last evaluation.
421 if (nodeInfo.lastCatVal != cat->getCurrentIndex()) {
422 nodeInfo.lastCatVal = cat->getCurrentIndex();
423 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
424 clientInfo->isDirty = true;
425 }
// NOTE(review): doc line 426 is missing — presumably
// `computeCPUNode(node, nodeInfo);`, mirroring processVariable(); confirm
// upstream.
427 nodeInfo.isDirty = false;
428 }
429}
430
431/// Flags all the clients of a given node dirty. This is a separate non-inlined
432/// function such that we can see in performance profiles how long this takes.
434{
435 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
436 clientInfo->isDirty = true;
437 }
438}
439
440/// Returns the value of the top node in the computation graph
441std::span<const double> Evaluator::run()
442{
// NOTE(review): doc lines 443-447 are only partially present — presumably a
// guarded `updateOutputSizes()` call when `_needToUpdateOutputSizes` is set;
// confirm upstream.
445

447

// CUDA mode uses the heterogeneous CPU/GPU scheduler instead of this loop.
448 if (_useGPU) {
449 return getValHeterogeneous();
450 }

// Nodes are topologically sorted, so a single forward pass suffices.
452 for (auto &nodeInfo : _nodes) {
453 if (!nodeInfo.fromArrayInput) {
454 if (nodeInfo.isVariable) {
// NOTE(review): doc line 455 is missing — presumably
// `processVariable(nodeInfo);`; confirm upstream.
456 } else if (nodeInfo.isCategory) {
// NOTE(review): doc line 457 is missing — presumably
// `processCategory(nodeInfo);`; confirm upstream.
458 } else {
459 if (nodeInfo.isDirty) {
// NOTE(review): doc lines 460-461 are missing — presumably
// `setClientsDirty(nodeInfo);` and `computeCPUNode(...)`; confirm upstream.
462 nodeInfo.isDirty = false;
463 }
464 }
465 }
466 }

468 // return the final output
469 return _evalContextCPU.at(&_topNode);
470}
471
472/// Returns the value of the top node in the computation graph
// Scheduler for CUDA mode: nodes are dispatched to GPU streams or computed on
// the CPU as soon as all of their servers have finished. remServers acts as a
// state machine: >0 pending, 0 ready, -1 running on GPU, -2 done.
473std::span<const double> Evaluator::getValHeterogeneous()
474{
475 for (auto &info : _nodes) {
476 info.remClients = info.clientInfos.size();
477 info.remServers = info.serverInfos.size();
// Drop stale intermediate buffers; input buffers are kept.
478 if (info.buffer && !info.fromArrayInput) {
479 info.buffer.reset();
480 }
481 }

483 // find initial GPU nodes and assign them to GPU
484 for (auto &info : _nodes) {
485 if (info.remServers == 0 && info.computeInGPU) {
// NOTE(review): doc line 486 is missing — presumably `assignToGPU(info);`;
// confirm upstream.
487 }
488 }

// The top node is last in the topological order; loop until it is done.
490 NodeInfo const &topNodeInfo = _nodes.back();
491 while (topNodeInfo.remServers != -2) {
492 // find finished GPU nodes
493 for (auto &info : _nodes) {
494 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
495 info.remServers = -2;
496 // Decrement number of remaining servers for clients and start GPU computations
497 for (auto *infoClient : info.clientInfos) {
498 --infoClient->remServers;
499 if (infoClient->computeInGPU && infoClient->remServers == 0) {
// NOTE(review): doc line 500 is missing — presumably
// `assignToGPU(*infoClient);`; confirm upstream.
501 }
502 }
503 for (auto *serverInfo : info.serverInfos) {
504 serverInfo->decrementRemainingClients();
505 }
506 }
507 }

509 // find next CPU node
510 auto it = _nodes.begin();
511 for (; it != _nodes.end(); it++) {
512 if (it->remServers == 0 && !it->computeInGPU)
513 break;
514 }

516 // if no CPU node available sleep for a while to save CPU usage
517 if (it == _nodes.end()) {
518 std::this_thread::sleep_for(std::chrono::milliseconds(1));
519 continue;
520 }

522 // compute next CPU node
523 NodeInfo &info = *it;
524 RooAbsArg const *node = info.absArg;
525 info.remServers = -2; // so that it doesn't get picked again

527 if (!info.fromArrayInput) {
528 computeCPUNode(node, info);
529 }

531 // Assign the clients that are computed on the GPU
532 for (auto *infoClient : info.clientInfos) {
533 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
// NOTE(review): doc line 534 is missing — presumably
// `assignToGPU(*infoClient);`; confirm upstream.
535 }
536 }
537 for (auto *serverInfo : info.serverInfos) {
538 serverInfo->decrementRemainingClients();
539 }
540 }

542 // return the final value
// NOTE(review): doc line 543 is missing — presumably
// `return _evalContextCPU.at(&_topNode);`, matching run(); confirm upstream.
544}
545
546/// Assign a node to be computed in the GPU. Scan its clients and also assign them
547/// in case they only depend on GPU nodes.
// NOTE(review): doc line 548 (the signature, per the member index
// `void Evaluator::assignToGPU(NodeInfo &info)`) is missing from this
// extraction.
549{
550 using namespace Detail;

// Mark the node as "running on GPU" for the heterogeneous scheduler.
552 info.remServers = -1;

554 auto node = static_cast<RooAbsReal const *>(info.absArg);

556 // wait for every server to finish
557 for (auto *infoServer : info.serverInfos) {
558 if (infoServer->event)
// NOTE(review): doc line 559 is missing — presumably a
// `cudaStreamWaitForEvent(...)` call on this node's stream; confirm upstream.
560 }

562 const std::size_t nOut = info.outputSize;

564 double *buffer = nullptr;
565 if (nOut == 1) {
566 buffer = &info.scalarBuffer;
567 _evalContextCPU.set(node, {buffer, nOut});
568 } else {
// Pinned memory when the result must also be visible on the host.
569 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
570 : _bufferManager->makeGpuBuffer(nOut);
571 buffer = info.buffer->deviceWritePtr();
572 }
574 _evalContextCUDA.set(node, {buffer, nOut});
575 node->doEval(_evalContextCUDA);
// NOTE(review): doc line 576 is missing — presumably a `cudaEventRecord(...)`
// so clients can wait on this node's completion; confirm upstream.
577 if (info.copyAfterEvaluation) {
578 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
579 }
580}
581
582/// Decides which nodes are assigned to the GPU in a CUDA fit.
584{
585 // Decide which nodes get evaluated on the GPU: we select nodes that support
586 // CUDA evaluation and have at least one input of size greater than one.
587 for (auto &info : _nodes) {
588 info.computeInGPU = false;
589 if (!info.absArg->canComputeBatchWithCuda()) {
590 continue;
591 }
592 for (NodeInfo const *serverInfo : info.serverInfos) {
593 if (serverInfo->outputSize > 1) {
594 info.computeInGPU = true;
595 break;
596 }
597 }
598 }
599
600 // In a second pass, figure out which nodes need to copy over their results.
601 for (auto &info : _nodes) {
602 info.copyAfterEvaluation = false;
603 // scalar nodes don't need copying
604 if (!info.isScalar()) {
605 for (auto *clientInfo : info.clientInfos) {
606 if (info.computeInGPU != clientInfo->computeInGPU) {
607 info.copyAfterEvaluation = true;
608 break;
609 }
610 }
611 }
612 }
613}
614
615/// Temporarily change the operation mode of a RooAbsArg until the
616/// Evaluator gets deleted.
618{
619 if (opMode != arg->operMode()) {
620 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, opMode));
621 }
622}
623
// Print a table summarizing every node in the computation graph (index, name,
// class, output size, whether it comes from data, and the first output value)
// to the given stream.
// NOTE(review): the title line and the two "\n" writes below go to std::cout
// while every other write goes to `os` — this looks inconsistent; presumably
// all output should go to `os`. Confirm the intended behavior upstream.
624void Evaluator::print(std::ostream &os)
625{
626 std::cout << "--- RooFit BatchMode evaluation ---\n";

// Fixed column widths for the table layout below.
628 std::vector<int> widths{9, 37, 20, 9, 10, 20};

630 auto printElement = [&](int iCol, auto const &t) {
631 const char separator = ' ';
632 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
633 os << "|";
634 };

636 auto printHorizontalRow = [&]() {
637 int n = 0;
638 for (int w : widths) {
639 n += w + 2;
640 }
641 for (int i = 0; i < n; i++) {
642 os << '-';
643 }
644 os << "|\n";
645 };

// NOTE(review): doc line 647 is missing — presumably a
// `printHorizontalRow();` call above the header; confirm upstream.
648

649 os << "|";
650 printElement(0, "Index");
651 printElement(1, "Name");
652 printElement(2, "Class");
653 printElement(3, "Size");
654 printElement(4, "From Data");
655 printElement(5, "1st value");
656 std::cout << "\n";

// NOTE(review): doc line 658 is missing — presumably another
// `printHorizontalRow();` call below the header; confirm upstream.
659

660 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
661 auto &nodeInfo = _nodes[iNode];
662 RooAbsArg *node = nodeInfo.absArg;

// The currently registered output span of this node (first value is printed).
664 auto span = _evalContextCPU.at(node);

666 os << "|";
667 printElement(0, iNode);
668 printElement(1, node->GetName());
669 printElement(2, node->ClassName());
670 printElement(3, nodeInfo.outputSize);
671 printElement(4, nodeInfo.fromArrayInput);
672 printElement(5, span[0]);

674 std::cout << "\n";
675 }

// NOTE(review): doc line 677 is missing — presumably a closing
// `printHorizontalRow();` call; confirm upstream.
678}
679
680/// Gets all the parameters of the RooAbsReal. This is in principle not
681/// necessary, because we can always ask the RooAbsReal itself, but the
682/// Evaluator has the cached information to get the answer quicker.
683/// Therefore, this is not meant to be used in general, just where it matters.
684/// \warning If we find another solution to get the parameters efficiently,
685/// this function might be removed without notice.
687{
688 RooArgSet parameters;
689 for (auto &nodeInfo : _nodes) {
690 if (nodeInfo.absArg->isFundamental()) {
691 parameters.add(*nodeInfo.absArg);
692 }
693 }
694 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
695 parameters.sort();
696 return parameters;
697}
698
699/// \brief Sets the offset mode for evaluation.
700///
701/// This function sets the offset mode for evaluation to the specified mode.
702/// It updates the offset mode for both CPU and CUDA evaluation contexts.
703///
704/// \param mode The offset mode to be set.
705///
706/// \note This function marks reducer nodes as dirty if the offset mode is
707/// changed, because only reducer nodes can use offsetting.
// NOTE(review): doc line 708 (the signature, per the member index
// `void Evaluator::setOffsetMode(RooFit::EvalContext::OffsetMode)`) is missing
// from this extraction.
709{
// NOTE(review): doc line 710 is missing — presumably the `if (...)` guard
// comparing the requested mode against the current one, which this early
// return belongs to; confirm upstream.
711 return;
712

// NOTE(review): doc lines 713-714 are missing — per the doc comment above,
// presumably the calls that update the offset mode on `_evalContextCPU` and
// `_evalContextCUDA`; confirm upstream.
715

716 for (auto &nodeInfo : _nodes) {
717 if (nodeInfo.absArg->isReducerNode()) {
718 nodeInfo.isDirty = true;
719 }
720 }
721}
722
723} // namespace RooFit
#define oocoutI(o, a)
#define oocxcoutI(o, a)
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
char name[80]
Definition TGX11.cxx:110
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
Definition RooAbsArg.h:76
OperMode operMode() const
Query the operation mode of this node.
Definition RooAbsArg.h:419
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
Definition RooAbsReal.h:63
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Definition RooArgSet.h:24
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
Definition EvalContext.h:91
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void enableVectorBuffers(bool enable)
OffsetMode _offsetMode
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
std::span< double > _currentOutput
void resize(std::size_t n)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
const bool _useGPU
Definition Evaluator.h:62
std::unordered_map< TNamed const *, NodeInfo * > _nodesMap
Definition Evaluator.h:68
std::vector< NodeInfo > _nodes
Definition Evaluator.h:67
bool _needToUpdateOutputSizes
Definition Evaluator.h:64
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void processCategory(NodeInfo &nodeInfo)
Process a category in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
Definition Evaluator.h:60
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
Definition Evaluator.h:66
RooFit::EvalContext _evalContextCPU
Definition Evaluator.h:65
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
std::stack< std::unique_ptr< ChangeOperModeRAII > > _changeOperModeRAIIs
Definition Evaluator.h:69
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
RooAbsReal & _topNode
Definition Evaluator.h:61
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
Definition RooRealVar.h:37
const char * GetName() const override
Returns name of object.
Definition TNamed.h:49
virtual const char * ClassName() const
Returns name of class to which the object belongs.
Definition TObject.cxx:227
const Int_t n
Definition legend1.C:16
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition CodegenImpl.h:67
@ FastEvaluations
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
Definition Evaluator.cxx:97
RooBatchCompute::CudaInterface::CudaStream * stream
RooAbsArg * absArg
bool isScalar() const
Definition Evaluator.cxx:99
std::size_t iNode
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::size_t outputSize
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...