Evaluator.cxx
1/*
2 * Project: RooFit
3 * Authors:
4 * Jonas Rembser, CERN 2021
5 * Emmanouil Michalainas, CERN 2021
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
14/**
15\file Evaluator.cxx
16\class RooFit::Evaluator
17\ingroup Roofitcore
18
19Evaluates a RooAbsReal object without relying on recursive graph
20traversal. Currently, it is used to evaluate a RooAbsReal object and to
21supply its value to the minimizer during a fit. The class scans the
22dependencies and schedules the computations in a safe and efficient way. The
23computations take place in the RooBatchCompute library and can be carried out
24by either the CPU or a CUDA-capable GPU. The Evaluator class takes care
25of data transfers. An instance of this class is created every time
26RooAbsPdf::fitTo() is called and gets destroyed when the fitting ends.
27**/
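//
// A minimal usage sketch (illustrative only, not part of this file). The
// observable and pdf below are assumptions; any RooAbsReal sitting on top of a
// computation graph is driven the same way (needs RooRealVar.h, RooGaussian.h
// and <vector> in addition to RooFit/Evaluator.h):
//
//    RooRealVar x{"x", "x", 0.0, -10.0, 10.0};
//    RooRealVar mu{"mu", "mu", 1.0};
//    RooRealVar sigma{"sigma", "sigma", 2.0};
//    RooGaussian gauss{"gauss", "gauss", x, mu, sigma};
//
//    RooFit::Evaluator evaluator{gauss, /*useGPU=*/false};
//    std::vector<double> xVals(100, 1.0); // one entry per event
//    evaluator.setInput("x", xVals, /*isOnDevice=*/false);
//    std::span<const double> results = evaluator.run(); // one value per entry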
28
29#include <RooFit/Evaluator.h>
30
31#include <RooAbsCategory.h>
32#include <RooAbsData.h>
33#include <RooAbsReal.h>
34#include <RooRealVar.h>
35#include <RooBatchCompute.h>
36#include <RooMsgService.h>
37#include <RooNameReg.h>
38#include <RooSimultaneous.h>
39
40#include <RooBatchCompute.h>
41
43#include "RooFitImplHelpers.h"
44
45#include <chrono>
46#include <iomanip>
47#include <numeric>
48#include <thread>
49
50namespace RooFit {
51
52namespace {
53
54// To avoid deleted move assignment.
55template <class T>
56void assignSpan(std::span<T> &to, std::span<T> const &from)
57{
58 to = from;
59}
60
61void logArchitectureInfo(bool useGPU)
62{
63 // We have to exit early if the message stream is not active. Otherwise it's
64 // possible that this function skips logging because it thinks it has
65 // already logged, but actually it didn't.
66 if (!RooMsgService::instance().isActive(nullptr, RooFit::Fitting, RooFit::INFO)) {
67 return;
68 }
69
70 // Don't repeat logging architecture info if the useGPU option didn't change
71 {
72 // Second element of pair tracks whether this function has already been called
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
75 return;
76 lastUseGPU = {useGPU, true};
77 }
78
79 auto log = [](std::string_view message) {
80 oocxcoutI(static_cast<RooAbsArg *>(nullptr), Fitting) << message << std::endl;
81 };
82
84 log("using generic CPU library compiled with no vectorizations");
85 } else {
86 log(std::string("using CPU computation library compiled with -m") + RooBatchCompute::cpuArchitectureName());
87 }
88 if (useGPU) {
89 log("using CUDA computation library");
90 }
91}
92
93} // namespace
94
95/// A struct used by the Evaluator to store information on the RooAbsArgs in
96/// the computation graph.
97struct NodeInfo {
98
99 bool isScalar() const { return outputSize == 1; }
100
 101 RooAbsArg *absArg = nullptr;
 102 RooAbsArg::OperMode originalOperMode;
103
104 std::shared_ptr<RooBatchCompute::AbsBuffer> buffer;
105 std::size_t iNode = 0;
106 int remClients = 0;
 107 int remServers = 0;
 108 bool copyAfterEvaluation = false;
109 bool fromArrayInput = false;
110 bool isVariable = false;
111 bool isDirty = true;
112 bool isCategory = false;
113 bool hasLogged = false;
114 bool computeInGPU = false;
115 std::size_t outputSize = 1;
116 std::size_t lastSetValCount = std::numeric_limits<std::size_t>::max();
117 double scalarBuffer = 0.0;
118 std::vector<NodeInfo *> serverInfos;
119 std::vector<NodeInfo *> clientInfos;
 120
 121 RooBatchCompute::CudaInterface::CudaEvent *event = nullptr;
 122 RooBatchCompute::CudaInterface::CudaStream *stream = nullptr;
 123
 124 /// Decrement the counter of clients that still need this node's result and
 125 /// release the buffer once no client needs it anymore. Called on the servers of a node that has been computed.
 126 void decrementRemainingClients()
 127 {
128 if (--remClients == 0 && !fromArrayInput) {
129 buffer.reset();
130 }
131 }
 132
 133 ~NodeInfo()
 134 {
 135 if (event)
 136 RooBatchCompute::dispatchCUDA->deleteCudaEvent(event);
 137 if (stream)
 138 RooBatchCompute::dispatchCUDA->deleteCudaStream(stream);
 139 }
140};
141
142/// Construct a new Evaluator. The constructor analyzes and saves metadata about the graph
143/// that is useful for the subsequent evaluation. If the CUDA mode is selected,
144/// there is also some CUDA-related initialization.
145///
146/// \param[in] absReal The RooAbsReal object that sits on top of the
147/// computation graph that we want to evaluate.
148/// \param[in] useGPU Whether the evaluation should preferably be done on the GPU.
149Evaluator::Evaluator(const RooAbsReal &absReal, bool useGPU)
150 : _topNode{const_cast<RooAbsReal &>(absReal)}, _useGPU{useGPU}
151{
152 RooBatchCompute::initCPU();
153 if (useGPU && RooBatchCompute::initCUDA() != 0) {
154 throw std::runtime_error("Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
155 }
156 // Some checks and logging of used architectures
157 logArchitectureInfo(_useGPU);
158
 159 _bufferManager =
 160 (useGPU ? RooBatchCompute::dispatchCUDA : RooBatchCompute::dispatchCPU)->createBufferManager();
 161
 162 RooArgSet serverSet;
 163 RooHelpers::getSortedComputationGraph(_topNode, serverSet);
164
165 _evalContextCPU.resize(serverSet.size());
166 if (useGPU) {
167 _evalContextCUDA.resize(serverSet.size());
168 }
169
170 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
171
172 // Fill the ordered nodes list and initialize the node info structs.
173 _nodes.reserve(serverSet.size());
174 std::size_t iNode = 0;
175 for (RooAbsArg *arg : serverSet) {
176
177 _nodes.emplace_back();
178 auto &nodeInfo = _nodes.back();
179 nodeInfo.absArg = arg;
180 nodeInfo.originalOperMode = arg->operMode();
181 nodeInfo.iNode = iNode;
182 nodeInfos[arg] = &nodeInfo;
183
184 if (dynamic_cast<RooRealVar const *>(arg)) {
185 nodeInfo.isVariable = true;
186 } else {
187 arg->setDataToken(iNode);
188 }
189 if (dynamic_cast<RooAbsCategory const *>(arg)) {
190 nodeInfo.isCategory = true;
191 }
192
193 ++iNode;
194 }
195
196 for (NodeInfo &info : _nodes) {
197 info.serverInfos.reserve(info.absArg->servers().size());
198 for (RooAbsArg *server : info.absArg->servers()) {
199 if (server->isValueServer(*info.absArg)) {
200 auto *serverInfo = nodeInfos.at(server);
201 info.serverInfos.emplace_back(serverInfo);
202 serverInfo->clientInfos.emplace_back(&info);
203 }
204 }
205 }
206
 207 syncDataTokens();
 208
209 if (_useGPU) {
210 // create events and streams for every node
211 for (auto &info : _nodes) {
 212 info.event = RooBatchCompute::dispatchCUDA->newCudaEvent(false);
 213 info.stream = RooBatchCompute::dispatchCUDA->newCudaStream();
 214 RooBatchCompute::Config cfg;
 215 cfg.setCudaStream(info.stream);
216 _evalContextCUDA.setConfig(info.absArg, cfg);
217 }
218 }
219}
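//
// Hedged sketch (illustrative, not part of this file): since the constructor
// throws std::runtime_error when the RooBatchCompute CUDA backend cannot be
// loaded, a caller that prefers the GPU can fall back to the CPU path. The
// pdf "model" below is an assumed placeholder:
//
//    std::unique_ptr<RooFit::Evaluator> evaluator;
//    try {
//       evaluator = std::make_unique<RooFit::Evaluator>(model, /*useGPU=*/true);
//    } catch (std::runtime_error const &) {
//       evaluator = std::make_unique<RooFit::Evaluator>(model, /*useGPU=*/false);
//    }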
220
221/// If there are servers with the same name that got de-duplicated in the
222/// `_nodes` list, we need to set their data tokens too. We find such nodes by
223/// visiting the servers of every known node.
224void Evaluator::syncDataTokens()
225{
226 for (NodeInfo &info : _nodes) {
227 std::size_t iValueServer = 0;
228 for (RooAbsArg *server : info.absArg->servers()) {
229 if (server->isValueServer(*info.absArg)) {
230 auto *knownServer = info.serverInfos[iValueServer]->absArg;
231 if (knownServer->hasDataToken()) {
232 server->setDataToken(knownServer->dataToken());
233 }
234 ++iValueServer;
235 }
236 }
237 }
238}
239
240void Evaluator::setInput(std::string const &name, std::span<const double> inputArray, bool isOnDevice)
241{
242 if (isOnDevice && !_useGPU) {
243 throw std::runtime_error("Evaluator can only take device array as input in CUDA mode!");
244 }
245
246 auto namePtr = RooNameReg::ptr(name.c_str());
247
248 // Iterate over the given data spans and add them to the data map. Check if
249 // they are used in the computation graph. If yes, add the span to the data
250 // map and set the node info accordingly.
251 std::size_t iNode = 0;
252 for (auto &info : _nodes) {
253 const bool fromArrayInput = info.absArg->namePtr() == namePtr;
254 if (fromArrayInput) {
255 info.fromArrayInput = true;
256 info.absArg->setDataToken(iNode);
257 info.outputSize = inputArray.size();
258 if (_useGPU && info.outputSize <= 1) {
259 // Empty or scalar observables from the data don't need to be
260 // copied to the GPU.
261 _evalContextCPU.set(info.absArg, inputArray);
262 _evalContextCUDA.set(info.absArg, inputArray);
263 } else if (_useGPU && info.outputSize > 1) {
264 // For simplicity, we put the data on both host and device for
265 // now. This could be optimized by inspecting the clients of the
266 // variable.
267 if (isOnDevice) {
268 _evalContextCUDA.set(info.absArg, inputArray);
269 auto gpuSpan = _evalContextCUDA.at(info.absArg);
270 info.buffer = _bufferManager->makeCpuBuffer(gpuSpan.size());
271 info.buffer->assignFromDevice(gpuSpan);
272 _evalContextCPU.set(info.absArg, {info.buffer->hostReadPtr(), gpuSpan.size()});
273 } else {
274 _evalContextCPU.set(info.absArg, inputArray);
275 auto cpuSpan = _evalContextCPU.at(info.absArg);
276 info.buffer = _bufferManager->makeGpuBuffer(cpuSpan.size());
277 info.buffer->assignFromHost(cpuSpan);
278 _evalContextCUDA.set(info.absArg, {info.buffer->deviceReadPtr(), cpuSpan.size()});
279 }
280 } else {
281 _evalContextCPU.set(info.absArg, inputArray);
282 }
283 }
284 info.isDirty = !info.fromArrayInput;
285 ++iNode;
286 }
287
288 _needToUpdateOutputSizes = true;
289}
290
291void Evaluator::updateOutputSizes()
292{
293 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
294 for (auto &info : _nodes) {
295 if (info.fromArrayInput) {
296 sizeMap[info.absArg] = info.outputSize;
297 } else {
298 // any buffer for temporary results is invalidated by resetting the output sizes
299 info.buffer.reset();
300 }
301 }
302
303 auto outputSizeMap =
304 RooFit::BatchModeDataHelpers::determineOutputSizes(_topNode, [&](RooFit::Detail::DataKey key) -> int {
305 auto found = sizeMap.find(key);
306 return found != sizeMap.end() ? found->second : -1;
307 });
308
309 for (auto &info : _nodes) {
310 info.outputSize = outputSizeMap.at(info.absArg);
311
312 // In principle we don't need dirty flag propagation because the driver
313 // takes care of deciding which node needs to be re-evaluated. However,
314 // disabling it also for scalar mode results in very long fitting times
315 // for specific models (test 14 in stressRooFit), which still needs to be
316 // understood. TODO.
317 if (!info.isScalar()) {
318 setOperMode(info.absArg, RooAbsArg::ADirty);
319 } else {
320 setOperMode(info.absArg, info.originalOperMode);
321 }
322 }
323
324 if (_useGPU) {
325 markGPUNodes();
326 }
327
328 _needToUpdateOutputSizes = false;
329}
330
331Evaluator::~Evaluator()
332{
333 for (auto &info : _nodes) {
334 if (!info.isVariable) {
335 info.absArg->resetDataToken();
336 }
337 }
338}
339
340void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
341{
342 using namespace Detail;
343
344 const std::size_t nOut = info.outputSize;
345
346 double *buffer = nullptr;
347 if (nOut == 1) {
348 buffer = &info.scalarBuffer;
349 if (_useGPU) {
350 _evalContextCUDA.set(node, {buffer, nOut});
351 }
352 } else {
353 if (!info.hasLogged && _useGPU) {
354 RooAbsArg const &arg = *info.absArg;
355 oocoutI(&arg, FastEvaluations) << "The argument " << arg.ClassName() << "::" << arg.GetName()
356 << " could not be evaluated on the GPU because the class doesn't support it. "
357 "Consider requesting or implementing it to benefit from a speed up."
358 << std::endl;
359 info.hasLogged = true;
360 }
361 if (!info.buffer) {
362 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
363 : _bufferManager->makeCpuBuffer(nOut);
364 }
365 buffer = info.buffer->hostWritePtr();
366 }
367 assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
368 _evalContextCPU.set(node, {buffer, nOut});
369 if (nOut > 1) {
370 _evalContextCPU.enableVectorBuffers(true);
371 }
372 if (info.isCategory) {
373 auto nodeAbsCategory = static_cast<RooAbsCategory const *>(node);
374 if (nOut == 1) {
375 buffer[0] = nodeAbsCategory->getCurrentIndex();
376 } else {
377 throw std::runtime_error("RooFit::Evaluator - non-scalar category values are not supported!");
378 }
379 } else {
380 auto nodeAbsReal = static_cast<RooAbsReal const *>(node);
381 nodeAbsReal->doEval(_evalContextCPU);
382 }
383 _evalContextCPU.resetVectorBuffers();
384 _evalContextCPU.enableVectorBuffers(false);
385 if (info.copyAfterEvaluation) {
386 _evalContextCUDA.set(node, {info.buffer->deviceReadPtr(), nOut});
 387 if (info.event) {
 388 RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
 389 }
390 }
391}
392
393/// Process a variable in the computation graph. This is a separate non-inlined
394/// function such that we can see in performance profiles how long this takes.
395void Evaluator::processVariable(NodeInfo &nodeInfo)
396{
397 RooAbsArg *node = nodeInfo.absArg;
398 auto *var = static_cast<RooRealVar const *>(node);
399 if (nodeInfo.lastSetValCount != var->valueResetCounter()) {
400 nodeInfo.lastSetValCount = var->valueResetCounter();
401 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
402 clientInfo->isDirty = true;
403 }
404 computeCPUNode(node, nodeInfo);
405 nodeInfo.isDirty = false;
406 }
407}
408
409/// Flags all the clients of a given node dirty. This is a separate non-inlined
410/// function such that we can see in performance profiles how long this takes.
411void Evaluator::setClientsDirty(NodeInfo &nodeInfo)
412{
413 for (NodeInfo *clientInfo : nodeInfo.clientInfos) {
414 clientInfo->isDirty = true;
415 }
416}
417
418/// Returns the value of the top node in the computation graph
419std::span<const double> Evaluator::run()
420{
421 if (_needToUpdateOutputSizes)
422 updateOutputSizes();
423
424 ++_nEvaluations;
425
426 if (_useGPU) {
427 return getValHeterogeneous();
428 }
429
430 for (auto &nodeInfo : _nodes) {
431 if (!nodeInfo.fromArrayInput) {
432 if (nodeInfo.isVariable) {
433 processVariable(nodeInfo);
434 } else {
435 if (nodeInfo.isDirty) {
436 setClientsDirty(nodeInfo);
437 computeCPUNode(nodeInfo.absArg, nodeInfo);
438 nodeInfo.isDirty = false;
439 }
440 }
441 }
442 }
443
444 // return the final output
445 return _evalContextCPU.at(&_topNode);
446}
447
448/// Returns the value of the top node in the computation graph
449std::span<const double> Evaluator::getValHeterogeneous()
450{
451 for (auto &info : _nodes) {
452 info.remClients = info.clientInfos.size();
453 info.remServers = info.serverInfos.size();
454 if (info.buffer && !info.fromArrayInput) {
455 info.buffer.reset();
456 }
457 }
458
459 // find initial GPU nodes and assign them to GPU
460 for (auto &info : _nodes) {
461 if (info.remServers == 0 && info.computeInGPU) {
462 assignToGPU(info);
463 }
464 }
465
466 NodeInfo const &topNodeInfo = _nodes.back();
467 while (topNodeInfo.remServers != -2) {
468 // find finished GPU nodes
469 for (auto &info : _nodes) {
470 if (info.remServers == -1 && !RooBatchCompute::dispatchCUDA->cudaStreamIsActive(info.stream)) {
471 info.remServers = -2;
472 // Decrement number of remaining servers for clients and start GPU computations
473 for (auto *infoClient : info.clientInfos) {
474 --infoClient->remServers;
475 if (infoClient->computeInGPU && infoClient->remServers == 0) {
476 assignToGPU(*infoClient);
477 }
478 }
479 for (auto *serverInfo : info.serverInfos) {
480 serverInfo->decrementRemainingClients();
481 }
482 }
483 }
484
485 // find next CPU node
486 auto it = _nodes.begin();
487 for (; it != _nodes.end(); it++) {
488 if (it->remServers == 0 && !it->computeInGPU)
489 break;
490 }
491
492 // if no CPU node available sleep for a while to save CPU usage
493 if (it == _nodes.end()) {
494 std::this_thread::sleep_for(std::chrono::milliseconds(1));
495 continue;
496 }
497
498 // compute next CPU node
499 NodeInfo &info = *it;
500 RooAbsArg const *node = info.absArg;
501 info.remServers = -2; // so that it doesn't get picked again
502
503 if (!info.fromArrayInput) {
504 computeCPUNode(node, info);
505 }
506
507 // Assign the clients that are computed on the GPU
508 for (auto *infoClient : info.clientInfos) {
509 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
510 assignToGPU(*infoClient);
511 }
512 }
513 for (auto *serverInfo : info.serverInfos) {
514 serverInfo->decrementRemainingClients();
515 }
516 }
517
518 // return the final value
519 return _evalContextCUDA.at(&_topNode);
520}
521
522/// Assign a node to be computed on the GPU. Scan its clients and also assign them
523/// in case they only depend on GPU nodes.
524void Evaluator::assignToGPU(NodeInfo &info)
525{
526 using namespace Detail;
527
528 info.remServers = -1;
529
530 auto node = static_cast<RooAbsReal const *>(info.absArg);
531
532 // wait for every server to finish
533 for (auto *infoServer : info.serverInfos) {
 534 if (infoServer->event)
 535 RooBatchCompute::dispatchCUDA->cudaStreamWaitForEvent(info.stream, infoServer->event);
 536 }
537
538 const std::size_t nOut = info.outputSize;
539
540 double *buffer = nullptr;
541 if (nOut == 1) {
542 buffer = &info.scalarBuffer;
543 _evalContextCPU.set(node, {buffer, nOut});
544 } else {
545 info.buffer = info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(nOut, info.stream)
546 : _bufferManager->makeGpuBuffer(nOut);
547 buffer = info.buffer->deviceWritePtr();
548 }
549 assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
550 _evalContextCUDA.set(node, {buffer, nOut});
 551 node->doEval(_evalContextCUDA);
 552 RooBatchCompute::dispatchCUDA->cudaEventRecord(info.event, info.stream);
553 if (info.copyAfterEvaluation) {
554 _evalContextCPU.set(node, {info.buffer->hostReadPtr(), nOut});
555 }
556}
557
558/// Decides which nodes are assigned to the GPU in a CUDA fit.
559void Evaluator::markGPUNodes()
560{
561 // Decide which nodes get evaluated on the GPU: we select nodes that support
562 // CUDA evaluation and have at least one input of size greater than one.
563 for (auto &info : _nodes) {
564 info.computeInGPU = false;
565 if (!info.absArg->canComputeBatchWithCuda()) {
566 continue;
567 }
568 for (NodeInfo const *serverInfo : info.serverInfos) {
569 if (serverInfo->outputSize > 1) {
570 info.computeInGPU = true;
571 break;
572 }
573 }
574 }
575
576 // In a second pass, figure out which nodes need to copy over their results.
577 for (auto &info : _nodes) {
578 info.copyAfterEvaluation = false;
579 // scalar nodes don't need copying
580 if (!info.isScalar()) {
581 for (auto *clientInfo : info.clientInfos) {
582 if (info.computeInGPU != clientInfo->computeInGPU) {
583 info.copyAfterEvaluation = true;
584 break;
585 }
586 }
587 }
588 }
589}
590
591/// Temporarily change the operation mode of a RooAbsArg until the
592/// Evaluator gets deleted.
593void Evaluator::setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
594{
595 if (opMode != arg->operMode()) {
596 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, opMode));
597 }
598}
599
600void Evaluator::print(std::ostream &os)
601{
 602 os << "--- RooFit BatchMode evaluation ---\n";
603
604 std::vector<int> widths{9, 37, 20, 9, 10, 20};
605
606 auto printElement = [&](int iCol, auto const &t) {
607 const char separator = ' ';
608 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
609 os << "|";
610 };
611
612 auto printHorizontalRow = [&]() {
613 int n = 0;
614 for (int w : widths) {
615 n += w + 2;
616 }
617 for (int i = 0; i < n; i++) {
618 os << '-';
619 }
620 os << "|\n";
621 };
622
623 printHorizontalRow();
624
625 os << "|";
626 printElement(0, "Index");
627 printElement(1, "Name");
628 printElement(2, "Class");
629 printElement(3, "Size");
630 printElement(4, "From Data");
631 printElement(5, "1st value");
 632 os << "\n";
633
634 printHorizontalRow();
635
636 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
637 auto &nodeInfo = _nodes[iNode];
638 RooAbsArg *node = nodeInfo.absArg;
639
640 auto span = _evalContextCPU.at(node);
641
642 os << "|";
643 printElement(0, iNode);
644 printElement(1, node->GetName());
645 printElement(2, node->ClassName());
646 printElement(3, nodeInfo.outputSize);
647 printElement(4, nodeInfo.fromArrayInput);
648 printElement(5, span[0]);
649
 650 os << "\n";
651 }
652
653 printHorizontalRow();
654}
655
656/// Gets all the parameters of the RooAbsReal. This is in principle not
657/// necessary, because we can always ask the RooAbsReal itself, but the
658/// Evaluator has the cached information to get the answer quicker.
659/// Therefore, this is not meant to be used in general, just where it matters.
660/// \warning If we find another solution to get the parameters efficiently,
661/// this function might be removed without notice.
662RooArgSet Evaluator::getParameters() const
663{
664 RooArgSet parameters;
665 for (auto &nodeInfo : _nodes) {
666 if (nodeInfo.isVariable) {
667 parameters.add(*nodeInfo.absArg);
668 }
669 }
670 // Just like in RooAbsArg::getParameters(), we sort the parameters alphabetically.
671 parameters.sort();
672 return parameters;
673}
674
675/// \brief Sets the offset mode for evaluation.
676///
677/// This function sets the offset mode for evaluation to the specified mode.
678/// It updates the offset mode for both CPU and CUDA evaluation contexts.
679///
680/// \param mode The offset mode to be set.
681///
682/// \note This function marks reducer nodes as dirty if the offset mode is
683/// changed, because only reducer nodes can use offsetting.
684void Evaluator::setOffsetMode(RooFit::EvalContext::OffsetMode mode)
685{
686 if (mode == _evalContextCPU._offsetMode)
687 return;
688
689 _evalContextCPU._offsetMode = mode;
690 _evalContextCUDA._offsetMode = mode;
691
692 for (auto &nodeInfo : _nodes) {
693 if (nodeInfo.absArg->isReducerNode()) {
694 nodeInfo.isDirty = true;
695 }
696 }
697}
698
699} // namespace RooFit