56void assignSpan(std::span<T> &to, std::span<T>
const &from)
61void logArchitectureInfo(
bool useGPU)
73 static std::pair<bool, bool> lastUseGPU;
74 if (lastUseGPU.second && lastUseGPU.first == useGPU)
76 lastUseGPU = {useGPU,
true};
79 auto log = [](std::string_view message) {
84 log(
"using generic CPU library compiled with no vectorizations");
89 log(
"using CUDA computation library");
104 std::shared_ptr<RooBatchCompute::AbsBuffer>
buffer;
150 : _topNode{const_cast<
RooAbsReal &>(absReal)}, _useGPU{useGPU}
154 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
170 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
174 std::size_t iNode = 0;
178 auto &nodeInfo =
_nodes.back();
179 nodeInfo.absArg = arg;
180 nodeInfo.originalOperMode = arg->operMode();
181 nodeInfo.iNode = iNode;
182 nodeInfos[arg] = &nodeInfo;
185 nodeInfo.isVariable =
true;
187 arg->setDataToken(iNode);
190 nodeInfo.isCategory =
true;
197 info.serverInfos.reserve(info.absArg->servers().size());
198 for (
RooAbsArg *server : info.absArg->servers()) {
199 if (server->isValueServer(*info.absArg)) {
200 auto *serverInfo = nodeInfos.at(server);
201 info.serverInfos.emplace_back(serverInfo);
202 serverInfo->clientInfos.emplace_back(&info);
211 for (
auto &info :
_nodes) {
227 std::size_t iValueServer = 0;
228 for (
RooAbsArg *server : info.absArg->servers()) {
229 if (server->isValueServer(*info.absArg)) {
230 auto *knownServer = info.serverInfos[iValueServer]->absArg;
231 if (knownServer->hasDataToken()) {
232 server->setDataToken(knownServer->dataToken());
243 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
251 std::size_t iNode = 0;
252 for (
auto &info :
_nodes) {
253 const bool fromArrayInput = info.absArg->namePtr() == namePtr;
254 if (fromArrayInput) {
255 info.fromArrayInput =
true;
256 info.absArg->setDataToken(iNode);
257 info.outputSize = inputArray.size();
258 if (
_useGPU && info.outputSize <= 1) {
263 }
else if (
_useGPU && info.outputSize > 1) {
271 info.buffer->assignFromDevice(gpuSpan);
277 info.buffer->assignFromHost(cpuSpan);
284 info.isDirty = !info.fromArrayInput;
288 _needToUpdateOutputSizes =
true;
291void Evaluator::updateOutputSizes()
293 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
294 for (
auto &info : _nodes) {
295 if (info.fromArrayInput) {
296 sizeMap[info.absArg] = info.outputSize;
305 auto found = sizeMap.find(key);
306 return found != sizeMap.end() ? found->second : -1;
309 for (
auto &info : _nodes) {
310 info.outputSize = outputSizeMap.at(info.absArg);
317 if (!info.isScalar()) {
320 setOperMode(info.absArg, info.originalOperMode);
328 _needToUpdateOutputSizes =
false;
331Evaluator::~Evaluator()
333 for (
auto &info : _nodes) {
334 if (!info.isVariable) {
335 info.absArg->resetDataToken();
342 using namespace Detail;
346 double *buffer =
nullptr;
350 _evalContextCUDA.set(node, {buffer, nOut});
356 <<
" could not be evaluated on the GPU because the class doesn't support it. "
357 "Consider requesting or implementing it to benefit from a speed up."
363 : _bufferManager->makeCpuBuffer(nOut);
365 buffer = info.
buffer->hostWritePtr();
367 assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
368 _evalContextCPU.set(node, {buffer, nOut});
370 _evalContextCPU.enableVectorBuffers(
true);
375 buffer[0] = nodeAbsCategory->getCurrentIndex();
377 throw std::runtime_error(
"RooFit::Evaluator - non-scalar category values are not supported!");
380 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
381 nodeAbsReal->
doEval(_evalContextCPU);
383 _evalContextCPU.resetVectorBuffers();
384 _evalContextCPU.enableVectorBuffers(
false);
386 _evalContextCUDA.set(node, {info.
buffer->deviceReadPtr(), nOut});
398 auto *var =
static_cast<RooRealVar const *
>(node);
402 clientInfo->isDirty =
true;
404 computeCPUNode(node, nodeInfo);
414 clientInfo->isDirty =
true;
419std::span<const double> Evaluator::run()
421 if (_needToUpdateOutputSizes)
427 return getValHeterogeneous();
430 for (
auto &nodeInfo : _nodes) {
431 if (!nodeInfo.fromArrayInput) {
432 if (nodeInfo.isVariable) {
433 processVariable(nodeInfo);
435 if (nodeInfo.isDirty) {
436 setClientsDirty(nodeInfo);
437 computeCPUNode(nodeInfo.absArg, nodeInfo);
438 nodeInfo.isDirty =
false;
445 return _evalContextCPU.at(&_topNode);
449std::span<const double> Evaluator::getValHeterogeneous()
451 for (
auto &info : _nodes) {
452 info.remClients = info.clientInfos.size();
453 info.remServers = info.serverInfos.size();
454 if (info.buffer && !info.fromArrayInput) {
460 for (
auto &info : _nodes) {
461 if (info.remServers == 0 && info.computeInGPU) {
466 NodeInfo const &topNodeInfo = _nodes.back();
469 for (
auto &info : _nodes) {
473 for (
auto *infoClient : info.clientInfos) {
474 --infoClient->remServers;
475 if (infoClient->computeInGPU && infoClient->remServers == 0) {
476 assignToGPU(*infoClient);
479 for (
auto *serverInfo : info.serverInfos) {
480 serverInfo->decrementRemainingClients();
486 auto it = _nodes.begin();
487 for (; it != _nodes.end(); it++) {
488 if (it->remServers == 0 && !it->computeInGPU)
493 if (it == _nodes.end()) {
494 std::this_thread::sleep_for(std::chrono::milliseconds(1));
504 computeCPUNode(node, info);
509 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
510 assignToGPU(*infoClient);
514 serverInfo->decrementRemainingClients();
519 return _evalContextCUDA.at(&_topNode);
526 using namespace Detail;
534 if (infoServer->event)
540 double *buffer =
nullptr;
543 _evalContextCPU.set(node, {buffer, nOut});
546 : _bufferManager->makeGpuBuffer(nOut);
547 buffer = info.
buffer->deviceWritePtr();
549 assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
550 _evalContextCUDA.set(node, {buffer, nOut});
551 node->doEval(_evalContextCUDA);
554 _evalContextCPU.set(node, {info.
buffer->hostReadPtr(), nOut});
559void Evaluator::markGPUNodes()
563 for (
auto &info : _nodes) {
564 info.computeInGPU =
false;
565 if (!info.absArg->canComputeBatchWithCuda()) {
568 for (
NodeInfo const *serverInfo : info.serverInfos) {
569 if (serverInfo->outputSize > 1) {
570 info.computeInGPU =
true;
577 for (
auto &info : _nodes) {
578 info.copyAfterEvaluation =
false;
580 if (!info.isScalar()) {
581 for (
auto *clientInfo : info.clientInfos) {
582 if (info.computeInGPU != clientInfo->computeInGPU) {
583 info.copyAfterEvaluation =
true;
596 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg, opMode));
600void Evaluator::print(std::ostream &os)
602 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
604 std::vector<int> widths{9, 37, 20, 9, 10, 20};
606 auto printElement = [&](
int iCol,
auto const &t) {
607 const char separator =
' ';
608 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
612 auto printHorizontalRow = [&]() {
614 for (
int w : widths) {
617 for (
int i = 0; i <
n; i++) {
623 printHorizontalRow();
626 printElement(0,
"Index");
627 printElement(1,
"Name");
628 printElement(2,
"Class");
629 printElement(3,
"Size");
630 printElement(4,
"From Data");
631 printElement(5,
"1st value");
634 printHorizontalRow();
636 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
637 auto &nodeInfo = _nodes[iNode];
640 auto span = _evalContextCPU.at(node);
643 printElement(0, iNode);
644 printElement(1, node->
GetName());
646 printElement(3, nodeInfo.outputSize);
647 printElement(4, nodeInfo.fromArrayInput);
648 printElement(5, span[0]);
653 printHorizontalRow();
665 for (
auto &nodeInfo : _nodes) {
666 if (nodeInfo.isVariable) {
667 parameters.
add(*nodeInfo.absArg);
686 if (
mode == _evalContextCPU._offsetMode)
689 _evalContextCPU._offsetMode =
mode;
690 _evalContextCUDA._offsetMode =
mode;
692 for (
auto &nodeInfo : _nodes) {
693 if (nodeInfo.absArg->isReducerNode()) {
694 nodeInfo.isDirty =
true;
Option_t Option_t TPoint TPoint const char mode
Common abstract base class for objects that represent a value and a "shape" in RooFit.
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implement functionality common to all real-valued objects.
virtual void doEval(RooFit::EvalContext &) const
Base function for computing multiple values of a RooAbsReal.
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library.
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
void resize(std::size_t n)
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list, their data tokens need to be set as well.
std::vector< NodeInfo > _nodes
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
RVec< PromoteType< T > > log(const RVec< T > &v)
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loaded.
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or other types of arguments).
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
RooBatchCompute::CudaInterface::CudaStream * stream
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer needed.