49#include <unordered_set>
57void assignSpan(std::span<T> &to, std::span<T>
const &from)
62void logArchitectureInfo(
bool useGPU)
74 static std::pair<bool, bool> lastUseGPU;
75 if (lastUseGPU.second && lastUseGPU.first == useGPU)
77 lastUseGPU = {useGPU,
true};
80 auto log = [](std::string_view message) {
81 oocxcoutI(
static_cast<RooAbsArg *
>(
nullptr),
Fitting) << message << std::endl;
85 log(
"using generic CPU library compiled with no vectorizations");
90 log(
"using CUDA computation library");
105 std::shared_ptr<RooBatchCompute::AbsBuffer>
buffer;
157 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
173 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
177 std::size_t iNode = 0;
181 auto &nodeInfo =
_nodes.back();
184 nodeInfo.absArg = arg;
185 nodeInfo.originalOperMode = arg->operMode();
186 nodeInfo.iNode = iNode;
187 nodeInfos[arg] = &nodeInfo;
190 nodeInfo.isVariable =
true;
192 arg->setDataToken(iNode);
195 nodeInfo.isCategory =
true;
202 info.serverInfos.reserve(info.absArg->servers().size());
204 if (server->isValueServer(*info.absArg)) {
205 auto *serverInfo = nodeInfos.at(server);
206 info.serverInfos.emplace_back(serverInfo);
207 serverInfo->clientInfos.emplace_back(&info);
213 _nodes.back().isValueServer =
true;
214 for (
auto iter =
_nodes.rbegin(); iter !=
_nodes.rend(); ++iter) {
215 if (!iter->isValueServer)
217 for (
auto &serverInfo : iter->serverInfos) {
218 serverInfo->isValueServer =
true;
226 for (
auto &info :
_nodes) {
242 std::size_t iValueServer = 0;
244 if (server->isValueServer(*info.absArg)) {
245 auto *knownServer = info.serverInfos[iValueServer]->absArg;
246 if (knownServer->hasDataToken()) {
247 server->setDataToken(knownServer->dataToken());
258 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
297 info.
buffer->assignFromDevice(gpuSpan);
303 info.
buffer->assignFromHost(cpuSpan);
310 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
311 for (
auto &info :
_nodes) {
312 if (info.fromArrayInput) {
313 sizeMap[info.absArg] = info.outputSize;
322 auto found = sizeMap.find(key);
323 return found != sizeMap.end() ? found->second : -1;
326 for (
auto &info :
_nodes) {
327 info.outputSize = outputSizeMap.at(info.absArg);
340 for (
auto &info :
_nodes) {
341 if (!info.isVariable) {
342 info.absArg->resetDataToken();
353 double *buffer =
nullptr;
363 <<
" could not be evaluated on the GPU because the class doesn't support it. "
364 "Consider requesting or implementing it to benefit from a speed up."
372 buffer = info.
buffer->hostWritePtr();
382 buffer[0] = nodeAbsCategory->getCurrentIndex();
384 throw std::runtime_error(
"RooFit::Evaluator - non-scalar category values are not supported!");
387 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
405 auto *var =
static_cast<RooRealVar const *
>(node);
422 if (nodeInfo.
lastCatVal != cat->getCurrentIndex()) {
453 for (
auto &nodeInfo :
_nodes) {
454 if (!nodeInfo.fromArrayInput) {
455 if (nodeInfo.isVariable) {
457 }
else if (nodeInfo.isCategory) {
460 if (nodeInfo.isDirty) {
463 nodeInfo.isDirty =
false;
476 for (
auto &info :
_nodes) {
477 info.remClients = info.clientInfos.size();
478 info.remServers = info.serverInfos.size();
479 if (info.buffer && !info.fromArrayInput) {
485 for (
auto &info :
_nodes) {
486 if (info.remServers == 0 && info.computeInGPU) {
494 for (
auto &info :
_nodes) {
496 info.remServers = -2;
498 for (
auto *infoClient : info.clientInfos) {
499 --infoClient->remServers;
500 if (infoClient->computeInGPU && infoClient->remServers == 0) {
504 for (
auto *serverInfo : info.serverInfos) {
505 serverInfo->decrementRemainingClients();
512 for (; it !=
_nodes.end(); it++) {
513 if (it->remServers == 0 && !it->computeInGPU)
519 std::this_thread::sleep_for(std::chrono::milliseconds(1));
534 if (--infoClient->remServers == 0 && infoClient->computeInGPU) {
539 serverInfo->decrementRemainingClients();
559 if (infoServer->event)
565 double *buffer =
nullptr;
572 buffer = info.
buffer->deviceWritePtr();
588 for (
auto &info :
_nodes) {
589 info.computeInGPU =
false;
590 if (!info.absArg->canComputeBatchWithCuda()) {
595 info.computeInGPU =
true;
602 for (
auto &info :
_nodes) {
603 info.copyAfterEvaluation =
false;
605 if (!info.isScalar()) {
606 for (
auto *clientInfo : info.clientInfos) {
607 if (info.computeInGPU != clientInfo->computeInGPU) {
608 info.copyAfterEvaluation =
true;
640 auto out = std::make_unique<ChangeOperModeRAII>();
641 std::unordered_set<RooAbsArg *> visited;
643 std::vector<RooAbsArg *> queue;
644 queue.reserve(
_nodes.size());
645 for (
auto &info :
_nodes) {
646 queue.push_back(info.absArg);
649 while (!queue.empty()) {
652 if (!visited.insert(node).second)
655 out->change(node, opMode);
661 queue.push_back(client);
670 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
672 std::vector<int> widths{9, 37, 20, 9, 10, 20};
674 auto printElement = [&](
int iCol,
auto const &t) {
675 const char separator =
' ';
676 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
680 auto printHorizontalRow = [&]() {
682 for (
int w : widths) {
685 for (
int i = 0; i <
n; i++) {
691 printHorizontalRow();
694 printElement(0,
"Index");
695 printElement(1,
"Name");
696 printElement(2,
"Class");
697 printElement(3,
"Size");
698 printElement(4,
"From Data");
699 printElement(5,
"1st value");
702 printHorizontalRow();
704 for (std::size_t iNode = 0; iNode <
_nodes.size(); ++iNode) {
705 auto &nodeInfo =
_nodes[iNode];
711 printElement(0, iNode);
712 printElement(1, node->
GetName());
714 printElement(3, nodeInfo.outputSize);
715 printElement(4, nodeInfo.fromArrayInput);
716 printElement(5, span[0]);
721 printHorizontalRow();
733 for (
auto &nodeInfo :
_nodes) {
734 if (nodeInfo.isValueServer && nodeInfo.absArg->isFundamental()) {
735 parameters.
add(*nodeInfo.absArg);
760 for (
auto &nodeInfo :
_nodes) {
761 if (nodeInfo.absArg->isReducerNode()) {
762 nodeInfo.isDirty =
true;
Common abstract base class for objects that represent a value and a "shape" in RooFit.
void setDataToken(std::size_t index)
Sets the token for retrieving results in the BatchMode. For internal use only.
const RefCountList_t & valueClients() const
List of all value clients of this object. Value clients receive value updates.
const RefCountList_t & servers() const
List of all servers of this object.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
virtual void doEval(RooFit::EvalContext &) const
Base function for computing multiple values of a RooAbsReal.
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
std::unique_ptr< ChangeOperModeRAII > setOperModes(RooAbsArg::OperMode opMode)
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
std::unordered_map< TNamed const *, NodeInfo * > _nodesMap
std::unique_ptr< ChangeOperModeRAII > _operModeChanges
std::vector< NodeInfo > _nodes
bool _needToUpdateOutputSizes
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void processCategory(NodeInfo &nodeInfo)
Process a category in the computation graph.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
RVec< PromoteType< T > > log(const RVec< T > &v)
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
RooBatchCompute::CudaInterface::CudaStream * stream
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...