63void assignSpan(std::span<T> &to, std::span<T>
const &from)
68void logArchitectureInfo(
bool useGPU)
80 static std::pair<bool, bool> lastUseGPU;
81 if (lastUseGPU.second && lastUseGPU.first == useGPU)
83 lastUseGPU = {useGPU,
true};
86 auto log = [](std::string_view message) {
91 throw std::runtime_error(std::string(
"In: ") + __func__ +
"(), " + __FILE__ +
":" + __LINE__ +
92 ": Cuda implementation of the computing library is not available\n");
95 log(
"using generic CPU library compiled with no vectorizations");
100 log(
"using CUDA computation library");
119 std::shared_ptr<Detail::AbsBuffer>
buffer;
124 bool copyAfterEvaluation =
false;
138 std::unique_ptr<RooFit::Detail::CudaInterface::CudaEvent> event;
139 std::unique_ptr<RooFit::Detail::CudaInterface::CudaStream> stream;
143 void decrementRemainingClients()
160 : _bufferManager{std::make_unique<Detail::BufferManager>()},
166 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because ROOT was compiled without CUDA support!");
180 std::map<RooFit::Detail::DataKey, NodeInfo *> nodeInfos;
184 std::size_t iNode = 0;
188 auto &nodeInfo =
_nodes.back();
189 nodeInfo.absArg = arg;
190 nodeInfo.originalOperMode = arg->operMode();
191 nodeInfo.iNode = iNode;
192 nodeInfos[arg] = &nodeInfo;
195 nodeInfo.isVariable =
true;
197 arg->setDataToken(iNode);
200 nodeInfo.isCategory =
true;
207 info.serverInfos.reserve(info.absArg->servers().size());
208 for (
RooAbsArg *server : info.absArg->servers()) {
209 if (server->isValueServer(*info.absArg)) {
210 auto *serverInfo = nodeInfos.at(server);
211 info.serverInfos.emplace_back(serverInfo);
212 serverInfo->clientInfos.emplace_back(&info);
222 for (
auto &info :
_nodes) {
223 info.event = std::make_unique<CudaInterface::CudaEvent>(
false);
224 info.stream = std::make_unique<CudaInterface::CudaStream>();
226 cfg.setCudaStream(info.stream.get());
239 std::size_t iValueServer = 0;
240 for (
RooAbsArg *server : info.absArg->servers()) {
241 if (server->isValueServer(*info.absArg)) {
242 auto *knownServer = info.serverInfos[iValueServer]->absArg;
243 if (knownServer->hasDataToken()) {
244 server->setDataToken(knownServer->dataToken());
255 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
263 std::size_t iNode = 0;
264 for (
auto &info :
_nodes) {
265 const bool fromArrayInput = info.absArg->namePtr() == namePtr;
266 if (fromArrayInput) {
267 info.fromArrayInput =
true;
268 info.absArg->setDataToken(iNode);
269 info.outputSize = inputArray.size();
270 if (
_useGPU && info.outputSize <= 1) {
276 }
else if (
_useGPU && info.outputSize > 1) {
298 info.isDirty = !info.fromArrayInput;
307 std::map<RooFit::Detail::DataKey, std::size_t> sizeMap;
308 for (
auto &info :
_nodes) {
309 if (info.fromArrayInput) {
310 sizeMap[info.absArg] = info.outputSize;
319 auto found = sizeMap.find(key);
320 return found != sizeMap.
end() ? found->second : -1;
323 for (
auto &info :
_nodes) {
324 info.outputSize = outputSizeMap.at(info.absArg);
331 if (!info.isScalar()) {
349 for (
auto &info :
_nodes) {
350 info.absArg->resetDataToken();
356 using namespace Detail;
358 auto nodeAbsReal =
static_cast<RooAbsReal const *
>(node);
362 double *buffer =
nullptr;
375 <<
" could not be evaluated on the GPU because the class doesn't support it. "
376 "Consider requesting or implementing it to benefit from a speed up."
383 info.
buffer = info.copyAfterEvaluation ?
_bufferManager->makePinnedBuffer(nOut, info.stream.get())
389 buffer = info.
buffer->cpuWritePtr();
400 if (info.copyAfterEvaluation) {
414 auto *var =
static_cast<RooRealVar const *
>(node);
418 clientInfo->isDirty =
true;
430 clientInfo->isDirty =
true;
448 for (
auto &nodeInfo :
_nodes) {
449 if (!nodeInfo.fromArrayInput) {
450 if (nodeInfo.isVariable) {
453 if (nodeInfo.isDirty) {
456 nodeInfo.isDirty =
false;
470 for (
auto &info :
_nodes) {
471 info.remClients = info.clientInfos.size();
472 info.remServers = info.serverInfos.size();
473 if (info.buffer && !info.fromArrayInput) {
479 for (
auto &info :
_nodes) {
480 if (info.remServers == 0 && info.computeInGPU()) {
488 for (
auto &info :
_nodes) {
489 if (info.remServers == -1 && !info.stream->isActive()) {
492 for (
auto *infoClient : info.clientInfos) {
493 --infoClient->remServers;
494 if (infoClient->computeInGPU() && infoClient->remServers == 0) {
498 for (
auto *serverInfo : info.serverInfos) {
499 serverInfo->decrementRemainingClients();
506 for (; it !=
_nodes.end(); it++) {
507 if (it->remServers == 0 && !it->computeInGPU())
513 std::this_thread::sleep_for(std::chrono::milliseconds(1));
528 if (--infoClient->remServers == 0 && infoClient->computeInGPU()) {
533 serverInfo->decrementRemainingClients();
550 using namespace Detail;
559 if (infoServer->event)
560 info.stream->waitForEvent(*infoServer->event);
565 double *buffer =
nullptr;
570 info.
buffer = info.copyAfterEvaluation ?
_bufferManager->makePinnedBuffer(nOut, info.stream.get())
572 buffer = info.
buffer->gpuWritePtr();
578 if (info.copyAfterEvaluation) {
588 for (
auto &info :
_nodes) {
589 info.copyAfterEvaluation =
false;
591 if (!info.isScalar()) {
592 for (
auto *clientInfo : info.clientInfos) {
593 if (info.computeInGPU() != clientInfo->computeInGPU()) {
594 info.copyAfterEvaluation =
true;
614 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
616 std::vector<int> widths{9, 37, 20, 9, 10, 20};
618 auto printElement = [&](
int iCol,
auto const &t) {
619 const char separator =
' ';
620 os << separator << std::left << std::setw(widths[iCol]) << std::setfill(separator) << t;
624 auto printHorizontalRow = [&]() {
626 for (
int w : widths) {
629 for (
int i = 0; i <
n; i++) {
635 printHorizontalRow();
638 printElement(0,
"Index");
639 printElement(1,
"Name");
640 printElement(2,
"Class");
641 printElement(3,
"Size");
642 printElement(4,
"From Data");
643 printElement(5,
"1st value");
646 printHorizontalRow();
648 for (std::size_t iNode = 0; iNode <
_nodes.size(); ++iNode) {
649 auto &nodeInfo =
_nodes[iNode];
655 printElement(0, iNode);
656 printElement(1, node->
GetName());
658 printElement(3, nodeInfo.outputSize);
659 printElement(4, nodeInfo.fromArrayInput);
660 printElement(5, span[0]);
665 printHorizontalRow();
677 for (
auto &nodeInfo :
_nodes) {
678 if (nodeInfo.isVariable) {
679 parameters.
add(*nodeInfo.absArg);
704 for (
auto &nodeInfo :
_nodes) {
705 if (nodeInfo.absArg->isReducerNode()) {
706 nodeInfo.isDirty =
true;
Option_t Option_t TPoint TPoint const char mode
Common abstract base class for objects that represent a value and a "shape" in RooFit.
virtual bool canComputeBatchWithCuda() const
virtual bool isReducerNode() const
TIterator Use end() or range-based loops.")
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
Storage_t::size_type size() const
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to all real-valued objects.
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute library.
void set(RooAbsArg const *arg, std::span< const double > const &span)
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void resetVectorBuffers()
void enableVectorBuffers(bool enable)
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
std::span< double > _currentOutput
void resize(std::size_t n)
void print(std::ostream &os)
void setClientsDirty(NodeInfo &nodeInfo)
Flags all the clients of a given node dirty.
RooArgSet getParameters() const
Gets all the parameters of the RooAbsReal.
void setOffsetMode(RooFit::EvalContext::OffsetMode)
Sets the offset mode for evaluation.
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list, we need to set their data tokens too.
std::vector< NodeInfo > _nodes
bool _needToUpdateOutputSizes
std::span< const double > getValHeterogeneous()
Returns the value of the top node in the computation graph.
std::span< const double > run()
Returns the value of the top node in the computation graph.
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
void processVariable(NodeInfo &nodeInfo)
Process a variable in the computation graph.
void markGPUNodes()
Decides which nodes are assigned to the GPU in a CUDA fit.
void assignToGPU(NodeInfo &info)
Assign a node to be computed in the GPU.
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
std::unique_ptr< Detail::BufferManager > _bufferManager
void computeCPUNode(const RooAbsArg *node, NodeInfo &info)
std::stack< std::unique_ptr< ChangeOperModeRAII > > _changeOperModeRAIIs
void setOperMode(RooAbsArg *arg, RooAbsArg::OperMode opMode)
Temporarily change the operation mode of a RooAbsArg until the Evaluator gets deleted.
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
RVec< PromoteType< T > > log(const RVec< T > &v)
std::string cpuArchitectureName()
Architecture cpuArchitecture()
void cudaEventRecord(CudaEvent &, CudaStream &)
Records a CUDA event.
void copyDeviceToHost(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the CUDA device to the host.
void copyHostToDevice(const T *src, T *dest, std::size_t n, CudaStream *=nullptr)
Copies data from the host to the CUDA device.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or other types of arguments).
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
std::size_t lastSetValCount
std::vector< NodeInfo * > serverInfos
std::shared_ptr< Detail::AbsBuffer > buffer
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos