79 auto log = [](std::string_view message) {
84 log(
"using generic CPU library compiled with no vectorizations");
89 log(
"using CUDA computation library");
106 std::shared_ptr<RooBatchCompute::AbsBuffer>
buffer;
155 throw std::runtime_error(
"Can't create Evaluator in CUDA mode because RooBatchCompute CUDA could not be loaded!");
171 std::map<RooFit::Detail::DataKey, NodeInfo *>
nodeInfos;
175 std::size_t iNode = 0;
181 nodeInfo.originalOperMode = arg->operMode();
188 arg->setDataToken(iNode);
198 info.serverInfos.reserve(
info.absArg->servers().size());
244 throw std::runtime_error(
"Evaluator can only take device array as input in CUDA mode!");
252 std::size_t iNode = 0;
254 const bool fromArrayInput =
info.absArg->namePtr() == namePtr;
255 if (fromArrayInput) {
256 info.fromArrayInput =
true;
257 info.absArg->setDataToken(iNode);
285 info.isDirty = !
info.fromArrayInput;
289 _needToUpdateOutputSizes =
true;
292void Evaluator::updateOutputSizes()
294 std::map<RooFit::Detail::DataKey, std::size_t>
sizeMap;
295 for (
auto &
info : _nodes) {
296 if (
info.fromArrayInput) {
306 auto found =
sizeMap.find(key);
307 return found !=
sizeMap.
end() ? found->second : -1;
310 for (
auto &
info : _nodes) {
318 if (!
info.isScalar()) {
321 setOperMode(
info.absArg,
info.originalOperMode);
329 _needToUpdateOutputSizes =
false;
332Evaluator::~Evaluator()
334 for (
auto &
info : _nodes) {
335 if(!
info.isVariable) {
336 info.absArg->resetDataToken();
343 using namespace Detail;
347 const std::size_t
nOut =
info.outputSize;
349 double *buffer =
nullptr;
351 buffer = &
info.scalarBuffer;
353 _evalContextCUDA.set(node, {buffer,
nOut});
356 if (!
info.hasLogged && _useGPU) {
359 <<
" could not be evaluated on the GPU because the class doesn't support it. "
360 "Consider requesting or implementing it to benefit from a speed up."
362 info.hasLogged =
true;
365 info.buffer =
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut,
info.stream)
366 : _bufferManager->makeCpuBuffer(
nOut);
368 buffer =
info.buffer->hostWritePtr();
370 assignSpan(_evalContextCPU._currentOutput, {buffer, nOut});
371 _evalContextCPU.set(node, {buffer,
nOut});
373 _evalContextCPU.enableVectorBuffers(
true);
376 _evalContextCPU.resetVectorBuffers();
377 _evalContextCPU.enableVectorBuffers(
false);
378 if (
info.copyAfterEvaluation) {
379 _evalContextCUDA.set(node, {
info.buffer->deviceReadPtr(),
nOut});
391 auto *var =
static_cast<RooRealVar const *
>(node);
392 if (
nodeInfo.lastSetValCount != var->valueResetCounter()) {
393 nodeInfo.lastSetValCount = var->valueResetCounter();
412std::span<const double> Evaluator::run()
414 if (_needToUpdateOutputSizes)
420 return getValHeterogeneous();
438 return _evalContextCPU.at(&_topNode);
442std::span<const double> Evaluator::getValHeterogeneous()
444 for (
auto &
info : _nodes) {
445 info.remClients =
info.clientInfos.size();
446 info.remServers =
info.serverInfos.size();
447 if (
info.buffer && !
info.fromArrayInput) {
453 for (
auto &
info : _nodes) {
454 if (
info.remServers == 0 &&
info.computeInGPU()) {
462 for (
auto &
info : _nodes) {
464 info.remServers = -2;
479 auto it = _nodes.
begin();
480 for (; it != _nodes.end(); it++) {
481 if (it->remServers == 0 && !it->computeInGPU())
486 if (it == _nodes.end()) {
487 std::this_thread::sleep_for(std::chrono::milliseconds(1));
494 info.remServers = -2;
496 if (!
info.fromArrayInput) {
497 computeCPUNode(node,
info);
512 return _evalContextCUDA.at(&_topNode);
519 using namespace Detail;
521 info.remServers = -1;
531 const std::size_t
nOut =
info.outputSize;
533 double *buffer =
nullptr;
535 buffer = &
info.scalarBuffer;
536 _evalContextCPU.set(node, {buffer,
nOut});
538 info.buffer =
info.copyAfterEvaluation ? _bufferManager->makePinnedBuffer(
nOut,
info.stream)
539 : _bufferManager->makeGpuBuffer(
nOut);
540 buffer =
info.buffer->deviceWritePtr();
542 assignSpan(_evalContextCUDA._currentOutput, {buffer, nOut});
543 _evalContextCUDA.set(node, {buffer,
nOut});
544 node->doEval(_evalContextCUDA);
546 if (
info.copyAfterEvaluation) {
547 _evalContextCPU.set(node, {
info.buffer->hostReadPtr(),
nOut});
552void Evaluator::markGPUNodes()
554 for (
auto &
info : _nodes) {
555 info.copyAfterEvaluation =
false;
557 if (!
info.isScalar()) {
560 info.copyAfterEvaluation =
true;
573 _changeOperModeRAIIs.emplace(std::make_unique<ChangeOperModeRAII>(arg,
opMode));
577void Evaluator::print(std::ostream &os)
579 std::cout <<
"--- RooFit BatchMode evaluation ---\n";
581 std::vector<int>
widths{9, 37, 20, 9, 10, 20};
584 const char separator =
' ';
585 os << separator << std::left << std::setw(
widths[
iCol]) << std::setfill(separator) << t;
594 for (
int i = 0; i <
n; i++) {
613 for (std::size_t iNode = 0; iNode < _nodes.size(); ++iNode) {
617 auto span = _evalContextCPU.at(node);
663 if (
mode == _evalContextCPU._offsetMode)
666 _evalContextCPU._offsetMode =
mode;
667 _evalContextCUDA._offsetMode =
mode;
670 if (
nodeInfo.absArg->isReducerNode()) {
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char mode
const_iterator begin() const
const_iterator end() const
Common abstract base class for objects that represent a value and a "shape" in RooFit.
virtual bool canComputeBatchWithCuda() const
virtual bool isReducerNode() const
OperMode operMode() const
Query the operation mode of this node.
A space to attach TBranches.
virtual bool add(const RooAbsArg &var, bool silent=false)
Add the specified argument to list.
void sort(bool reverse=false)
Sort collection using std::sort and name comparison.
Abstract base class for objects that represent a real value and implements functionality common to al...
RooArgSet is a container object that can hold multiple RooAbsArg objects.
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
void setCudaStream(CudaInterface::CudaStream *cudaStream)
virtual void deleteCudaEvent(CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaEvent * newCudaEvent(bool forTiming) const =0
virtual void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const =0
virtual std::unique_ptr< AbsBufferManager > createBufferManager() const =0
virtual void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const =0
virtual CudaInterface::CudaStream * newCudaStream() const =0
virtual void deleteCudaStream(CudaInterface::CudaStream *) const =0
virtual bool cudaStreamIsActive(CudaInterface::CudaStream *) const =0
void set(RooAbsArg const *arg, std::span< const double > const &span)
std::span< const double > at(RooAbsArg const *arg, RooAbsArg const *caller=nullptr)
void setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
void resize(std::size_t n)
void syncDataTokens()
If there are servers with the same name that got de-duplicated in the _nodes list,...
std::vector< NodeInfo > _nodes
Evaluator(const RooAbsReal &absReal, bool useGPU=false)
Construct a new Evaluator.
std::unique_ptr< RooBatchCompute::AbsBufferManager > _bufferManager
void setInput(std::string const &name, std::span< const double > inputArray, bool isOnDevice)
RooFit::EvalContext _evalContextCUDA
RooFit::EvalContext _evalContextCPU
static RooMsgService & instance()
Return reference to singleton instance.
static const TNamed * ptr(const char *stringPtr)
Return a unique TNamed pointer for given C++ string.
Variable that can be changed from the outside.
const char * GetName() const override
Returns name of object.
virtual const char * ClassName() const
Returns name of class to which the object belongs.
R__EXTERN RooBatchComputeInterface * dispatchCUDA
std::string cpuArchitectureName()
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Architecture cpuArchitecture()
int initCPU()
Inspect hardware capabilities, and load the optimal library for RooFit computations.
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
void getSortedComputationGraph(RooAbsArg const &func, RooArgSet &out)
A struct used by the Evaluator to store information on the RooAbsArgs in the computation graph.
RooBatchCompute::CudaInterface::CudaStream * stream
std::size_t lastSetValCount
RooBatchCompute::CudaInterface::CudaEvent * event
std::vector< NodeInfo * > serverInfos
bool computeInGPU() const
RooAbsArg::OperMode originalOperMode
std::vector< NodeInfo * > clientInfos
std::shared_ptr< RooBatchCompute::AbsBuffer > buffer
void decrementRemainingClients()
Check the servers of a node that has been computed and release its resources if they are no longer ne...