doc/hackathon/RooBatchCompute_8cxx_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN, September 2020

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


/**

\file RooBatchCompute.cxx

\class RbcClass

\ingroup roofit_dev_docs_batchcompute


This file contains the code for cpu computations using the RooBatchCompute library.

**/


#include "RooBatchCompute.h"

#include "RooNaNPacker.h"

#include "Batches.h"


#include <ROOT/RConfig.hxx>


#ifdef ROOBATCHCOMPUTE_USE_IMT

#include <ROOT/TExecutor.hxx>

#endif


#include <Math/Util.h>


#include <algorithm>

#include <functional>

#include <map>

#include <queue>

#include <sstream>

#include <stdexcept>


#include <vector>


#ifndef RF_ARCH

#error "RF_ARCH should always be defined"

#endif


namespace RooBatchCompute {

namespace RF_ARCH {


namespace {


/// Initialise the bookkeeping fields of a Batches object.
///
/// \param batches   Object to fill; its `args` member is not touched here.
/// \param output    Pointer stored as the output location.
/// \param nEvents   Number of events stored in `batches.nEvents`.
/// \param nBatches  Number of input batches stored in `batches.nBatches`.
/// \param extraArgs Extra arguments; their data pointer and count are stored.
void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, ArgSpan extraArgs)
{
   // Where the results go and how much work there is.
   batches.output = output;
   batches.nEvents = nEvents;
   batches.nBatches = nBatches;

   // Extra parameters made available to the compute function.
   batches.extra = extraArgs.data();
   batches.nExtra = extraArgs.size();
}


/// Point each Batch in `arrays` at the data of the corresponding entry of `vars`.
///
/// A Batch is flagged as a vector when its input span is empty or holds at
/// least `nEvents` values; otherwise the flag is cleared.
/// `arrays` must provide at least `vars.size()` elements.
void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents)
{
   std::size_t slot = 0;
   for (auto const &var : vars) {
      Batch &batch = arrays[slot++];
      batch._array = var.data();
      batch._isVector = var.empty() || var.size() >= nEvents;
   }
}


/// Move a Batches object forward by `nEvents` events.
///
/// The array pointer of every argument flagged as a vector is shifted by
/// `nEvents`; arguments that are not vectors stay where they are (the shift is
/// multiplied by the `_isVector` flag). The output pointer is always shifted.
inline void advance(Batches &batches, std::size_t nEvents)
{
   Batch *const firstArg = batches.args;
   Batch *const endArg = firstArg + batches.nBatches;
   for (Batch *arg = firstArg; arg != endArg; ++arg) {
      arg->_array += arg->_isVector * nEvents;
   }
   batches.output += nEvents;
}


} // namespace


/// Forward declaration of the function returning the table of compute functions.
/// It is not defined in this file; the constructor of RooBatchComputeClass calls it
/// to fill `_computeFunctions`, which is later indexed with the `Computer` value.
std::vector<void (*)(Batches &)> getFunctions();


/// CPU implementation of RooBatchComputeInterface for the architecture selected
/// by the RF_ARCH macro.
///
/// The computation, reduction and buffer-manager entry points are implemented in
/// this file. Every CUDA-related member throws std::bad_function_call.
class RooBatchComputeClass : public RooBatchComputeInterface {
public:
   /// Fetch the table of compute functions and register this object for CPU dispatch.
   RooBatchComputeClass() : _computeFunctions(getFunctions())
   {
      // Set the dispatch pointer to this instance of the library upon loading
      dispatchCPU = this;
   }

   /// The architecture enumerator named by RF_ARCH.
   Architecture architecture() const override { return Architecture::RF_ARCH; }

   /// The stringified RF_ARCH macro, converted to lower case.
   std::string architectureName() const override
   {
      // transform to lower case to match the original architecture name passed to the compiler
      std::string name = _R_QUOTEVAL_(RF_ARCH);
      for (char &ch : name) {
         ch = std::tolower(static_cast<unsigned char>(ch));
      }
      return name;
   }

   // ---- Computation and reduction (defined out of class in this file) ----
   void compute(Config const &, Computer computer, std::span<double> output, VarSpan vars, ArgSpan extraArgs) override;
   double reduceSum(Config const &, InputArr input, size_t n) override;
   ReduceNLLOutput reduceNLL(Config const &, std::span<const double> probas, std::span<const double> weights,
                             std::span<const double> offsetProbas) override;

   std::unique_ptr<AbsBufferManager> createBufferManager() const override;

   // ---- CUDA interface: every member below unconditionally throws ----
   CudaInterface::CudaEvent *newCudaEvent(bool) const override { throw std::bad_function_call(); }
   CudaInterface::CudaStream *newCudaStream() const override { throw std::bad_function_call(); }
   void deleteCudaEvent(CudaInterface::CudaEvent *) const override { throw std::bad_function_call(); }
   void deleteCudaStream(CudaInterface::CudaStream *) const override { throw std::bad_function_call(); }
   void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
   {
      throw std::bad_function_call();
   }
   void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
   {
      throw std::bad_function_call();
   }
   bool cudaStreamIsActive(CudaInterface::CudaStream *) const override { throw std::bad_function_call(); }

private:
#ifdef ROOBATCHCOMPUTE_USE_IMT
   /// Multi-threaded variant of compute(), only built when ROOBATCHCOMPUTE_USE_IMT is defined.
   void computeIMT(Computer computer, std::span<double> output, VarSpan vars, ArgSpan extraArgs);
#endif

   /// Table of compute functions, indexed with the `Computer` value passed to compute().
   const std::vector<void (*)(Batches &)> _computeFunctions;
};


#ifdef ROOBATCHCOMPUTE_USE_IMT

/// Multi-threaded evaluation: split the events into contiguous chunks and
/// evaluate each chunk in its own task via ROOT::Internal::TExecutor::Map.
///
/// Each task builds its own Batches object, moves it to the start of its
/// chunk, and then evaluates the chunk in slices of at most `bufferSize` events.
/// Nothing is done when `output` is empty.
void RooBatchComputeClass::computeIMT(Computer computer, std::span<double> output, VarSpan vars, ArgSpan extraArgs)
{
   const std::size_t nEvents = output.size();
   if (nEvents == 0) {
      return;
   }

   ROOT::Internal::TExecutor ex;
   const std::size_t poolSize = ex.GetPoolSize();

   // Events per task, rounded up so that poolSize tasks cover everything.
   const std::size_t chunkSize = nEvents / poolSize + (nEvents % poolSize > 0);

   // Number of tasks actually needed for that chunk size (may be below poolSize).
   const std::size_t nTasks = nEvents / chunkSize + (nEvents % chunkSize > 0);

   auto task = [&](std::size_t idx) -> int {
      // Task-local Batches object positioned at the first event of chunk `idx`.
      Batches batches;
      std::vector<Batch> arrays(vars.size());
      fillBatches(batches, output.data(), chunkSize, vars.size(), extraArgs);
      fillArrays(arrays, vars, nEvents);
      batches.args = arrays.data();

      const std::size_t firstEvent = chunkSize * idx;
      advance(batches, firstEvent);

      // The last task gets whatever is left; all others get a full chunk.
      std::size_t remaining = (idx == nTasks - 1) ? nEvents - firstEvent : chunkSize;

      // Full slices of bufferSize events, followed by the final (possibly shorter) slice.
      for (batches.nEvents = bufferSize; remaining > bufferSize; remaining -= bufferSize) {
         _computeFunctions[computer](batches);
         advance(batches, bufferSize);
      }
      batches.nEvents = remaining;
      _computeFunctions[computer](batches);
      return 0;
   };

   // One task index per chunk: 0, 1, ..., nTasks - 1.
   std::vector<std::size_t> indices(nTasks);
   for (std::size_t i = 0; i < nTasks; ++i) {
      indices[i] = i;
   }
   ex.Map(task, indices);
}

#endif


/** Compute multiple values using optimized functions.
This method creates a Batches object and passes it to the correct compute function,
evaluating the output in slices of at most `bufferSize` events.
If the library was built with ROOBATCHCOMPUTE_USE_IMT and Implicit Multithreading is
enabled, the events to be processed are instead divided among tasks that are computed
in parallel by computeIMT(), and this method returns right after that call.
\param computer An enum specifying the compute function to be used.
\param output The array where the computation results are stored.
\param vars A std::span containing pointers to the variables involved in the computation.
\param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
void RooBatchComputeClass::compute(Config const &, Computer computer, std::span<double> output, VarSpan vars,
                                   ArgSpan extraArgs)
{
   // In the original implementation of this library, the evaluation was done
   // multi-threaded if implicit multi-threading was enabled in ROOT with
   // ROOT::EnableImplicitMT().
   //
   // However, this multithreaded mode was not carefully validated and is
   // therefore not production ready. One would first have to study the
   // overhead for different numbers of cores, number of events, and model
   // complexity. Then, we should only consider implicit multithreading here if
   // there is no performance penalty for any scenario, to not surprise the
   // users with unexpected slowdowns!
   //
   // Note that the priority of investigating this is not high, because RooFit
   // R & D efforts currently go in the direction of parallelization at the
   // level of the gradient components, or achieving single-threaded speedup
   // with automatic differentiation. Furthermore, the single-threaded
   // performance of the new CPU evaluation backend with the RooBatchCompute
   // library, is generally much faster than the legacy evaluation backend
   // already, even if the latter uses multi-threading.
#ifdef ROOBATCHCOMPUTE_USE_IMT
   if (ROOT::IsImplicitMTEnabled()) {
      computeIMT(computer, output, vars, extraArgs);
      // The parallel path has handled the whole output; without this return
      // the single-threaded code below would evaluate everything a second time.
      return;
   }
#endif

   std::size_t nEvents = output.size();

   // Set up a single Batches object that covers all events.
   Batches batches;
   std::vector<Batch> arrays(vars.size());
   fillBatches(batches, output.data(), nEvents, vars.size(), extraArgs);
   fillArrays(arrays, vars, nEvents);
   batches.args = arrays.data();

   // Evaluate full slices of bufferSize events, advancing the input and output
   // pointers after each one ...
   std::size_t events = batches.nEvents;
   batches.nEvents = bufferSize;
   while (events > bufferSize) {
      _computeFunctions[computer](batches);
      advance(batches, bufferSize);
      events -= bufferSize;
   }

   // ... and finish with the remaining (at most bufferSize) events.
   batches.nEvents = events;
   _computeFunctions[computer](batches);
}


namespace {


/// Take the logarithm of a probability and classify problematic inputs.
///
/// The matching counter in `out` is incremented for NaN, non-positive and
/// infinite inputs.
///
/// \return A pair whose first element is the log value (or the NaN itself for
///         NaN input) and whose second element is an error measure:
///         `RooNaNPacker::unpackNaN(prob)` for NaN input, `-prob` for
///         non-positive input, and 0 otherwise.
inline std::pair<double, double> getLog(double prob, ReduceNLLOutput &out)
{
   // NaN compares false against everything, so testing it first does not
   // change which branch any input takes.
   if (std::isnan(prob)) {
      ++out.nNaNValues;
      return {prob, RooNaNPacker::unpackNaN(prob)};
   }

   if (prob <= 0.0) {
      ++out.nNonPositiveValues;
      return {std::log(prob), -prob};
   }

   // Only +infinity can reach this point; it is counted but still evaluated.
   if (std::isinf(prob)) {
      ++out.nInfiniteValues;
   }

   return {std::log(prob), 0.0};
}


} // namespace


/// Sum the `n` values starting at `input` with `ROOT::Math::KahanSum<double, 4u>`
/// and return the resulting sum.
double RooBatchComputeClass::reduceSum(Config const &, InputArr input, size_t n)
{
   using Accumulator = ROOT::Math::KahanSum<double, 4u>;
   const Accumulator total = Accumulator::Accumulate(input, input + n);
   return total.Sum();
}


/// Accumulate the weighted negative log-likelihood over all entries of `weights`.
///
/// \param probas       Probabilities; when the span has exactly one element it is
///                     used for every entry, otherwise it is indexed per entry.
/// \param weights      Per-entry weights; entries with weight exactly 0 are skipped.
/// \param offsetProbas When non-empty, the log of the matching element is
///                     subtracted from each term before weighting.
/// \return The Kahan sum and its carry, plus the counters filled by getLog().
///         If any entry contributed a non-zero error measure, `nllSum` is
///         instead a NaN carrying the accumulated measure, and the carry is 0.
ReduceNLLOutput RooBatchComputeClass::reduceNLL(Config const &, std::span<const double> probas,
                                                std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;
   double badness = 0.0;
   ROOT::Math::KahanSum<double> nllSum;

   const bool singleProba = probas.size() == 1;
   const bool subtractOffset = !offsetProbas.empty();
   const std::size_t nEntries = weights.size();

   for (std::size_t i = 0; i < nEntries; ++i) {
      const double weight = weights[i];
      if (weight == 0.) {
         continue;
      }

      const auto [logProba, penalty] = getLog(singleProba ? probas[0] : probas[i], out);
      badness += penalty;

      double term = logProba;
      if (subtractOffset) {
         term -= std::log(offsetProbas[i]);
      }
      term *= -weight;

      nllSum.Add(term);
   }

   if (badness != 0.) {
      // Some events with evaluation errors. Return "badness" of errors.
      out.nllSum = RooNaNPacker::packFloatIntoNaN(badness);
      out.nllSumCarry = 0.0;
   } else {
      out.nllSum = nllSum.Sum();
      out.nllSumCarry = nllSum.Carry();
   }

   return out;
}


namespace {


/// Storage for exactly one double.
///
/// Host and device accessors all return the address of the same member, so the
/// value can be read and written through either set of pointers.
class ScalarBufferContainer {
public:
   ScalarBufferContainer() = default;

   /// \param size Must be 1.
   /// \throws std::runtime_error for any other size.
   explicit ScalarBufferContainer(std::size_t size)
   {
      if (size != 1)
         throw std::runtime_error("ScalarBufferContainer can only be of size 1");
   }

   double const *hostReadPtr() const { return &_val; }
   double const *deviceReadPtr() const { return &_val; }

   double *hostWritePtr() { return &_val; }
   double *deviceWritePtr() { return &_val; }

   /// Copy the first element of `input` into the stored value; `input` must not be empty.
   void assignFromHost(std::span<const double> input) { _val = input[0]; }
   /// Not supported by this container: always throws std::bad_function_call.
   void assignFromDevice(std::span<const double>) { throw std::bad_function_call(); }

private:
   // Zero-initialised so that reading before the first write is well defined.
   double _val = 0.0;
};


/// Storage for an array of doubles backed by a std::vector.
///
/// Only the host accessors are usable; the device accessors and
/// assignFromDevice() always throw std::bad_function_call.
class CPUBufferContainer {
public:
   /// Allocate a value-initialised vector holding `size` doubles.
   explicit CPUBufferContainer(std::size_t size) : _vec(size) {}

   double const *hostReadPtr() const { return _vec.data(); }
   /// Not supported by this container: always throws.
   double const *deviceReadPtr() const { throw std::bad_function_call(); }

   double *hostWritePtr() { return _vec.data(); }
   /// Not supported by this container: always throws.
   double *deviceWritePtr() { throw std::bad_function_call(); }

   /// Replace the contents with a copy of `input`; the vector takes the size of `input`.
   void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
   /// Not supported by this container: always throws.
   void assignFromDevice(std::span<const double>) { throw std::bad_function_call(); }

private:
   std::vector<double> _vec;
};


/// AbsBuffer implementation whose storage container is recycled through a queue.
///
/// On construction a container is taken from the front of the queue when one is
/// available (the requested size is then not used); otherwise a new container of
/// the requested size is created. On destruction the container is handed back to
/// the queue. The queue is held by reference and must outlive the buffer.
template <class Container>
class BufferImpl : public AbsBuffer {
public:
   using Queue = std::queue<std::unique_ptr<Container>>;

   BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
   {
      if (!_queue.empty()) {
         // Reuse a container that an earlier buffer returned.
         _vec = std::move(_queue.front());
         _queue.pop();
         return;
      }
      _vec = std::make_unique<Container>(size);
   }

   /// Return the container to the queue so that a later buffer can reuse it.
   ~BufferImpl() override { _queue.push(std::move(_vec)); }

   // Read access, forwarded to the container.
   double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
   double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }

   // Write access, forwarded to the container.
   double *hostWritePtr() override { return _vec->hostWritePtr(); }
   double *deviceWritePtr() override { return _vec->deviceWritePtr(); }

   // Assignment, forwarded to the container.
   void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
   void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }

   /// Direct access to the underlying container.
   Container &vec() { return *_vec; }

private:
   std::unique_ptr<Container> _vec;
   Queue &_queue;
};


// Concrete buffer types: BufferImpl instantiated for the single-value and the
// vector-backed container respectively.
using ScalarBuffer = BufferImpl<ScalarBufferContainer>;

using CPUBuffer = BufferImpl<CPUBufferContainer>;


/// Queues of recycled containers, one map per buffer type. BufferManager indexes
/// each map with the requested buffer size (always 1 for scalar buffers).
struct BufferQueuesMaps {

   std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;

   std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;

};


class BufferManager : public AbsBufferManager {


public:

   BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}


   std::unique_ptr<AbsBuffer> makeScalarBuffer() override

   {

      return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);

   }

   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override

   {

      return std::make_unique<CPUBuffer>(size, _queuesMaps->cpuBufferQueuesMap[size]);

   }

   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t) override { throw std::bad_function_call(); }

   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t, CudaInterface::CudaStream * = nullptr) override

   {

      throw std::bad_function_call();

   }


private:

   std::unique_ptr<BufferQueuesMaps> _queuesMaps;

};


} // namespace


std::unique_ptr<AbsBufferManager> RooBatchComputeClass::createBufferManager() const

{

   return std::make_unique<BufferManager>();

}


/// Static object to trigger the constructor which overwrites the dispatch pointer.
/// Constructing it runs RooBatchComputeClass(), which assigns `dispatchCPU = this`.

static RooBatchComputeClass computeObj;


} // End namespace RF_ARCH

} // End namespace RooBatchCompute

Batches.h

RConfig.hxx

_R_QUOTEVAL_
#define _R_QUOTEVAL_(string)
Definition RConfig.hxx:450

c
#define c(i)
Definition RSha256.hxx:101

RooBatchCompute.h

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

RooNaNPacker.h

TExecutor.hxx

input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Definition TGWin32VirtualXProxy.cxx:142

Util.h

ROOT::Internal::TExecutor
This class implements the interface to execute the same task multiple times, sequentially or in paral...
Definition TExecutor.hxx:37

ROOT::Math::KahanSum
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
Definition Util.h:141

ROOT::Math::KahanSum::Sum
T Sum() const
Definition Util.h:259

ROOT::Math::KahanSum::Accumulate
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
Definition Util.h:230

ROOT::Math::KahanSum::Carry
T Carry() const
Definition Util.h:269

ROOT::Math::KahanSum::Add
void Add(T x)
Single-element accumulation. Will not vectorise.
Definition Util.h:184

RooBatchCompute::Batches
Definition Batches.h:42

RooBatchCompute::Batches::nEvents
std::size_t nEvents
Definition Batches.h:46

RooBatchCompute::Batches::args
Batch * args
Definition Batches.h:44

RooBatchCompute::Config
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
Definition RooBatchCompute.h:55

RooBatchCompute::CudaInterface::CudaEvent
Definition CudaInterface.h:43

RooBatchCompute::CudaInterface::CudaStream
Definition CudaInterface.h:58

RooBatchCompute::RF_ARCH::RooBatchComputeClass
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a CPU spec...
Definition RooBatchCompute.cxx:83

RooBatchCompute::RF_ARCH::RooBatchComputeClass::RooBatchComputeClass
RooBatchComputeClass()
Definition RooBatchCompute.cxx:85

RooBatchCompute::RF_ARCH::RooBatchComputeClass::architectureName
std::string architectureName() const override
Definition RooBatchCompute.cxx:92

RooBatchCompute::RF_ARCH::RooBatchComputeClass::cudaEventRecord
void cudaEventRecord(CudaInterface::CudaEvent *, CudaInterface::CudaStream *) const override
Definition RooBatchCompute.cxx:111

RooBatchCompute::RF_ARCH::RooBatchComputeClass::compute
void compute(Config const &, Computer computer, std::span< double > output, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using optimized functions.
Definition RooBatchCompute.cxx:187

RooBatchCompute::RF_ARCH::RooBatchComputeClass::_computeFunctions
const std::vector< void(*)(Batches &)> _computeFunctions
Definition RooBatchCompute.cxx:126

RooBatchCompute::RF_ARCH::RooBatchComputeClass::newCudaEvent
CudaInterface::CudaEvent * newCudaEvent(bool) const override
Definition RooBatchCompute.cxx:107

RooBatchCompute::RF_ARCH::RooBatchComputeClass::deleteCudaEvent
void deleteCudaEvent(CudaInterface::CudaEvent *) const override
Definition RooBatchCompute.cxx:109

RooBatchCompute::RF_ARCH::RooBatchComputeClass::reduceSum
double reduceSum(Config const &, InputArr input, size_t n) override
Definition RooBatchCompute.cxx:258

RooBatchCompute::RF_ARCH::RooBatchComputeClass::deleteCudaStream
void deleteCudaStream(CudaInterface::CudaStream *) const override
Definition RooBatchCompute.cxx:110

RooBatchCompute::RF_ARCH::RooBatchComputeClass::createBufferManager
std::unique_ptr< AbsBufferManager > createBufferManager() const override
Definition RooBatchCompute.cxx:417

RooBatchCompute::RF_ARCH::RooBatchComputeClass::newCudaStream
CudaInterface::CudaStream * newCudaStream() const override
Definition RooBatchCompute.cxx:108

RooBatchCompute::RF_ARCH::RooBatchComputeClass::cudaStreamIsActive
bool cudaStreamIsActive(CudaInterface::CudaStream *) const override
Definition RooBatchCompute.cxx:119

RooBatchCompute::RF_ARCH::RooBatchComputeClass::architecture
Architecture architecture() const override
Definition RooBatchCompute.cxx:91

RooBatchCompute::RF_ARCH::RooBatchComputeClass::reduceNLL
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
Definition RooBatchCompute.cxx:263

RooBatchCompute::RF_ARCH::RooBatchComputeClass::cudaStreamWaitForEvent
void cudaStreamWaitForEvent(CudaInterface::CudaStream *, CudaInterface::CudaEvent *) const override
Definition RooBatchCompute.cxx:115

RooBatchCompute::RooBatchComputeInterface
The interface which should be implemented to provide optimised computation functions for implementati...
Definition RooBatchCompute.h:169

n
const Int_t n
Definition legend1.C:16

ex
Double_t ex[n]
Definition legend1.C:17

ROOT::Internal::VecOps::void
void(off) SmallVectorTemplateBase< T

ROOT::IsImplicitMTEnabled
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:669

RooBatchCompute::RF_ARCH
Definition ComputeFunctions.cxx:45

RooBatchCompute::RF_ARCH::getFunctions
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
Definition ComputeFunctions.cxx:920

RooBatchCompute::RF_ARCH::computeObj
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Definition RooBatchCompute.cxx:423

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition RooBatchCompute.h:37

RooBatchCompute::ArgSpan
std::span< double > ArgSpan
Definition RooBatchCompute.h:45

RooBatchCompute::Architecture
Architecture
Definition RooBatchCompute.h:65

RooBatchCompute::dispatchCPU
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Definition RooBatchCompute.h:199

RooBatchCompute::bufferSize
constexpr std::size_t bufferSize
Definition RooBatchCompute.h:48

RooBatchCompute::InputArr
const double *__restrict InputArr
Definition RooBatchCompute.h:46

RooBatchCompute::VarSpan
std::span< const std::span< const double > > VarSpan
Definition RooBatchCompute.h:44

RooBatchCompute::Computer
Computer
Definition RooBatchCompute.h:74

RooBatchCompute::ReduceNLLOutput
Definition RooBatchCompute.h:116

RooBatchCompute::ReduceNLLOutput::nInfiniteValues
std::size_t nInfiniteValues
Definition RooBatchCompute.h:119

RooBatchCompute::ReduceNLLOutput::nNaNValues
std::size_t nNaNValues
Definition RooBatchCompute.h:121

RooBatchCompute::ReduceNLLOutput::nllSumCarry
double nllSumCarry
Definition RooBatchCompute.h:118

RooBatchCompute::ReduceNLLOutput::nNonPositiveValues
std::size_t nNonPositiveValues
Definition RooBatchCompute.h:120

RooBatchCompute::ReduceNLLOutput::nllSum
double nllSum
Definition RooBatchCompute.h:117

RooNaNPacker::packFloatIntoNaN
static double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
Definition RooNaNPacker.h:109

RooNaNPacker::unpackNaN
static float unpackNaN(double val)
If val is NaN and this NaN has been tagged as containing a payload, unpack the float from the manti...
Definition RooNaNPacker.h:123