doc/v628/RooBatchCompute_8cxx_source.html

/*

 * Project: RooFit

 * Authors:

 *   Emmanouil Michalainas, CERN, September 2020

 *

 * Copyright (c) 2021, CERN

 *

 * Redistribution and use in source and binary forms,

 * with or without modification, are permitted according to the terms

 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)

 */


/**

\file RooBatchCompute.cxx

\class RooBatchComputeClass

\ingroup Roobatchcompute


This file contains the code for CPU computations performed with the RooBatchCompute library.

**/


#include "RooBatchCompute.h"

#include "RooNaNPacker.h"

#include "RooVDTHeaders.h"

#include "Batches.h"


#include <ROOT/RConfig.hxx>

#include <ROOT/TExecutor.hxx>


#include <Math/Util.h>


#include <algorithm>

#include <sstream>

#include <stdexcept>


#ifndef RF_ARCH

#error "RF_ARCH should always be defined"

#endif


namespace RooBatchCompute {

namespace RF_ARCH {


/// Returns the table of compute-function pointers that RooBatchComputeClass stores in
/// `_computeFunctions` and indexes with a `Computer` value.
/// Only declared here; the definition is not part of this file.
std::vector<void (*)(BatchesHandle)> getFunctions();


/// This class overrides some RooBatchComputeInterface functions, for the

/// purpose of providing a CPU specific implementation of the library.

class RooBatchComputeClass : public RooBatchComputeInterface {

private:

   const std::vector<void (*)(BatchesHandle)> _computeFunctions;


public:

   RooBatchComputeClass() : _computeFunctions(getFunctions())

   {

      // Set the dispatch pointer to this instance of the library upon loading

      dispatchCPU = this;

   }


   Architecture architecture() const override { return Architecture::RF_ARCH; };

   std::string architectureName() const override

   {

      // transform to lower case to match the original architecture name passed to the compiler

#ifdef _QUOTEVAL_ // to quote the value of the preprocessor macro instead of the name

#error "It's unexpected that _QUOTEVAL_ is defined at this point!"

#endif

#define _QUOTEVAL_(x) _QUOTE_(x)

      std::string out = _QUOTEVAL_(RF_ARCH);

#undef _QUOTEVAL_

      std::transform(out.begin(), out.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });

      ;

      return out;

   };


   /** Compute multiple values using optimized functions.

   This method creates a Batches object and passes it to the correct compute function.

   In case Implicit Multithreading is enabled, the events to be processed are equally

   divided among the tasks to be generated and computed in parallel.

   \param computer An enum specifying the compute function to be used.

   \param output The array where the computation results are stored.

   \param nEvents The number of events to be processed.

   \param vars A std::vector containing pointers to the variables involved in the computation.

   \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/

   void compute(cudaStream_t *, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars,

                ArgVector &extraArgs) override

   {

      static std::vector<double> buffer;

      buffer.resize(vars.size() * bufferSize);


      if (ROOT::IsImplicitMTEnabled()) {

         ROOT::Internal::TExecutor ex;

         std::size_t nThreads = ex.GetPoolSize();


         std::size_t nEventsPerThread = nEvents / nThreads + (nEvents % nThreads > 0);


         // Reset the number of threads to the number we actually need given nEventsPerThread

         nThreads = nEvents / nEventsPerThread + (nEvents % nEventsPerThread > 0);


         auto task = [&](std::size_t idx) -> int {

            // Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads

            // Then advance every object but the first to split the work between threads

            Batches batches(output, nEventsPerThread, vars, extraArgs, buffer.data());

            batches.advance(batches.getNEvents() * idx);


            // Set the number of events of the last Batches object as the remaining events

            if (idx == nThreads - 1) {

               batches.setNEvents(nEvents - idx * batches.getNEvents());

            }


            std::size_t events = batches.getNEvents();

            batches.setNEvents(bufferSize);

            while (events > bufferSize) {

               _computeFunctions[computer](batches);

               batches.advance(bufferSize);

               events -= bufferSize;

            }

            batches.setNEvents(events);

            _computeFunctions[computer](batches);

            return 0;

         };


         std::vector<std::size_t> indices(nThreads);

         for (unsigned int i = 1; i < nThreads; i++) {

            indices[i] = i;

         }

         ex.Map(task, indices);

      } else {

         // Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads

         // Then advance every object but the first to split the work between threads

         Batches batches(output, nEvents, vars, extraArgs, buffer.data());


         std::size_t events = batches.getNEvents();

         batches.setNEvents(bufferSize);

         while (events > bufferSize) {

            _computeFunctions[computer](batches);

            batches.advance(bufferSize);

            events -= bufferSize;

         }

         batches.setNEvents(events);

         _computeFunctions[computer](batches);

      }

   }

   /// Return the sum of an input array

   double reduceSum(cudaStream_t *, InputArr input, size_t n) override;

   ReduceNLLOutput reduceNLL(cudaStream_t *, RooSpan<const double> probas, RooSpan<const double> weightSpan,

                             RooSpan<const double> weights, double weightSum,

                             RooSpan<const double> binVolumes) override;

}; // End class RooBatchComputeClass


namespace {


/// Take the logarithm of a probability while keeping track of problematic inputs.
///
/// The counters in `out` are incremented for NaN inputs, for inputs whose magnitude
/// exceeds 1e6, and for inputs that are zero or negative.
/// \param prob The probability value.
/// \param out Output structure whose counters are updated.
/// \return A pair of (value to use as the logarithm, penalty). The penalty is `-prob` for
///         non-positive inputs, the payload unpacked from the NaN for NaN inputs, and zero otherwise.
///         For NaN inputs the first element is the NaN itself.
inline std::pair<double, double> getLog(double prob, ReduceNLLOutput &out)
{
   // A NaN fails both comparisons below, so it can be dealt with up front
   if (std::isnan(prob)) {
      ++out.nNaNValues;
      return {prob, RooNaNPacker::unpackNaN(prob)};
   }

   const bool hugeMagnitude = std::abs(prob) > 1e6;
   if (hugeMagnitude) {
      ++out.nLargeValues;
   }

   double penalty = 0.0;
   if (prob <= 0.0) {
      ++out.nNonPositiveValues;
      penalty = -prob;
   }

   return {std::log(prob), penalty};
}


} // namespace


/// Sum the first `n` elements of `input` with a Kahan summation that uses four accumulators.
/// The (unnamed) cudaStream_t pointer argument is not used.
double RooBatchComputeClass::reduceSum(cudaStream_t *, InputArr input, size_t n)
{
   using Summation = ROOT::Math::KahanSum<double, 4u>;

   const double *const end = input + n;
   const Summation total = Summation::Accumulate(input, end);
   return total.Sum();
}


/// Reduce per-event probabilities to a negative log-likelihood sum.
///
/// Every event with a non-zero weight contributes `-weight * log(prob)` to a Kahan sum. If
/// `binVolumes` is not empty, `log(weights[i]) - log(binVolumes[i]) - log(weightSum)` is subtracted
/// from the logarithm before it is weighted. Problematic probabilities are counted in the returned
/// structure (see getLog()); if they accumulate a non-zero penalty, the sum is replaced by a NaN
/// into which that penalty is packed.
/// The (unnamed) cudaStream_t pointer argument is not used.
/// \param probas The probability of each event.
/// \param weightSpan The event weights. If it holds at most one element, element 0 is used for all events.
/// \param weights Per-event values that enter the bin-volume correction. Only read if `binVolumes` is not empty.
/// \param weightSum The value whose logarithm enters the bin-volume correction.
/// \param binVolumes Per-event bin volumes. An empty span disables the correction.
/// \return The sum and the counters of problematic probabilities.
ReduceNLLOutput RooBatchComputeClass::reduceNLL(cudaStream_t *, RooSpan<const double> probas,
                                                RooSpan<const double> weightSpan, RooSpan<const double> weights,
                                                double weightSum, RooSpan<const double> binVolumes)
{
   ReduceNLLOutput out;

   double badness = 0.0;

   // Nothing below depends on the event index, so evaluate it once instead of once per event
   const bool hasPerEventWeights = weightSpan.size() > 1;
   const bool applyBinCorrection = !binVolumes.empty();
   const double logWeightSum = applyBinCorrection ? std::log(weightSum) : 0.0;

   for (std::size_t i = 0; i < probas.size(); ++i) {

      const double eventWeight = hasPerEventWeights ? weightSpan[i] : weightSpan[0];

      // Events without weight contribute nothing, and are not checked for bad probabilities either
      if (0. == eventWeight)
         continue;

      const std::pair<double, double> logOut = getLog(probas[i], out);
      double term = logOut.first;
      badness += logOut.second;

      if (applyBinCorrection) {
         term -= std::log(weights[i]) - std::log(binVolumes[i]) - logWeightSum;
      }

      term *= -eventWeight;

      out.nllSum.Add(term);
   }

   if (badness != 0.) {
      // Some events with evaluation errors. Return "badness" of errors.
      out.nllSum = ROOT::Math::KahanSum<double>(RooNaNPacker::packFloatIntoNaN(badness));
   }

   return out;
}


/// Static object to trigger the constructor which overwrites the dispatch pointer.
/// The RooBatchComputeClass constructor assigns `dispatchCPU = this`, so defining this object
/// is what makes the implementation in this file reachable through `dispatchCPU`.
static RooBatchComputeClass computeObj;


/** Construct a Batches object
\param output The array where the computation results are stored.
\param nEvents The number of events to be processed.
\param vars A std::vector containing pointers to the variables involved in the computation.
\param extraArgs An optional std::vector containing extra double values that may participate in the computation.
\param buffer A 2D array that is used as a buffer for scalar variables. It must provide one row of
`bufferSize` doubles for each element of `vars`, and it must not be null if any span in `vars` is scalar.
\throws std::runtime_error if a span in `vars` is empty, or if a span is scalar but `buffer` is null.

For every scalar parameter a buffer (one row of the buffer) is filled with copies of the scalar
value, so that it behaves as a batch and facilitates auto-vectorization. The Batches object can be
passed by value to a compute function to perform efficient computations. **/
Batches::Batches(RestrictArr output, size_t nEvents, const VarVector &vars, ArgVector &extraArgs, double *buffer)
   : _extraArgs{extraArgs.data()},
     _nEvents(nEvents),
     _nBatches(vars.size()),
     _nExtraArgs(extraArgs.size()),
     _output(output)
{
   _arrays.resize(vars.size());
   for (size_t i = 0; i < vars.size(); i++) {
      const RooSpan<const double> &span = vars[i];

      if (span.empty()) {
         std::stringstream ss;
         ss << "The span number " << i << " passed to Batches::Batches() is empty!";
         throw std::runtime_error(ss.str());
      }

      if (span.size() > 1) {
         // More than one value: refer to the caller's data directly
         _arrays[i].set(span.data()[0], span.data(), true);
         continue;
      }

      // Exactly one value: expand it into row i of the buffer. The declaration lets the buffer
      // default to a null pointer, which must not be written through.
      if (buffer == nullptr) {
         std::stringstream ss;
         ss << "The span number " << i << " passed to Batches::Batches() is scalar, but no buffer was provided!";
         throw std::runtime_error(ss.str());
      }
      double *const row = &buffer[i * bufferSize];
      std::fill_n(row, bufferSize, span.data()[0]);
      _arrays[i].set(span.data()[0], row, false);
   }
}


} // End namespace RF_ARCH

} // End namespace RooBatchCompute

Batches.h

RConfig.hxx

c
#define c(i)
Definition RSha256.hxx:101

_QUOTEVAL_
#define _QUOTEVAL_(x)

RooBatchCompute.h

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

RooNaNPacker.h

RooVDTHeaders.h

TExecutor.hxx

data
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void data
Definition TGWin32VirtualXProxy.cxx:104

input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Definition TGWin32VirtualXProxy.cxx:142

Util.h

ROOT::Internal::TExecutor
This class implements the interface to execute the same task multiple times, sequentially or in paral...
Definition TExecutor.hxx:38

ROOT::Internal::TExecutor::GetPoolSize
unsigned GetPoolSize() const
Return the number of pooled workers.
Definition TExecutor.hxx:309

ROOT::Math::KahanSum
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
Definition Util.h:122

ROOT::Math::KahanSum::Accumulate
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
Definition Util.h:211

ROOT::Math::KahanSum::Add
void Add(T x)
Single-element accumulation. Will not vectorise.
Definition Util.h:165

RooBatchCompute::RF_ARCH::Batches
Definition Batches.h:69

RooBatchCompute::RF_ARCH::Batches::Batches
Batches(RestrictArr output, std::size_t nEvents, const VarVector &vars, ArgVector &extraArgs, double *buffer=nullptr)

RooBatchCompute::RF_ARCH::Batches::getNEvents
__roodevice__ std::size_t getNEvents() const
Definition Batches.h:99

RooBatchCompute::RF_ARCH::Batches::advance
void advance(std::size_t nEvents)
Definition Batches.h:105

RooBatchCompute::RF_ARCH::Batches::setNEvents
void setNEvents(std::size_t n)
Definition Batches.h:104

RooBatchCompute::RF_ARCH::RooBatchComputeClass
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a CPU spec...
Definition RooBatchCompute.cxx:46

RooBatchCompute::RF_ARCH::RooBatchComputeClass::RooBatchComputeClass
RooBatchComputeClass()
Definition RooBatchCompute.cxx:51

RooBatchCompute::RF_ARCH::RooBatchComputeClass::architectureName
std::string architectureName() const override
Definition RooBatchCompute.cxx:58

RooBatchCompute::RF_ARCH::RooBatchComputeClass::reduceNLL
ReduceNLLOutput reduceNLL(cudaStream_t *, RooSpan< const double > probas, RooSpan< const double > weightSpan, RooSpan< const double > weights, double weightSum, RooSpan< const double > binVolumes) override
Definition RooBatchCompute.cxx:175

RooBatchCompute::RF_ARCH::RooBatchComputeClass::reduceSum
double reduceSum(cudaStream_t *, InputArr input, size_t n) override
Return the sum of an input array.
Definition RooBatchCompute.cxx:170

RooBatchCompute::RF_ARCH::RooBatchComputeClass::compute
void compute(cudaStream_t *, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars, ArgVector &extraArgs) override
Compute multiple values using optimized functions.
Definition RooBatchCompute.cxx:81

RooBatchCompute::RF_ARCH::RooBatchComputeClass::architecture
Architecture architecture() const override
Definition RooBatchCompute.cxx:57

RooBatchCompute::RF_ARCH::RooBatchComputeClass::_computeFunctions
const std::vector< void(*)(BatchesHandle)> _computeFunctions
Definition RooBatchCompute.cxx:48

RooBatchCompute::RooBatchComputeInterface
The interface which should be implemented to provide optimised computation functions for implementati...
Definition RooBatchCompute.h:107

RooSpan
A simple container to hold a batch of data values.
Definition RooSpan.h:34

RooSpan::size
constexpr std::size_t size() const noexcept
Definition RooSpan.h:119

RooSpan::data
constexpr std::span< T >::pointer data() const
Definition RooSpan.h:102

RooSpan::empty
constexpr bool empty() const noexcept
Definition RooSpan.h:124

n
const Int_t n
Definition legend1.C:16

ex
Double_t ex[n]
Definition legend1.C:17

ROOT::IsImplicitMTEnabled
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:558

RooBatchCompute::RF_ARCH::computeObj
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Definition RooBatchCompute.cxx:212

RooBatchCompute::RF_ARCH::getFunctions
std::vector< void(*)(BatchesHandle)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
Definition ComputeFunctions.cxx:787

RooBatchCompute::RF_ARCH::BatchesHandle
Batches & BatchesHandle
Definition Batches.h:117

RooBatchCompute
Namespace for dispatching RooFit computations to various backends.
Definition BracketAdapters.h:24

RooBatchCompute::VarVector
std::vector< RooSpan< const double > > VarVector
Definition RooBatchComputeTypes.h:36

RooBatchCompute::Architecture
Architecture
Definition RooBatchCompute.h:40

RooBatchCompute::dispatchCPU
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
Definition RooBatchCompute.h:149

RooBatchCompute::bufferSize
constexpr std::size_t bufferSize
Definition Batches.h:38

RooBatchCompute::InputArr
const double *__restrict InputArr
Definition RooBatchComputeTypes.h:39

RooBatchCompute::ArgVector
std::vector< double > ArgVector
Definition RooBatchComputeTypes.h:37

RooBatchCompute::RestrictArr
double *__restrict RestrictArr
Definition RooBatchComputeTypes.h:38

RooBatchCompute::Computer
Computer
Definition RooBatchCompute.h:42

RooBatchCompute::ReduceNLLOutput
Definition RooBatchCompute.h:80

RooBatchCompute::ReduceNLLOutput::nNaNValues
std::size_t nNaNValues
Definition RooBatchCompute.h:84

RooBatchCompute::ReduceNLLOutput::nLargeValues
std::size_t nLargeValues
Definition RooBatchCompute.h:82

RooBatchCompute::ReduceNLLOutput::nllSum
ROOT::Math::KahanSum< double > nllSum
Definition RooBatchCompute.h:81

RooBatchCompute::ReduceNLLOutput::nNonPositiveValues
std::size_t nNonPositiveValues
Definition RooBatchCompute.h:83

RooNaNPacker::packFloatIntoNaN
__roodevice__ static __roohost__ double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
Definition RooNaNPacker.h:109

RooNaNPacker::unpackNaN
static float unpackNaN(double val)
If val is NaN and a this NaN has been tagged as containing a payload, unpack the float from the manti...
Definition RooNaNPacker.h:123

output
static void output()