doc/hackathon/TThreadExecutor_8hxx_source.html

// @(#)root/thread:$Id$

// Author: Xavier Valls March 2016


/*************************************************************************

 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers.               *

 * All rights reserved.                                                  *

 *                                                                       *

 * For the licensing terms see $ROOTSYS/LICENSE.                         *

 * For the list of contributors see $ROOTSYS/README/CREDITS.             *

 *************************************************************************/


#ifndef ROOT_TThreadExecutor

#define ROOT_TThreadExecutor


#include "RConfigure.h"


// exclude in case ROOT does not have IMT support

#ifndef R__USE_IMT

// No need to error out for dictionaries.

# if !defined(__ROOTCLING__) && !defined(G__DICTIONARY)

#  error "Cannot use ROOT::TThreadExecutor without defining R__USE_IMT."

# endif

#else


#include "ROOT/TExecutorCRTP.hxx"

#include "ROOT/TSeq.hxx"

#include "ROOT/TypeTraits.hxx" // InvokeResult

#include "RTaskArena.hxx"

#include "TError.h"


#include <functional> //std::function

#include <initializer_list>

#include <memory>

#include <numeric> //std::accumulate

#include <type_traits> //std::enable_if

#include <utility> //std::move

#include <vector>


namespace ROOT {


   /// Executor exposing Foreach, Map, MapReduce and Reduce entry points whose work is
   /// dispatched through the private ParallelFor / ParallelReduce backend hooks.
   /// The class derives from TExecutorCRTP<TThreadExecutor>, re-exports the base
   /// Map/MapReduce/Reduce overloads and adds chunked variants on top of them.
   /// Copy construction and copy assignment are deleted.
   class TThreadExecutor: public TExecutorCRTP<TThreadExecutor> {

      // Presumably lets the CRTP base reach the private MapImpl overloads below.
      friend TExecutorCRTP;


   public:


      /// \param nThreads Requested number of threads.
      ///        NOTE(review): the meaning of the default 0 is defined in the .cxx file - confirm there.
      explicit TThreadExecutor(UInt_t nThreads = 0u);


      // Not copyable.
      TThreadExecutor(const TThreadExecutor &) = delete;

      TThreadExecutor &operator=(const TThreadExecutor &) = delete;


      // Foreach
      //
      // Run `func` for its side effects only; no results are collected.
      // nChunks == 0 (the default) hands every element to ParallelFor on its own,
      // any other value groups the elements into at most nChunks tasks.

      template<class F>

      void Foreach(F func, unsigned nTimes, unsigned nChunks = 0);

      template<class F, class INTEGER>

      void Foreach(F func, ROOT::TSeq<INTEGER> args, unsigned nChunks = 0);

      template<class F, class T>

      void Foreach(F func, std::initializer_list<T> args, unsigned nChunks = 0);

      template<class F, class T>

      void Foreach(F func, std::vector<T> &args, unsigned nChunks = 0);

      template<class F, class T>

      void Foreach(F func, const std::vector<T> &args, unsigned nChunks = 0);


      // Map
      //
      // Keep the overloads declared in the base class visible next to the ones added here.

      using TExecutorCRTP<TThreadExecutor>::Map;


      // Extension of the Map interfaces with chunking, specific to this class.
      // Each chunk is reduced with `redfunc`, so the returned vector holds one entry per chunk.

      template <class F, class R, class Cond = validMapReturnCond<F>>

      auto Map(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F>>;

      template <class F, class INTEGER, class R, class Cond = validMapReturnCond<F, INTEGER>>

      auto Map(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)

         -> std::vector<InvokeResult_t<F, INTEGER>>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto Map(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto Map(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto Map(F func, const std::vector<T> &args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;


      // MapReduce

      //

      // We need to reimplement the MapReduce interfaces to allow for parallel reduction, defined in

      // this class but not in the base class.

      //

      // The trailing return types also check at compile time that func is compatible with the

      // type of the arguments.

      // A static_assert in TThreadExecutor::Reduce checks that redfunc is compatible with the type returned by func.

      using TExecutorCRTP<TThreadExecutor>::MapReduce;

      template <class F, class R, class Cond = validMapReturnCond<F>>

      auto MapReduce(F func, unsigned nTimes, R redfunc) -> InvokeResult_t<F>;

      template <class F, class R, class Cond = validMapReturnCond<F>>

      auto MapReduce(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> InvokeResult_t<F>;

      template <class F, class INTEGER, class R, class Cond = validMapReturnCond<F, INTEGER>>

      auto MapReduce(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, INTEGER>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto MapReduce(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto MapReduce(F func, std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto MapReduce(F func, const std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto MapReduce(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;

      template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>

      auto MapReduce(F func, const std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;


      // Reduce
      //
      // The two overloads are told apart by their trailing return type: the first one is
      // viable when redfunc can be called with the whole vector, the second one when it can
      // be called with two elements (binary operation).

      using TExecutorCRTP<TThreadExecutor>::Reduce;

      template<class T, class R> auto Reduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs));

      template<class T, class BINARYOP> auto Reduce(const std::vector<T> &objs, BINARYOP redfunc) -> decltype(redfunc(objs.front(), objs.front()));


      /// Returns the number of worker threads in the task arena.
      unsigned GetPoolSize() const;


   private:

      // Implementation of the Map functions declared in the parent class (TExecutorCRTP).
      // Each overload returns one result per call / per element.

      template <class F, class Cond = validMapReturnCond<F>>

      auto MapImpl(F func, unsigned nTimes) -> std::vector<InvokeResult_t<F>>;

      template <class F, class INTEGER, class Cond = validMapReturnCond<F, INTEGER>>

      auto MapImpl(F func, ROOT::TSeq<INTEGER> args) -> std::vector<InvokeResult_t<F, INTEGER>>;

      template <class F, class T, class Cond = validMapReturnCond<F, T>>

      auto MapImpl(F func, std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>;

      template <class F, class T, class Cond = validMapReturnCond<F, T>>

      auto MapImpl(F func, const std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>;


      // Functions that interface with the parallel library used as a backend.
      // ParallelFor invokes `f` over the indices of a loop running from `start` to `end` in
      // increments of `step`; ParallelReduce is only declared for vectors of double and float.

      void   ParallelFor(unsigned start, unsigned end, unsigned step, const std::function<void(unsigned int i)> &f);

      double ParallelReduce(const std::vector<double> &objs, const std::function<double(double a, double b)> &redfunc);

      float  ParallelReduce(const std::vector<float> &objs, const std::function<float(float a, float b)> &redfunc);

      // Sequential counterpart: applies redfunc to the whole vector in one call.
      template<class T, class R>

      auto SeqReduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs));


      /// Pointer to the TBB task arena wrapper

      std::shared_ptr<ROOT::Internal::RTaskArenaWrapper> fTaskArenaW = nullptr;

   };


   /************ TEMPLATE METHODS IMPLEMENTATION ******************/


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function without arguments several times in parallel, dividing the execution in nChunks.
   ///
   /// \param func Function to be executed. It is called without arguments; nothing is collected from the calls.
   /// \param nTimes Number of times function should be called.
   /// \param nChunks Number of chunks to split the calls into. With 0 every call is handed to
   ///        ParallelFor individually.
   template<class F>
   void TThreadExecutor::Foreach(F func, unsigned nTimes, unsigned nChunks) {
      if (nChunks == 0) {
         ParallelFor(0U, nTimes, 1, [&](unsigned int){func();});
         return;
      }

      // With nothing to run, the ceiling division below would produce a step of zero.
      // Return early instead of passing a zero step on to ParallelFor.
      if (nTimes == 0)
         return;

      const unsigned step = (nTimes + nChunks - 1) / nChunks; // ceiling the division
      auto lambda = [&](unsigned int i)
      {
         // One task performs up to `step` calls; the last chunk may be shorter.
         for (unsigned j = 0; j < step && (i + j) < nTimes; j++) {
            func();
         }
      };
      ParallelFor(0U, nTimes, step, lambda);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over a sequence of indexes, dividing the execution in nChunks.
   ///
   /// \param func Function to be executed. Must take an element of the sequence passed as second argument as a parameter.
   /// \param args Sequence of indexes to execute `func` on.
   /// \param nChunks Number of chunks to split the input data for processing. With 0 every element
   ///        of the sequence is handed to ParallelFor individually.
   ///
   /// NOTE(review): the bounds and the step of the sequence are converted to `unsigned` before
   /// use, so sequences containing negative values are presumably not supported here - confirm.
   template<class F, class INTEGER>
   void TThreadExecutor::Foreach(F func, ROOT::TSeq<INTEGER> args, unsigned nChunks) {
      if (nChunks == 0) {
         ParallelFor(*args.begin(), *args.end(), args.step(), [&](unsigned int i){func(i);});
         return;
      }

      const unsigned start = *args.begin();
      const unsigned end = *args.end();
      const unsigned seqStep = args.step();

      // Empty range (or a degenerate step): nothing to do. This also keeps the
      // divisions below well defined and avoids a zero stride for ParallelFor.
      if (start >= end || seqStep == 0)
         return;

      // Work in units of sequence elements so that every chunk starts on a value that
      // actually belongs to the sequence: the stride between chunk starts has to be a
      // multiple of the sequence step, otherwise later chunks would visit wrong values.
      const unsigned nElements = (end - start + seqStep - 1) / seqStep;      // ceiling the division
      const unsigned elementsPerChunk = (nElements + nChunks - 1) / nChunks; // ceiling the division
      const unsigned chunkStride = elementsPerChunk * seqStep;

      auto lambda = [&](unsigned int chunkStart)
      {
         // Visit the elements of this chunk; the last chunk may be shorter.
         for (unsigned offset = 0; offset < chunkStride && (chunkStart + offset) < end; offset += seqStep) {
            func(chunkStart + offset);
         }
      };
      ParallelFor(start, end, chunkStride, lambda);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of an initializer_list, dividing the execution in nChunks.
   ///
   /// The list is copied into a local vector and the work is delegated to the vector overload.
   ///
   /// \param func Function to be executed on the elements of the initializer_list passed as second parameter.
   /// \param args initializer_list for a vector to apply `func` on.
   /// \param nChunks Number of chunks to split the input data for processing.
   template<class F, class T>
   void TThreadExecutor::Foreach(F func, std::initializer_list<T> args, unsigned nChunks)
   {
      // Materialise the list; the non-const lvalue selects the std::vector<T>& overload.
      std::vector<T> elements(args);
      Foreach(func, elements, nChunks);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of a vector, dividing the execution in nChunks.
   ///
   /// \param func Function to be executed on the elements of the vector passed as second parameter.
   /// \param args Vector of elements passed as an argument to `func`.
   /// \param nChunks Number of chunks to split the input data for processing. With 0 every element
   ///        is handed to ParallelFor individually.
   template<class F, class T>
   void TThreadExecutor::Foreach(F func, std::vector<T> &args, unsigned nChunks) {
      unsigned int nToProcess = args.size();
      if (nChunks == 0) {
         ParallelFor(0U, nToProcess, 1, [&](unsigned int i){func(args[i]);});
         return;
      }

      // An empty vector would make the ceiling division below yield a step of zero.
      // Return early instead of passing a zero step on to ParallelFor.
      if (nToProcess == 0)
         return;

      const unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
      auto lambda = [&](unsigned int i)
      {
         // One task handles up to `step` consecutive elements; the last chunk may be shorter.
         for (unsigned j = 0; j < step && (i + j) < nToProcess; j++) {
            func(args[i + j]);
         }
      };
      ParallelFor(0U, nToProcess, step, lambda);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of an immutable vector, dividing the execution in nChunks.
   ///
   /// \param func Function to be executed on the elements of the vector passed as second parameter.
   /// \param args Immutable vector of elements passed as an argument to `func`.
   /// \param nChunks Number of chunks to split the input data for processing. With 0 every element
   ///        is handed to ParallelFor individually.
   template<class F, class T>
   void TThreadExecutor::Foreach(F func, const std::vector<T> &args, unsigned nChunks) {
      unsigned int nToProcess = args.size();
      if (nChunks == 0) {
         ParallelFor(0U, nToProcess, 1, [&](unsigned int i){func(args[i]);});
         return;
      }

      // An empty vector would make the ceiling division below yield a step of zero.
      // Return early instead of passing a zero step on to ParallelFor.
      if (nToProcess == 0)
         return;

      const unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
      auto lambda = [&](unsigned int i)
      {
         // One task handles up to `step` consecutive elements; the last chunk may be shorter.
         for (unsigned j = 0; j < step && (i + j) < nToProcess; j++) {
            func(args[i + j]);
         }
      };
      ParallelFor(0U, nToProcess, step, lambda);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Call a function without arguments `nTimes` in parallel and collect one result per call.
   /// Implementation of the Map method.
   ///
   /// \copydetails TExecutorCRTP::Map(F func,unsigned nTimes)
   template <class F, class Cond>
   auto TThreadExecutor::MapImpl(F func, unsigned nTimes) -> std::vector<InvokeResult_t<F>>
   {
      using ResultType = decltype(func());

      // One pre-allocated slot per call; call number `slot` writes only results[slot].
      std::vector<ResultType> results(nTimes);
      ParallelFor(0U, nTimes, 1, [&](unsigned int slot) { results[slot] = func(); });

      return results;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to every element of a sequence of indexes in parallel, one result per element.
   /// Implementation of the Map method.
   ///
   /// \copydetails TExecutorCRTP::Map(F func,ROOT::TSeq<INTEGER> args)
   template <class F, class INTEGER, class Cond>
   auto TThreadExecutor::MapImpl(F func, ROOT::TSeq<INTEGER> args) -> std::vector<InvokeResult_t<F, INTEGER>>
   {
      using ResultType = decltype(func(*args.begin()));

      const auto nElements = args.size();
      std::vector<ResultType> results(nElements);

      // Position `pos` in the sequence is mapped onto results[pos].
      ParallelFor(0U, nElements, 1, [&](unsigned int pos) { results[pos] = func(args[pos]); });

      return results;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function `nTimes` in parallel, dividing the execution in nChunks and
   /// providing a result per chunk.
   ///
   /// With `nChunks == 0` the call is forwarded to the unchunked Map(func, nTimes). With
   /// `nTimes == 0` an empty vector is returned.
   ///
   /// \copydetails ROOT::Internal::TExecutor::Map(F func,unsigned nTimes,R redfunc,unsigned nChunks)
   template <class F, class R, class Cond>
   auto TThreadExecutor::Map(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F>>
   {
      if (nChunks == 0)
      {
         return Map(func, nTimes);
      }

      // Nothing to execute: the step computed below would be zero and the
      // chunk count would then divide by zero.
      if (nTimes == 0)
      {
         return {};
      }

      const unsigned step = (nTimes + nChunks - 1) / nChunks; // ceiling the division
      // Avoid empty chunks
      const unsigned actualChunks = (nTimes + step - 1) / step;
      using retType = decltype(func());
      std::vector<retType> reslist(actualChunks);
      auto lambda = [&](unsigned int i)
      {
         // The last chunk might be smaller than `step`.
         std::vector<retType> partialResults(std::min(nTimes-i, step));
         for (unsigned j = 0; j < partialResults.size(); j++) {
            partialResults[j] = func();
         }
         // `i` is the index of the first call of the chunk, so i / step is the chunk number.
         reslist[i / step] = Reduce(partialResults, redfunc);
      };
      ParallelFor(0U, nTimes, step, lambda);

      return reslist;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to every element of a vector in parallel, one result per element.
   /// Implementation of the Map method.
   ///
   /// \copydetails TExecutorCRTP::Map(F func,std::vector<T> &args)
   template <class F, class T, class Cond>
   auto TThreadExecutor::MapImpl(F func, std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>
   {
      // Also verifies at compile time that func can be called with an element.
      using ResultType = decltype(func(args.front()));

      const unsigned int nElements = args.size();
      std::vector<ResultType> results(nElements);

      // Element `idx` of the input is mapped onto results[idx].
      ParallelFor(0U, nElements, 1, [&](unsigned int idx) { results[idx] = func(args[idx]); });

      return results;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to every element of an immutable vector in parallel, one result per element.
   /// Implementation of the Map method.
   ///
   /// \copydetails TExecutorCRTP::Map(F func,const std::vector<T> &args)
   template <class F, class T, class Cond>
   auto TThreadExecutor::MapImpl(F func, const std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>
   {
      // Also verifies at compile time that func can be called with an element.
      using ResultType = decltype(func(args.front()));

      const unsigned int nElements = args.size();
      std::vector<ResultType> results(nElements);

      // Element `idx` of the input is mapped onto results[idx].
      ParallelFor(0U, nElements, 1, [&](unsigned int idx) { results[idx] = func(args[idx]); });

      return results;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of a sequence, dividing the execution in nChunks and
   /// providing a result per chunk.
   ///
   /// With `nChunks == 0` the call is forwarded to the unchunked Map(func, args). With an
   /// empty sequence an empty vector is returned.
   ///
   /// \copydetails ROOT::Internal::TExecutor::Map(F func,ROOT::TSeq<INTEGER> args,R redfunc,unsigned nChunks)
   template <class F, class INTEGER, class R, class Cond>
   auto TThreadExecutor::Map(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)
      -> std::vector<InvokeResult_t<F, INTEGER>>
   {
      if (nChunks == 0)
      {
         return Map(func, args);
      }

      const unsigned nToProcess = args.size();
      // Nothing to process: the step computed below would be zero and the
      // chunk count would then divide by zero.
      if (nToProcess == 0)
      {
         return {};
      }

      const unsigned step = (nToProcess + nChunks - 1) / nChunks; // ceiling the division
      // Avoid empty chunks
      const unsigned actualChunks = (nToProcess + step - 1) / step;

      using retType = decltype(func(*args.begin()));
      std::vector<retType> reslist(actualChunks);
      auto lambda = [&](unsigned int i) {
         std::vector<retType> partialResults(std::min(step, nToProcess - i)); // last chunk might be smaller
         for (unsigned j = 0; j < partialResults.size(); j++) {
            partialResults[j] = func(args[i + j]);
         }
         // `i` is the position of the first element of the chunk, so i / step is the chunk number.
         reslist[i / step] = Reduce(partialResults, redfunc);
      };

      ParallelFor(0U, nToProcess, step, lambda);

      return reslist;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of a vector, dividing the execution in nChunks and
   /// providing a result per chunk.
   ///
   /// With `nChunks == 0` the call is forwarded to the unchunked Map(func, args). With an
   /// empty vector an empty vector is returned.
   ///
   /// \copydetails ROOT::Internal::TExecutor::Map(F func,std::vector<T> &args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::Map(F func, std::vector<T> &args, R redfunc, unsigned nChunks)
      -> std::vector<InvokeResult_t<F, T>>
   {
      if (nChunks == 0)
      {
         return Map(func, args);
      }

      const unsigned int nToProcess = args.size();
      // Nothing to process: the step computed below would be zero and the
      // chunk count would then divide by zero.
      if (nToProcess == 0)
      {
         return {};
      }

      const unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
      // Avoid empty chunks
      const unsigned actualChunks = (nToProcess + step - 1) / step;

      using retType = decltype(func(args.front()));
      std::vector<retType> reslist(actualChunks);
      auto lambda = [&](unsigned int i) {
         std::vector<retType> partialResults(std::min(step, nToProcess - i)); // last chunk might be smaller
         for (unsigned j = 0; j < partialResults.size(); j++) {
            partialResults[j] = func(args[i + j]);
         }
         // `i` is the index of the first element of the chunk, so i / step is the chunk number.
         reslist[i / step] = Reduce(partialResults, redfunc);
      };

      ParallelFor(0U, nToProcess, step, lambda);

      return reslist;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of an immutable vector, dividing the execution in nChunks and
   /// providing a result per chunk.
   ///
   /// With `nChunks == 0` the call is forwarded to the unchunked Map(func, args). With an
   /// empty vector an empty vector is returned.
   ///
   /// \copydetails ROOT::Internal::TExecutor::Map(F func,const std::vector<T> &args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::Map(F func, const std::vector<T> &args, R redfunc, unsigned nChunks)
      -> std::vector<InvokeResult_t<F, T>>
   {
      if (nChunks == 0)
      {
         return Map(func, args);
      }

      const unsigned int nToProcess = args.size();
      // Nothing to process: the step computed below would be zero and the
      // chunk count would then divide by zero.
      if (nToProcess == 0)
      {
         return {};
      }

      const unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
      // Avoid empty chunks
      const unsigned actualChunks = (nToProcess + step - 1) / step;

      using retType = decltype(func(args.front()));
      std::vector<retType> reslist(actualChunks);
      auto lambda = [&](unsigned int i) {
         std::vector<retType> partialResults(std::min(step, nToProcess - i)); // last chunk might be smaller
         for (unsigned j = 0; j < partialResults.size(); j++) {
            partialResults[j] = func(args[i + j]);
         }
         // `i` is the index of the first element of the chunk, so i / step is the chunk number.
         reslist[i / step] = Reduce(partialResults, redfunc);
      };

      ParallelFor(0U, nToProcess, step, lambda);

      return reslist;
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of an initializer_list, dividing the execution in nChunks and
   /// providing a result per chunk.
   ///
   /// The list is copied into a local vector and the work is delegated to the chunked
   /// vector overload.
   ///
   /// \copydetails ROOT::Internal::TExecutor::Map(F func,std::initializer_list<T> args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::Map(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks)
      -> std::vector<InvokeResult_t<F, T>>
   {
      // Moving an initializer_list does not move its elements, so a plain copy is used.
      std::vector<T> vargs(args);
      // Return the result directly: binding it to a const reference first would force a
      // copy of the whole vector on return.
      return Map(func, vargs, redfunc, nChunks);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Call a function `nTimes` in parallel (Map) and combine the results into a single value (Reduce).
   /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,unsigned nTimes,R redfunc)
   template <class F, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, unsigned nTimes, R redfunc) -> InvokeResult_t<F>
   {
      // Map step: one result per call.
      const auto mapped = Map(func, nTimes);
      // Reduce step: fold them with redfunc.
      return Reduce(mapped, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Call a function `nTimes` in parallel (Map) and combine the results into a single value (Reduce).
   /// Benefits from partial reduction into `nChunks` intermediate results.
   ///
   /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,unsigned nTimes,R redfunc,unsigned nChunks)
   template <class F, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> InvokeResult_t<F>
   {
      // Map step: the chunked Map already reduces every chunk with redfunc.
      const auto perChunk = Map(func, nTimes, redfunc, nChunks);
      // Reduce step: combine the per-chunk values.
      return Reduce(perChunk, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of a sequence (Map) and combine the results into a single value (Reduce).
   /// Benefits from partial reduction into `nChunks` intermediate results.
   ///
   /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,ROOT::TSeq<INTEGER> args,R redfunc,unsigned nChunks)
   template <class F, class INTEGER, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)
      -> InvokeResult_t<F, INTEGER>
   {
      // Map step: the chunked Map already reduces every chunk with redfunc.
      const auto perChunk = Map(func, args, redfunc, nChunks);
      // Reduce step: combine the per-chunk values.
      return Reduce(perChunk, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Execute a function in parallel over the elements of an initializer_list (Map) and combine the results into a single value (Reduce).
   /// Benefits from partial reduction into `nChunks` intermediate results.
   ///
   /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,std::initializer_list<T> args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks)
      -> InvokeResult_t<F, T>
   {
      // Map step: the chunked Map already reduces every chunk with redfunc.
      const auto perChunk = Map(func, args, redfunc, nChunks);
      // Reduce step: combine the per-chunk values.
      return Reduce(perChunk, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to the elements of a vector in parallel (Map) and combine the results into a single value (Reduce).
   /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,std::vector<T> &args,R redfunc)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>
   {
      // Map step: one result per element.
      const auto mapped = Map(func, args);
      // Reduce step: fold them with redfunc.
      return Reduce(mapped, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to the elements of an immutable vector in parallel (Map) and combine the results into a single value (Reduce).
   /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,const std::vector<T> &args,R redfunc)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, const std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>
   {
      // Map step: one result per element.
      const auto mapped = Map(func, args);
      // Reduce step: fold them with redfunc.
      return Reduce(mapped, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to the elements of a vector in parallel (Map) and combine the results into a single value (Reduce).
   /// Benefits from partial reduction into `nChunks` intermediate results.
   ///
   /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,std::vector<T> &args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>
   {
      // Map step: the chunked Map already reduces every chunk with redfunc.
      const auto perChunk = Map(func, args, redfunc, nChunks);
      // Reduce step: combine the per-chunk values.
      return Reduce(perChunk, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief Apply a function to the elements of an immutable vector in parallel (Map) and combine the results into a single value (Reduce).
   /// Benefits from partial reduction into `nChunks` intermediate results.
   ///
   /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,const std::vector<T> &args,R redfunc,unsigned nChunks)
   template <class F, class T, class R, class Cond>
   auto TThreadExecutor::MapReduce(F func, const std::vector<T> &args, R redfunc, unsigned nChunks)
      -> InvokeResult_t<F, T>
   {
      // Map step: the chunked Map already reduces every chunk with redfunc.
      const auto perChunk = Map(func, args, redfunc, nChunks);
      // Reduce step: combine the per-chunk values.
      return Reduce(perChunk, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \copydoc ROOT::Internal::TExecutor::Reduce(const std::vector<T> &objs,R redfunc)
   template<class T, class R>
   auto TThreadExecutor::Reduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs))
   {
      // Compile-time check: reducing the whole vector must give back the element type.
      using ReducedType = decltype(redfunc(objs));
      static_assert(std::is_same<ReducedType, T>::value, "redfunc does not have the correct signature");

      // A reduction that consumes the whole vector is applied sequentially.
      return SeqReduce(objs, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////
   /// \brief "Reduce" an std::vector into a single object in parallel, using a binary
   /// function passed as the second argument as the reduction operation.
   ///
   /// \param objs A vector of elements to combine.
   /// \param redfunc Binary reduction function to combine the elements of the vector `objs`.
   ///        Calling it with two elements must yield the element type `T`.
   /// \return A value result of combining the vector elements into a single object of the same type.
   template<class T, class BINARYOP>
   auto TThreadExecutor::Reduce(const std::vector<T> &objs, BINARYOP redfunc) -> decltype(redfunc(objs.front(), objs.front()))
   {
      // Compile-time check: combining two elements must give back the element type.
      using CombinedType = decltype(redfunc(objs.front(), objs.front()));
      static_assert(std::is_same<CombinedType, T>::value, "redfunc does not have the correct signature");

      // Binary operations are delegated to the parallel backend.
      return ParallelReduce(objs, redfunc);
   }


   //////////////////////////////////////////////////////////////////////////

   /// \brief "Reduce", sequentially, an std::vector into a single object

   ///

   /// The whole vector is passed to `redfunc` in one direct call; neither ParallelFor
   /// nor ParallelReduce is involved.

   ///

   /// \param objs A vector of elements to combine.

   /// \param redfunc Reduction function to combine the elements of the vector `objs`.
   ///        It must be callable with the vector itself.

   /// \return Whatever `redfunc(objs)` returns.

   template<class T, class R>


   auto TThreadExecutor::SeqReduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs))

   {

      return redfunc(objs);

   }


} // namespace ROOT


#endif   // R__USE_IMT

#endif

b
#define b(i)
Definition RSha256.hxx:100

f
#define f(i)
Definition RSha256.hxx:104

a
#define a(i)
Definition RSha256.hxx:99

RTaskArena.hxx

start
start
Definition Rotated.cxx:223

UInt_t
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int).
Definition RtypesCore.h:60

TError.h

TExecutorCRTP.hxx

value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Definition TGWin32VirtualXProxy.cxx:142

TSeq.hxx

TypeTraits.hxx

ROOT::TExecutorCRTP< TThreadExecutor >::InvokeResult_t
ROOT::TypeTraits::InvokeResult_t< F, Args... > InvokeResult_t
Definition TExecutorCRTP.hxx:107

ROOT::TSeq
A pseudo container class which is a generator of indices.
Definition TSeq.hxx:67

ROOT::TSeq::begin
iterator begin() const
Definition TSeq.hxx:172

ROOT::TSeq::step
T step() const
Definition TSeq.hxx:193

ROOT::TSeq::end
iterator end() const
Definition TSeq.hxx:175

ROOT::TThreadExecutor::SeqReduce
auto SeqReduce(const std::vector< T > &objs, R redfunc) -> decltype(redfunc(objs))
"Reduce", sequentially, an std::vector into a single object
Definition TThreadExecutor.hxx:594

ROOT::TThreadExecutor::Map
auto Map(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> std::vector< InvokeResult_t< F > >
Execute a function nTimes in parallel, dividing the execution in nChunks and providing a result per c...
Definition TThreadExecutor.hxx:288

ROOT::TThreadExecutor::ParallelFor
void ParallelFor(unsigned start, unsigned end, unsigned step, const std::function< void(unsigned int i)> &f)
Execute a function in parallel over the indices of a loop.
Definition TThreadExecutor.cxx:160

ROOT::TThreadExecutor::GetPoolSize
unsigned GetPoolSize() const
Returns the number of worker threads in the task arena.
Definition TThreadExecutor.cxx:215

ROOT::TThreadExecutor::MapReduce
auto MapReduce(F func, unsigned nTimes, R redfunc) -> InvokeResult_t< F >
Execute a function nTimes in parallel (Map) and accumulate the results into a single value (Reduce).
Definition TThreadExecutor.hxx:481

ROOT::TThreadExecutor::fTaskArenaW
std::shared_ptr< ROOT::Internal::RTaskArenaWrapper > fTaskArenaW
Pointer to the TBB task arena wrapper.
Definition TThreadExecutor.hxx:133

ROOT::TThreadExecutor::Reduce
auto Reduce(const std::vector< T > &objs, R redfunc) -> decltype(redfunc(objs))
Definition TThreadExecutor.hxx:565

ROOT::TThreadExecutor::Foreach
void Foreach(F func, unsigned nTimes, unsigned nChunks=0)
Execute a function without arguments several times in parallel, dividing the execution in nChunks.
Definition TThreadExecutor.hxx:145

ROOT::TThreadExecutor::TExecutorCRTP
friend TExecutorCRTP
Definition TThreadExecutor.hxx:42

ROOT::TThreadExecutor::TThreadExecutor
TThreadExecutor(UInt_t nThreads=0u)
Class constructor.
Definition TThreadExecutor.cxx:148

ROOT::TThreadExecutor::ParallelReduce
double ParallelReduce(const std::vector< double > &objs, const std::function< double(double a, double b)> &redfunc)
"Reduce" in parallel an std::vector<double> into a single double value
Definition TThreadExecutor.cxx:182

ROOT::TThreadExecutor::operator=
TThreadExecutor & operator=(const TThreadExecutor &)=delete

ROOT::TThreadExecutor::TThreadExecutor
TThreadExecutor(const TThreadExecutor &)=delete

ROOT::TThreadExecutor::MapImpl
auto MapImpl(F func, unsigned nTimes) -> std::vector< InvokeResult_t< F > >
Execute a function without arguments several times in parallel.
Definition TThreadExecutor.hxx:253

F
#define F(x, y, z)

ROOT::R
namespace associated R package for ROOT.
Definition RExports.h:72

ROOT
Definition EExecutionPolicy.hxx:4