Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RooBatchCompute.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * Emmanouil Michalainas, CERN, September 2020
5 *
6 * Copyright (c) 2021, CERN
7 *
8 * Redistribution and use in source and binary forms,
9 * with or without modification, are permitted according to the terms
10 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
11 */
12
13/**
14\file RooBatchCompute.cxx
15\class RbcClass
16\ingroup Roobatchcompute
17
18This file contains the code for CPU computations using the RooBatchCompute library.
19**/
20
21#include "RooBatchCompute.h"
22#include "RooNaNPacker.h"
23#include "RooVDTHeaders.h"
24#include "Batches.h"
25
26#include <ROOT/RConfig.hxx>
27
28#ifdef ROOBATCHCOMPUTE_USE_IMT
29#include <ROOT/TExecutor.hxx>
30#endif
31
32#include <Math/Util.h>
33
34#include <algorithm>
35#include <sstream>
36#include <stdexcept>
37#include <vector>
38
39#ifndef RF_ARCH
40#error "RF_ARCH should always be defined"
41#endif
42
43namespace RooBatchCompute {
44namespace RF_ARCH {
45
46namespace {
47
/// Populate the bookkeeping fields of a Batches object.
///
/// Sets the output pointer, the event count, the batch count, and the
/// pointer/size of the extra-argument span. The `args` member is NOT set
/// here; the callers in this file assign it right after calling fillArrays().
///
/// \param batches   The object to fill.
/// \param output    Pointer stored as `batches.output`.
/// \param nEvents   Value stored as `batches.nEvents`.
/// \param nBatches  Value stored as `batches.nBatches` (callers pass `vars.size()`).
/// \param extraArgs Span whose data pointer and size are stored as `extra` / `nExtra`.
48void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgSpan extraArgs)
49{
50 batches.extra = extraArgs.data();
51 batches.nEvents = nEvents;
52 batches.nBatches = nBatches;
53 batches.nExtra = extraArgs.size();
54 batches.output = output;
55}
56
/// Point one Batch at each input span and record whether it is a per-event vector.
///
/// For every span in `vars`, the matching element of `arrays` receives the
/// span's data pointer. The element is flagged as a vector when the span is
/// empty or holds at least `nEvents` values; otherwise the flag is false, and
/// advance() will then leave that pointer where it is.
///
/// \param arrays  Destination Batch objects, indexed in parallel with `vars`.
///                No size check is done here; callers in this file construct
///                it with `vars.size()` elements.
/// \param vars    The input spans.
/// \param nEvents Threshold used to decide the vector flag.
57void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents)
58{
59 for (std::size_t i = 0; i < vars.size(); i++) {
60 arrays[i]._array = vars[i].data();
61 arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
62 }
63}
64
/// Move a Batches object forward by `nEvents` events.
///
/// The output pointer always advances. Each argument pointer advances only if
/// its Batch is flagged as a vector.
///
/// \param batches The object whose `args[0 .. nBatches)` and `output` pointers are moved.
/// \param nEvents Number of elements to step over.
65inline void advance(Batches &batches, std::size_t nEvents)
66{
67 for (std::size_t i = 0; i < batches.nBatches; i++) {
68 Batch &arg = batches.args[i];
 // Multiplying by the flag yields an offset of zero for non-vector arguments.
69 arg._array += arg._isVector * nEvents;
70 }
71 batches.output += nEvents;
72}
73
74} // namespace
75
76std::vector<void (*)(Batches &)> getFunctions();
77
78/// This class overrides some RooBatchComputeInterface functions, for the
79/// purpose of providing a CPU specific implementation of the library.
///
/// It holds a table of compute-function pointers (`_computeFunctions`) that
/// compute() and computeIMT() index with the `Computer` argument.
80class RooBatchComputeClass : public RooBatchComputeInterface {
81public:
 // NOTE(review): the constructor's declaration line (listing line 82) is
 // missing from this listing; only its body is visible below. Since
 // `_computeFunctions` is const, it is presumably initialised on that line
 // (a free function getFunctions() is declared just above the class) -- confirm.
83 {
84 // Set the dispatch pointer to this instance of the library upon loading
85 dispatchCPU = this;
86 }
87
 /// Returns the architecture enumerator named by the RF_ARCH macro.
88 Architecture architecture() const override { return Architecture::RF_ARCH; };
 /// Returns the value of the RF_ARCH macro as a lower-case string.
89 std::string architectureName() const override
90 {
91 // transform to lower case to match the original architecture name passed to the compiler
 // _QUOTEVAL_ is a temporary two-level wrapper so that RF_ARCH is expanded
 // before being passed to _QUOTE_ (defined outside this file, presumably
 // a stringification macro -- confirm). It is undefined again right after use.
92#ifdef _QUOTEVAL_ // to quote the value of the preprocessor macro instead of the name
93#error "It's unexpected that _QUOTEVAL_ is defined at this point!"
94#endif
95#define _QUOTEVAL_(x) _QUOTE_(x)
96 std::string out = _QUOTEVAL_(RF_ARCH);
97#undef _QUOTEVAL_
 // The lambda takes unsigned char before calling std::tolower.
98 std::transform(out.begin(), out.end(), out.begin(), [](unsigned char c) { return std::tolower(c); });
99 return out;
100 };
101
 // The three overrides below are defined out of line further down in this file.
102 void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
103 ArgSpan extraArgs) override;
104 double reduceSum(Config const &, InputArr input, size_t n) override;
105 ReduceNLLOutput reduceNLL(Config const &, std::span<const double> probas, std::span<const double> weights,
106 std::span<const double> offsetProbas) override;
107
108private:
 // Multi-threaded variant of compute(); only compiled when
 // ROOBATCHCOMPUTE_USE_IMT is defined.
109#ifdef ROOBATCHCOMPUTE_USE_IMT
110 void computeIMT(Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, ArgSpan extraArgs);
111#endif
112
 // Table of compute functions, indexed by the `Computer` value.
113 const std::vector<void (*)(Batches &)> _computeFunctions;
114};
115
116#ifdef ROOBATCHCOMPUTE_USE_IMT
/// Multi-threaded evaluation: split the events into contiguous slices and
/// evaluate each slice in its own task through the executor `ex`.
///
/// \param computer  Index into `_computeFunctions` selecting the function to run.
/// \param output    Start of the output array; each task writes to its own slice.
/// \param nEvents   Total number of events; the function returns at once if it is 0.
/// \param vars      Input spans, passed on to fillArrays().
/// \param extraArgs Extra arguments, passed on to fillBatches().
117void RooBatchComputeClass::computeIMT(Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
118 ArgSpan extraArgs)
119{
120 if (nEvents == 0)
121 return;
 // NOTE(review): the declaration of `ex` (listing line 122) is missing from
 // this listing; it is used below via GetPoolSize() and Map(), so it is
 // presumably a ROOT executor object -- confirm.
 // NOTE(review): the next statements divide by nThreads; this assumes the
 // pool size is non-zero -- confirm.
123 std::size_t nThreads = ex.GetPoolSize();
124
 // Ceiling division: events per task, rounded up.
125 std::size_t nEventsPerThread = nEvents / nThreads + (nEvents % nThreads > 0);
126
127 // Reset the number of threads to the number we actually need given nEventsPerThread
128 nThreads = nEvents / nEventsPerThread + (nEvents % nEventsPerThread > 0);
129
 // Work done by task number `idx`. Captures everything by reference; always
 // returns 0.
130 auto task = [&](std::size_t idx) -> int {
131 // Each task builds its own Batches object sized for nEventsPerThread events,
132 // then advances it past the events that belong to the tasks with a lower index.
133 Batches batches;
134 std::vector<Batch> arrays(vars.size());
135 fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs);
136 fillArrays(arrays, vars, nEvents);
137 batches.args = arrays.data();
138 advance(batches, batches.nEvents * idx);
139
140 // Set the number of events of the last Batches object as the remaining events
141 if (idx == nThreads - 1) {
142 batches.nEvents = nEvents - idx * batches.nEvents;
143 }
144
 // Process this task's slice in chunks of bufferSize events. The loop
 // condition is a strict '>', so the final call after the loop always
 // handles the last 1..bufferSize events of the slice.
145 std::size_t events = batches.nEvents;
146 batches.nEvents = bufferSize;
147 while (events > bufferSize) {
148 _computeFunctions[computer](batches);
149 advance(batches, bufferSize);
150 events -= bufferSize;
151 }
152 batches.nEvents = events;
153 _computeFunctions[computer](batches);
154 return 0;
155 };
156
 // Task indices 0 .. nThreads-1. The vector is value-initialised, so
 // indices[0] is already 0 and the loop can start at 1.
157 std::vector<std::size_t> indices(nThreads);
158 for (unsigned int i = 1; i < nThreads; i++) {
159 indices[i] = i;
160 }
161 ex.Map(task, indices);
162}
163#endif
164
165/** Compute multiple values using optimized functions.
166This method creates a Batches object and passes it to the correct compute function.
167In case Implicit Multithreading is enabled, the events to be processed are equally
168divided among the tasks to be generated and computed in parallel.
169\param computer An enum specifying the compute function to be used.
170\param output The array where the computation results are stored.
171\param nEvents The number of events to be processed.
172\param vars A std::span containing pointers to the variables involved in the computation.
173\param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
174void RooBatchComputeClass::compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
175 ArgSpan extraArgs)
176{
177 // In the original implementation of this library, the evaluation was done
178 // multi-threaded if implicit multi-threading was enabled in ROOT with
179 // ROOT::EnableImplicitMT().
180 //
181 // However, this multithreaded mode was not carefully validated and is
182 // therefore not production ready. One would first have to study the
183 // overhead for different numbers of cores, number of events, and model
184 // complexity. Then, we should only consider implicit multithreading here if
185 // there is no performance penalty for any scenario, to not surprise the
186 // users with unexpected slowdowns!
187 //
188 // Note that the priority of investigating this is not high, because RooFit
189 // R & D efforts currently go in the direction of parallelization at the
190 // level of the gradient components, or achieving single-threaded speedup
191 // with automatic differentiation. Furthermore, the single-threaded
192 // performance of the new CPU evaluation backend with the RooBatchCompute
193 // library, is generally much faster than the legacy evaluation backend
194 // already, even if the latter uses multi-threading.
 // NOTE(review): the line that opens the block closed by the brace below
 // (listing line 196) is missing from this listing; presumably it is the
 // check whether implicit multi-threading is enabled -- confirm.
 // NOTE(review): as listed, there is no return after computeIMT(), so control
 // continues into the single-threaded path below and the same events would
 // be evaluated a second time when this branch is taken -- confirm whether an
 // early return is intended.
195#ifdef ROOBATCHCOMPUTE_USE_IMT
197 computeIMT(computer, output, nEvents, vars, extraArgs);
198 }
199#endif
200
201 // Single-threaded path: set up one Batches object that covers all nEvents
202 // events, with one Batch per input span.
203 Batches batches;
204 std::vector<Batch> arrays(vars.size());
205 fillBatches(batches, output, nEvents, vars.size(), extraArgs);
206 fillArrays(arrays, vars, nEvents);
207 batches.args = arrays.data();
208
 // Process the events in chunks of bufferSize. The loop condition is a strict
 // '>', so the final call after the loop handles whatever is left (at most
 // bufferSize events; zero only when nEvents itself is zero).
209 std::size_t events = batches.nEvents;
210 batches.nEvents = bufferSize;
211 while (events > bufferSize) {
212 _computeFunctions[computer](batches);
213 advance(batches, bufferSize);
214 events -= bufferSize;
215 }
216 batches.nEvents = events;
217 _computeFunctions[computer](batches);
218}
219
220namespace {
221
/// Take the logarithm of one probability value and classify problem cases.
///
/// Updates the counters in `out` and returns a pair:
///  - first:  the value to use as the log term,
///  - second: an error contribution that reduceNLL() accumulates as "badness"
///            (0.0 when the input is a regular positive number).
///
/// \param prob The probability value to process.
/// \param out  Counters `nLargeValues`, `nNonPositiveValues` and `nNaNValues`
///             are incremented here as applicable.
222inline std::pair<double, double> getLog(double prob, ReduceNLLOutput &out)
223{
 // Count very large magnitudes. This does not return, so such a value is
 // still handled by one of the branches below.
224 if (std::abs(prob) > 1e6) {
225 out.nLargeValues++;
226 }
227
 // Zero or negative input: std::log is still applied as-is, and -prob is
 // reported as the error contribution.
228 if (prob <= 0.0) {
229 out.nNonPositiveValues++;
230 return {std::log(prob), -prob};
231 }
232
 // A NaN compares false in the '<= 0.0' test above, so it reaches this check.
 // The NaN itself is passed through as the log term; the error contribution
 // is whatever RooNaNPacker::unpackNaN extracts from it.
233 if (std::isnan(prob)) {
234 out.nNaNValues++;
235 return {prob, RooNaNPacker::unpackNaN(prob)};
236 }
237
 // Regular positive value: plain logarithm, no error contribution.
238 return {std::log(prob), 0.0};
239}
240
241} // namespace
242
/// Reduce the `n` values starting at `input` to a single double. The Config
/// argument is unnamed and unused in the signature.
///
/// NOTE(review): the statement forming the body (listing line 245) is missing
/// from this listing, so the exact summation method cannot be confirmed from
/// here; presumably it returns the sum of the input array -- confirm against
/// the original file.
243double RooBatchComputeClass::reduceSum(Config const &, InputArr input, size_t n)
244{
246}
247
/// Accumulate the weighted negative log of a set of probabilities.
///
/// For every event with non-zero weight, the term
/// `-weight * (log(probas[i]) - log(offsetProbas[i]))` is added to the sum.
/// The offset part is only applied when `offsetProbas` is non-empty.
///
/// \param probas       Per-event probability values.
/// \param weights      Per-event weights if it has more than one element;
///                     otherwise `weights[0]` is used for every event.
/// \param offsetProbas Optional per-event offsets, indexed like `probas`.
/// \return A ReduceNLLOutput with `nllSum` / `nllSumCarry` and the counters
///         filled by getLog(). If any event contributed a non-zero error
///         value, `nllSum` is replaced by a NaN packed with the accumulated
///         "badness" and `nllSumCarry` is set to 0.
248ReduceNLLOutput RooBatchComputeClass::reduceNLL(Config const &, std::span<const double> probas,
249 std::span<const double> weights, std::span<const double> offsetProbas)
250{
251 ReduceNLLOutput out;
252
 // Running total of the error contributions returned by getLog().
253 double badness = 0.0;
254
 // NOTE(review): the declaration of `nllSum` (listing line 255) is missing
 // from this listing. It is used below through Add(), Sum() and Carry(), so
 // it is presumably a compensated-summation accumulator -- confirm.
256
257 for (std::size_t i = 0; i < probas.size(); ++i) {
258
 // NOTE(review): weights[0] is read whenever weights.size() <= 1, which
 // assumes `weights` is never empty -- confirm against callers.
259 const double eventWeight = weights.size() > 1 ? weights[i] : weights[0];
260
 // Zero-weight events are skipped entirely: they neither contribute to
 // the sum nor to the counters in `out`.
261 if (0. == eventWeight)
262 continue;
263
264 std::pair<double, double> logOut = getLog(probas[i], out);
265 double term = logOut.first;
266 badness += logOut.second;
267
268 if (!offsetProbas.empty()) {
269 term -= std::log(offsetProbas[i]);
270 }
271
272 term *= -eventWeight;
273
274 nllSum.Add(term);
275 }
276
277 out.nllSum = nllSum.Sum();
278 out.nllSumCarry = nllSum.Carry();
279
280 if (badness != 0.) {
281 // Some events with evaluation errors. Return "badness" of errors.
282 out.nllSum = RooNaNPacker::packFloatIntoNaN(badness);
283 out.nllSumCarry = 0.0;
284 }
285
286 return out;
287}
288
289/// Static object to trigger the constructor which overwrites the dispatch pointer.
291
292} // End namespace RF_ARCH
293} // End namespace RooBatchCompute
#define c(i)
Definition RSha256.hxx:101
#define _QUOTEVAL_(x)
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
These classes encapsulate the necessary data for the computations.
This class implements the interface to execute the same task multiple times, sequentially or in paral...
Definition TExecutor.hxx:37
unsigned GetPoolSize() const
Return the number of pooled workers.
The Kahan summation is a compensated summation algorithm, which significantly reduces numerical error...
Definition Util.h:122
T Sum() const
Definition Util.h:240
static KahanSum< T, N > Accumulate(Iterator begin, Iterator end, T initialValue=T{})
Iterate over a range and return an instance of a KahanSum.
Definition Util.h:211
T Carry() const
Definition Util.h:250
void Add(T x)
Single-element accumulation. Will not vectorise.
Definition Util.h:165
std::size_t nEvents
Definition Batches.h:46
Minimal configuration struct to steer the evaluation of a single node with the RooBatchCompute librar...
This class overrides some RooBatchComputeInterface functions, for the purpose of providing a cuda spe...
void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, ArgSpan extraArgs) override
ReduceNLLOutput reduceNLL(RooBatchCompute::Config const &cfg, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
double reduceSum(Config const &, InputArr input, size_t n) override
const std::vector< void(*)(Batches &)> _computeFunctions
double reduceSum(RooBatchCompute::Config const &cfg, InputArr input, size_t n) override
Return the sum of an input array.
void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, ArgSpan extraArgs) override
Compute multiple values using cuda kernels.
ReduceNLLOutput reduceNLL(Config const &, std::span< const double > probas, std::span< const double > weights, std::span< const double > offsetProbas) override
const Int_t n
Definition legend1.C:16
Double_t ex[n]
Definition legend1.C:17
void(off) SmallVectorTemplateBase< T
Bool_t IsImplicitMTEnabled()
Returns true if the implicit multi-threading in ROOT is enabled.
Definition TROOT.cxx:568
std::vector< void(*)(Batches &)> getFunctions()
Returns a std::vector of pointers to the compute functions in this file.
static RooBatchComputeClass computeObj
Static object to trigger the constructor which overwrites the dispatch pointer.
Namespace for dispatching RooFit computations to various backends.
std::span< double > ArgSpan
R__EXTERN RooBatchComputeInterface * dispatchCPU
This dispatch pointer points to an implementation of the compute library, provided one has been loade...
constexpr std::size_t bufferSize
const double *__restrict InputArr
std::span< const std::span< const double > > VarSpan
double *__restrict RestrictArr
void probas(TString dataset, TString fin="TMVA.root", Bool_t useTMVAStyle=kTRUE)
__roodevice__ static __roohost__ double packFloatIntoNaN(float payload)
Pack float into mantissa of a NaN.
static float unpackNaN(double val)
If val is NaN and this NaN has been tagged as containing a payload, unpack the float from the manti...
static void output()