void fillBatches(Batches &batches, double *output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
{
   batches.nEvents = nEvents;
   batches.nBatches = nBatches;
   batches.nExtra = nExtraArgs;
   batches.output = output;
}
void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
{
   for (std::size_t i = 0; i < vars.size(); i++) {
      const std::span<const double> &span = vars[i];
      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
      if (!arrays[i]._isVector) {
         // Scalar case: the value is not on the GPU yet, so stage it in the
         // host-side buffer and point the batch at its device mirror.
         buffer[i] = span[0];
         arrays[i]._array = bufferDevice + i;
      } else {
         // Vector case: the data already lives on the GPU, so point at it directly.
         arrays[i]._array = span.data();
      }
   }
}
int getGridSize(std::size_t n)
{
   // Cap the grid size at roughly the number of streaming multiprocessors of
   // the target GPU; each block covers the remaining events with a
   // grid-stride loop.
   constexpr int maxGridSize = 84;
   return std::min(int(std::ceil(double(n) / blockSize)), maxGridSize);
}
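// Illustrative check of the formula above, assuming blockSize = 512 (blockSize
// is defined elsewhere in this backend; the value 512 is an assumption here):
//   getGridSize(100)       -> ceil(100 / 512.0) = 1    block
//   getGridSize(1'000'000) -> ceil(1e6 / 512.0) = 1954 -> clamped to 84 blocks,
// so large inputs run as 84 blocks that each stride over many chunks of events.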
void compute(Config const &cfg, Computer computer, std::span<double> output, VarSpan vars, ArgSpan extraArgs) override
{
   using namespace CudaInterface;

   std::size_t nEvents = output.size();

   const std::size_t memSize = sizeof(Batches) + vars.size() * sizeof(Batch) + vars.size() * sizeof(double) +
                               extraArgs.size() * sizeof(double);

   std::vector<char> hostMem(memSize);
   auto batches = reinterpret_cast<Batches *>(hostMem.data());
   auto arrays = reinterpret_cast<Batch *>(batches + 1);
   auto scalarBuffer = reinterpret_cast<double *>(arrays + vars.size());
   auto extraArgsHost = reinterpret_cast<double *>(scalarBuffer + vars.size());

   DeviceArray<char> deviceMem(memSize);
   auto batchesDevice = reinterpret_cast<Batches *>(deviceMem.data());
   auto arraysDevice = reinterpret_cast<Batch *>(batchesDevice + 1);
   auto scalarBufferDevice = reinterpret_cast<double *>(arraysDevice + vars.size());
   auto extraArgsDevice = reinterpret_cast<double *>(scalarBufferDevice + vars.size());

   fillBatches(*batches, output.data(), nEvents, vars.size(), extraArgs.size());
   fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
   batches->args = arraysDevice;

   if (!extraArgs.empty()) {
      std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
      batches->extra = extraArgsDevice;
   }

   copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());

   const int gridSize = getGridSize(nEvents);
   _computeFunctions[computer]<<<gridSize, blockSize, 0, *cfg.cudaStream()>>>(*batchesDevice);

   // The kernel may modify the mutable extra arguments in place, so copy them back.
   if (!extraArgs.empty()) {
      copyDeviceToHost(extraArgsDevice, extraArgs.data(), extraArgs.size(), cfg.cudaStream());
   }
}
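// Layout of the packed allocation built above, mirrored on host and device:
//
//   [ Batches ][ Batch x vars.size() ][ double x vars.size() ][ double x extraArgs.size() ]
//     header     per-input descriptors  scalar-value buffer     extra arguments
//
// Keeping everything contiguous means a single copyHostToDevice call stages
// all kernel parameters for the launch.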
double reduceSum(Config const &cfg, InputArr input, size_t n) override;
ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,
                          std::span<const double> offsetProbas) override;
__device__ void kahanSumUpdate(double &sum, double &carry, double a, double otherCarry)
{
   // The compensation is accumulated with a negative sign, so it is subtracted here.
   const double y = a - (carry + otherCarry);
   // sum is big and y small, so the low-order digits of y are lost in this addition...
   const double t = sum + y;
   // ...and (t - sum) - y recovers exactly those lost digits (negated) as the new carry.
   carry = (t - sum) - y;
   sum = t;
}
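// Worked example of the compensation (hypothetical values): with sum = 1e16,
// carry = 0, otherCarry = 0 and a = 1.0, the rounded addition t = 1e16 + 1.0
// yields 1e16, so carry = (t - sum) - y = 0.0 - 1.0 = -1.0 stores the lost
// low-order part (negated); the next call computes y = a - (carry + otherCarry)
// and thereby adds the lost 1.0 back into the running sum.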
inline __device__ void kahanSumReduction(double *shared, size_t n, double *__restrict__ result, int carry_index)
{
   // Tree reduction over the block, merging both the sums and their carries.
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
      if (threadIdx.x < i && (threadIdx.x + i) < n) {
         kahanSumUpdate(shared[threadIdx.x], shared[carry_index], shared[threadIdx.x + i], shared[carry_index + i]);
      }
      __syncthreads();
   }

   // Thread 0 writes the block result: the sum first, the carry in the second half.
   if (threadIdx.x == 0) {
      result[blockIdx.x] = shared[0];
      result[blockIdx.x + gridDim.x] = shared[carry_index];
   }
}
__global__ void kahanSum(const double *__restrict__ input, const double *__restrict__ carries, size_t n,
                         double *__restrict__ result, bool nll)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // The first half of shared memory holds the running sums, the second half the carries.
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   // Grid-stride loop: each thread accumulates every nThreadsTotal-th element.
   for (int i = gthIdx; i < n; i += nThreadsTotal) {
      // In NLL mode the inputs are probabilities, so accumulate -log(p) instead.
      double val = nll ? -std::log(input[i]) : input[i];
      kahanSumUpdate(sum, carry, val, carries ? carries[i] : 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   // Wait until every thread in the block has stored its partial sum.
   __syncthreads();

   kahanSumReduction(shared, n, result, carry_index);
}
__global__ void nllSumKernel(const double *__restrict__ probas, const double *__restrict__ weights,
                             const double *__restrict__ offsetProbas, size_t nProbas, double scalarProba,
                             size_t nWeights, double *__restrict__ result)
{
   int thIdx = threadIdx.x;
   int gthIdx = thIdx + blockIdx.x * blockSize;
   int carry_index = threadIdx.x + blockDim.x;
   const int nThreadsTotal = blockSize * gridDim.x;

   // Same shared-memory layout as in kahanSum: sums first, carries second.
   extern __shared__ double shared[];

   double sum = 0.0;
   double carry = 0.0;

   for (int i = gthIdx; i < nWeights; i += nThreadsTotal) {
      // A single scalar probability (nProbas == 1) is broadcast to all events.
      double val = -std::log(nProbas == 1 ? scalarProba : probas[i]);
      if (offsetProbas)
         val += std::log(offsetProbas[i]);
      val = weights[i] * val;
      kahanSumUpdate(sum, carry, val, 0.0);
   }

   shared[thIdx] = sum;
   shared[carry_index] = carry;

   __syncthreads();

   kahanSumReduction(shared, nWeights, result, carry_index);
}
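// In math form, the per-event term accumulated above is
//   val_i = w_i * (log(offset_i) - log(p_i)) = w_i * log(offset_i / p_i),
// i.e. a weighted negative log-likelihood taken relative to the optional
// offset probabilities, which keeps the summands small so the Kahan
// accumulation loses less precision.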
double RooBatchComputeClass::reduceSum(Config const &cfg, InputArr input, size_t n)
{
   const int gridSize = getGridSize(n);
   cudaStream_t stream = *cfg.cudaStream();
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

   // First pass: per-block Kahan sums and carries into devOut.
   kahanSum<<<gridSize, blockSize, shMemSize, stream>>>(input, nullptr, n, devOut.data(), 0);
   // Second pass: one block folds the per-block sums and their carries together.
   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmp = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmp, 1, cfg.cudaStream());
   return tmp;
}
ReduceNLLOutput RooBatchComputeClass::reduceNLL(Config const &cfg, std::span<const double> probas,
                                                std::span<const double> weights, std::span<const double> offsetProbas)
{
   ReduceNLLOutput out;

   if (probas.empty()) {
      return out;
   }

   const int gridSize = getGridSize(weights.size());
   CudaInterface::DeviceArray<double> devOut(2 * gridSize);
   cudaStream_t stream = *cfg.cudaStream();
   constexpr int shMemSize = 2 * blockSize * sizeof(double);

   // Sanity check that all input spans indeed live in device memory.
   for (auto span : {probas, weights, offsetProbas}) {
      cudaPointerAttributes attr;
      assert(span.size() == 0 || span.data() == nullptr ||
             (cudaPointerGetAttributes(&attr, span.data()) == cudaSuccess && attr.type == cudaMemoryTypeDevice));
   }

   nllSumKernel<<<gridSize, blockSize, shMemSize, stream>>>(
      probas.data(), weights.data(), offsetProbas.empty() ? nullptr : offsetProbas.data(), probas.size(),
      probas.size() == 1 ? probas[0] : 0.0, weights.size(), devOut.data());

   kahanSum<<<1, blockSize, shMemSize, stream>>>(devOut.data(), devOut.data() + gridSize, gridSize, devOut.data(), 0);

   double tmpSum = 0.0;
   double tmpCarry = 0.0;
   CudaInterface::copyDeviceToHost(devOut.data(), &tmpSum, 1, cfg.cudaStream());
   CudaInterface::copyDeviceToHost(devOut.data() + gridSize, &tmpCarry, 1, cfg.cudaStream());

   out.nllSum = ROOT::Math::KahanSum<double>{tmpSum, tmpCarry};
   return out;
}
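// Shape of the reduction above: nllSumKernel leaves gridSize partial sums in
// devOut[0, gridSize) and their carries in devOut[gridSize, 2*gridSize); the
// single-block kahanSum launch folds both halves together so that only
// devOut[0] (the sum) and devOut[gridSize] (the carry) must be copied back.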
class ScalarBufferContainer {
public:
   ScalarBufferContainer() {}
   ScalarBufferContainer(std::size_t size)
   {
      if (size != 1)
         throw std::runtime_error("ScalarBufferContainer can only be of size 1");
   }

   double const *hostReadPtr() const { return &_val; }
   double const *deviceReadPtr() const { return &_val; }

   double *hostWritePtr() { return &_val; }
   double *deviceWritePtr() { return &_val; }

   void assignFromHost(std::span<const double> input) { _val = input[0]; }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), &_val, input.size());
   }

private:
   double _val;
};
class CPUBufferContainer {
public:
   CPUBufferContainer(std::size_t size) : _vec(size) {}

   double const *hostReadPtr() const { return _vec.data(); }
   double const *deviceReadPtr() const
   {
      // A host-only buffer has no device address.
      throw std::bad_function_call();
   }

   double *hostWritePtr() { return _vec.data(); }
   double *deviceWritePtr()
   {
      throw std::bad_function_call();
   }

   void assignFromHost(std::span<const double> input) { _vec.assign(input.begin(), input.end()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToHost(input.data(), hostWritePtr(), input.size());
   }

private:
   std::vector<double> _vec;
};
class GPUBufferContainer {
public:
   GPUBufferContainer(std::size_t size) : _arr(size) {}

   double const *hostReadPtr() const
   {
      // A pure device buffer has no host address.
      throw std::bad_function_call();
   }
   double const *deviceReadPtr() const { return _arr.data(); }

   double *hostWritePtr() const
   {
      throw std::bad_function_call();
   }
   double *deviceWritePtr() const { return const_cast<double *>(_arr.data()); }

   void assignFromHost(std::span<const double> input)
   {
      CudaInterface::copyHostToDevice(input.data(), deviceWritePtr(), input.size());
   }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size());
   }

private:
   CudaInterface::DeviceArray<double> _arr;
};
class PinnedBufferContainer {
public:
   PinnedBufferContainer(std::size_t size) : _arr{size}, _gpuBuffer{size} {}
   std::size_t size() const { return _arr.size(); }

   void setCudaStream(CudaInterface::CudaStream *stream) { _cudaStream = stream; }

   double const *hostReadPtr() const
   {
      if (_lastAccess == LastAccessType::GPU_WRITE) {
         // The GPU copy is newer: bring it back into the pinned host array first.
         CudaInterface::copyDeviceToHost(_gpuBuffer.deviceReadPtr(), const_cast<double *>(_arr.data()), size(),
                                         _cudaStream);
      }
      _lastAccess = LastAccessType::CPU_READ;
      return const_cast<double *>(_arr.data());
   }
   double const *deviceReadPtr() const
   {
      if (_lastAccess == LastAccessType::CPU_WRITE) {
         // The host copy is newer: upload it to the GPU buffer first.
         CudaInterface::copyHostToDevice(_arr.data(), _gpuBuffer.deviceWritePtr(), size(), _cudaStream);
      }
      _lastAccess = LastAccessType::GPU_READ;
      return _gpuBuffer.deviceReadPtr();
   }

   double *hostWritePtr()
   {
      _lastAccess = LastAccessType::CPU_WRITE;
      return _arr.data();
   }
   double *deviceWritePtr()
   {
      _lastAccess = LastAccessType::GPU_WRITE;
      return _gpuBuffer.deviceWritePtr();
   }

   void assignFromHost(std::span<const double> input) { std::copy(input.begin(), input.end(), hostWritePtr()); }
   void assignFromDevice(std::span<const double> input)
   {
      CudaInterface::copyDeviceToDevice(input.data(), deviceWritePtr(), input.size(), _cudaStream);
   }

private:
   enum class LastAccessType { CPU_READ, GPU_READ, CPU_WRITE, GPU_WRITE };

   CudaInterface::PinnedHostArray<double> _arr;
   GPUBufferContainer _gpuBuffer;
   CudaInterface::CudaStream *_cudaStream = nullptr;
   mutable LastAccessType _lastAccess = LastAccessType::CPU_READ;
};
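// Access pattern illustrated (hypothetical call sequence): handing
// deviceWritePtr() to a kernel marks the buffer GPU_WRITE, so a subsequent
// hostReadPtr() first copies the device data back into the pinned host array
// on _cudaStream; conversely, hostWritePtr() followed by deviceReadPtr()
// triggers the host-to-device upload. Repeated reads on the same side copy
// nothing.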
template <class Container>
class BufferImpl : public AbsBuffer {
public:
   using Queue = std::queue<std::unique_ptr<Container>>;

   BufferImpl(std::size_t size, Queue &queue) : _queue{queue}
   {
      if (_queue.empty()) {
         _vec = std::make_unique<Container>(size);
      } else {
         // Recycle a previously released container of the right size.
         _vec = std::move(_queue.front());
         _queue.pop();
      }
   }

   // Return the container to its queue so the next buffer of this size can reuse it.
   ~BufferImpl() override { _queue.emplace(std::move(_vec)); }

   double const *hostReadPtr() const override { return _vec->hostReadPtr(); }
   double const *deviceReadPtr() const override { return _vec->deviceReadPtr(); }

   double *hostWritePtr() override { return _vec->hostWritePtr(); }
   double *deviceWritePtr() override { return _vec->deviceWritePtr(); }

   void assignFromHost(std::span<const double> input) override { _vec->assignFromHost(input); }
   void assignFromDevice(std::span<const double> input) override { _vec->assignFromDevice(input); }

   Container &vec() { return *_vec; }

private:
   std::unique_ptr<Container> _vec;
   Queue &_queue;
};
using ScalarBuffer = BufferImpl<ScalarBufferContainer>;
using CPUBuffer = BufferImpl<CPUBufferContainer>;
using GPUBuffer = BufferImpl<GPUBufferContainer>;
using PinnedBuffer = BufferImpl<PinnedBufferContainer>;

struct BufferQueuesMaps {
   std::map<std::size_t, ScalarBuffer::Queue> scalarBufferQueuesMap;
   std::map<std::size_t, CPUBuffer::Queue> cpuBufferQueuesMap;
   std::map<std::size_t, GPUBuffer::Queue> gpuBufferQueuesMap;
   std::map<std::size_t, PinnedBuffer::Queue> pinnedBufferQueuesMap;
};
class BufferManager : public AbsBufferManager {
public:
   BufferManager() : _queuesMaps{std::make_unique<BufferQueuesMaps>()} {}

   std::unique_ptr<AbsBuffer> makeScalarBuffer() override
   {
      return std::make_unique<ScalarBuffer>(1, _queuesMaps->scalarBufferQueuesMap[1]);
   }
   std::unique_ptr<AbsBuffer> makeCpuBuffer(std::size_t size) override
   {
      return std::make_unique<CPUBuffer>(size, _queuesMaps->cpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makeGpuBuffer(std::size_t size) override
   {
      return std::make_unique<GPUBuffer>(size, _queuesMaps->gpuBufferQueuesMap[size]);
   }
   std::unique_ptr<AbsBuffer> makePinnedBuffer(std::size_t size, CudaInterface::CudaStream *stream = nullptr) override
   {
      auto out = std::make_unique<PinnedBuffer>(size, _queuesMaps->pinnedBufferQueuesMap[size]);
      out->vec().setCudaStream(stream);
      return out;
   }

private:
   std::unique_ptr<BufferQueuesMaps> _queuesMaps;
};
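// Usage sketch (hypothetical driver code; `stream` and `hostValues` are
// placeholders, everything else is declared above):
//   BufferManager mgr;
//   auto buf = mgr.makePinnedBuffer(1024, stream); // reuses a queued container of this size if available
//   buf->assignFromHost(hostValues);               // marks the pinned buffer CPU-written
//   const double *dev = buf->deviceReadPtr();      // triggers the host-to-device copy
//   // destroying `buf` returns its container to the size-1024 queue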
std::unique_ptr<AbsBufferManager> createBufferManager()
{
   return std::make_unique<BufferManager>();
}