19 #include "cuda_runtime.h" 28 template<
typename AFloat>
31 cudaFreeHost(*devicePointer);
32 delete[] devicePointer;
36 template<
typename AFloat>
40 AFloat ** pointer =
new AFloat * [1];
41 cudaMallocHost(pointer, size *
sizeof(AFloat));
46 template<
typename AFloat>
53 template<
typename AFloat>
66 template<
typename AFloat>
69 cudaFree(*devicePointer);
70 delete[] devicePointer;
74 template<
typename AFloat>
78 AFloat ** pointer =
new AFloat * [1];
79 cudaMalloc(pointer, size *
sizeof(AFloat));
85 template<
typename AFloat>
90 AFloat ** pointer =
new AFloat * [1];
91 cudaMalloc(pointer, size *
sizeof(AFloat));
96 template<
typename AFloat>
102 AFloat ** pointer =
new AFloat * [1];
103 *pointer = devicePointer;
108 template<
typename AFloat>
119 template<
typename AFloat>
126 template<
typename AFloat>
130 cudaMemcpyAsync(*
this, buffer,
fSize *
sizeof(AFloat),
135 template<
typename AFloat>
138 cudaMemcpyAsync(*
this, buffer,
fSize *
sizeof(AFloat),
153 for (
size_t i = 0; i < batchSize; i++) {
154 size_t sampleIndex = *sampleIterator;
155 for (
size_t j = 0; j <
n; j++) {
156 size_t bufferIndex = j * batchSize + i;
157 buffer[bufferIndex] =
static_cast<float>(inputMatrix(sampleIndex, j));
173 for (
size_t i = 0; i < batchSize; i++) {
174 size_t sampleIndex = *sampleIterator;
175 for (
size_t j = 0; j <
n; j++) {
176 size_t bufferIndex = j * batchSize + i;
177 buffer[bufferIndex] =
static_cast<float>(outputMatrix(sampleIndex, j));
190 Event *
event = fData.front();
191 size_t n =
event->GetNVariables();
195 for (
size_t i = 0; i < batchSize; i++) {
196 size_t sampleIndex = * sampleIterator++;
197 event = fData[sampleIndex];
198 for (
size_t j = 0; j <
n; j++) {
199 size_t bufferIndex = j * batchSize + i;
200 buffer[bufferIndex] =
static_cast<float>(
event->GetValue(j));
212 Event *
event = fData.front();
213 size_t n = buffer.
GetSize() / batchSize;
217 for (
size_t i = 0; i < batchSize; i++) {
218 size_t sampleIndex = * sampleIterator++;
219 event = fData[sampleIndex];
220 for (
size_t j = 0; j <
n; j++) {
222 size_t bufferIndex = j * batchSize + i;
224 if (event->GetNTargets() == 0) {
227 buffer[bufferIndex] = (
event->GetClass() == 0) ? 1.0 : 0.0;
230 buffer[bufferIndex] = 0.0;
231 if (j == event->GetClass()) {
232 buffer[bufferIndex] = 1.0;
236 buffer[bufferIndex] =
static_cast<float>(
event->GetTarget(j));
252 for (
size_t i = 0; i < batchSize; i++) {
253 size_t sampleIndex = *sampleIterator;
254 for (
size_t j = 0; j <
n; j++) {
255 size_t bufferIndex = j * batchSize + i;
256 buffer[bufferIndex] = inputMatrix(sampleIndex, j);
272 for (
size_t i = 0; i < batchSize; i++) {
273 size_t sampleIndex = *sampleIterator;
274 for (
size_t j = 0; j <
n; j++) {
275 size_t bufferIndex = j * batchSize + i;
276 buffer[bufferIndex] = outputMatrix(sampleIndex, j);
289 Event *
event = fData.front();
290 size_t n =
event->GetNVariables();
294 for (
size_t i = 0; i < batchSize; i++) {
295 size_t sampleIndex = * sampleIterator++;
296 event = fData[sampleIndex];
297 for (
size_t j = 0; j <
n; j++) {
298 size_t bufferIndex = j * batchSize + i;
299 buffer[bufferIndex] =
event->GetValue(j);
311 Event *
event = fData.front();
312 size_t n = buffer.
GetSize() / batchSize;
316 for (
size_t i = 0; i < batchSize; i++) {
317 size_t sampleIndex = * sampleIterator++;
318 event = fData[sampleIndex];
319 for (
size_t j = 0; j <
n; j++) {
321 size_t bufferIndex = j * batchSize + i;
323 if (event->GetNTargets() == 0) {
326 buffer[bufferIndex] = (
event->GetClass() == 0) ? 1.0 : 0.0;
329 buffer[bufferIndex] = 0.0;
330 if (j == event->GetClass()) {
331 buffer[bufferIndex] = 1.0;
335 buffer[bufferIndex] =
event->GetTarget(j);
std::shared_ptr< AFloat * > fHostPointer
Pointer to the buffer data.
TCudaDeviceBuffer()=default
typename std::vector< size_t >::iterator IndexIterator_t
size_t fOffset
Offset for sub-buffers.
void CopyTo(const TCudaHostBuffer< AFloat > &) const
struct TMVA::DNN::TCudaHostBuffer::TDestructor fDestructor
cudaStream_t fComputeStream
cudaStream for data transfer
struct TMVA::DNN::TCudaDeviceBuffer::TDestructor fDestructor
void CopyFrom(const TCudaHostBuffer< AFloat > &) const
void operator()(AFloat **devicePointer)
void operator()(AFloat **devicePointer)
std::shared_ptr< AFloat * > fDevicePointer
Pointer to the buffer data.
TCudaHostBuffer()=default
Abstract ClassifierFactory template that handles arbitrary types.
TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
TCudaHostBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
size_t fOffset
Offset for sub-buffers.
cudaStream_t fComputeStream
cudaStream for data transfer