Logo ROOT   6.12/07
Reference Guide
CudaBuffers.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/tmva/dnn:$Id$
2 // Author: Simon Pfreundschuh 07/08/16
3 
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 ////////////////////////////////////////////////////////////////////////
13 // Implementation of device and host buffers for CUDA architectures. //
14 ////////////////////////////////////////////////////////////////////////
15 
#include "TMVA/DataSetInfo.h"
#include "TMVA/DNN/DataLoader.h"
#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"
#include "cuda_runtime.h"
#include <iostream>
22 
23 namespace TMVA {
24 namespace DNN {
25 
26 //
27 // TCudaHostBuffer
28 //______________________________________________________________________________
29 template<typename AFloat>
31 {
32  cudaFreeHost(*devicePointer);
33  delete[] devicePointer;
34 }
35 
36 //______________________________________________________________________________
37 template<typename AFloat>
39  : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
40 {
41  AFloat ** pointer = new AFloat * [1];
42  cudaMallocHost(pointer, size * sizeof(AFloat));
43  fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
44 }
45 
46 //______________________________________________________________________________
47 template<typename AFloat>
49 {
50  return *fHostPointer + fOffset;
51 }
52 
53 //______________________________________________________________________________
54 template<typename AFloat>
56  size_t size)
57 {
58  TCudaHostBuffer buffer = *this;
59  buffer.fOffset = offset;
60  buffer.fSize = size;
61  return buffer;
62 }
63 
64 //
65 // TCudaDevicePointer
66 //______________________________________________________________________________
67 template<typename AFloat>
69 {
70  cudaFree(*devicePointer);
71  delete[] devicePointer;
72 }
73 
74 //______________________________________________________________________________
75 template<typename AFloat>
77  : fOffset(0), fSize(size), fDestructor()
78 {
79  AFloat ** pointer = new AFloat * [1];
80  cudaMalloc(pointer, size * sizeof(AFloat));
81  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
82  cudaStreamCreate(&fComputeStream);
83 }
84 
85 //______________________________________________________________________________
86 template<typename AFloat>
88  cudaStream_t stream)
89  : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
90 {
91  AFloat ** pointer = new AFloat * [1];
92  cudaMalloc(pointer, size * sizeof(AFloat));
93  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
94 }
95 
96 //______________________________________________________________________________
97 template<typename AFloat>
99  size_t size,
100  cudaStream_t stream)
101  : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
102 {
103  AFloat ** pointer = new AFloat * [1];
104  *pointer = devicePointer;
105  fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
106 }
107 
108 //______________________________________________________________________________
109 template<typename AFloat>
111  size_t size)
112 {
113  TCudaDeviceBuffer buffer = *this;
114  buffer.fOffset = offset;
115  buffer.fSize = size;
116  return buffer;
117 }
118 
119 //______________________________________________________________________________
120 template<typename AFloat>
122 {
123  return *fDevicePointer + fOffset;
124 }
125 
126 //______________________________________________________________________________
127 template<typename AFloat>
129 {
130  cudaStreamSynchronize(fComputeStream);
131  cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
132  cudaMemcpyHostToDevice, fComputeStream);
133 }
134 
135 //______________________________________________________________________________
136 template<typename AFloat>
138 {
139  cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
140  cudaMemcpyDeviceToHost, fComputeStream);
142 }
143 
144 //______________________________________________________________________________
145 template<>
147  TCudaHostBuffer<float> & buffer,
148  IndexIterator_t sampleIterator,
149  size_t batchSize)
150 {
151  const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
152  size_t n = inputMatrix.GetNcols();
153 
154  for (size_t i = 0; i < batchSize; i++) {
155  size_t sampleIndex = *sampleIterator;
156  for (size_t j = 0; j < n; j++) {
157  size_t bufferIndex = j * batchSize + i;
158  buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
159  }
160  sampleIterator++;
161  }
162 }
163 
164 //______________________________________________________________________________
165 template<>
167  TCudaHostBuffer<float> & buffer,
168  IndexIterator_t sampleIterator,
169  size_t batchSize)
170 {
171  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
172  size_t n = outputMatrix.GetNcols();
173 
174  for (size_t i = 0; i < batchSize; i++) {
175  size_t sampleIndex = *sampleIterator;
176  for (size_t j = 0; j < n; j++) {
177  size_t bufferIndex = j * batchSize + i;
178  buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
179  }
180  sampleIterator++;
181  }
182 }
183 
184 //______________________________________________________________________________
185 template <>
187  IndexIterator_t sampleIterator, size_t batchSize)
188 {
189  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
190  for (size_t i = 0; i < batchSize; i++) {
191  buffer[i] = static_cast<float>(weightMatrix(*sampleIterator, 0));
192  sampleIterator++;
193  }
194 }
195 
196 //______________________________________________________________________________
197 template<>
199  TCudaHostBuffer<float> & buffer,
200  IndexIterator_t sampleIterator,
201  size_t batchSize)
202 {
203  Event *event = std::get<0>(fData)[0];
204  size_t n = event->GetNVariables();
205  for (size_t i = 0; i < batchSize; i++) {
206  size_t sampleIndex = * sampleIterator++;
207  event = std::get<0>(fData)[sampleIndex];
208  for (size_t j = 0; j < n; j++) {
209  size_t bufferIndex = j * batchSize + i;
210  buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
211  }
212  }
213 }
214 
215 //______________________________________________________________________________
216 template<>
218  TCudaHostBuffer<float> & buffer,
219  IndexIterator_t sampleIterator,
220  size_t batchSize)
221 {
222  const DataSetInfo &info = std::get<1>(fData);
223  size_t n = buffer.GetSize() / batchSize;
224 
225  // Copy target(s).
226 
227  for (size_t i = 0; i < batchSize; i++) {
228  size_t sampleIndex = *sampleIterator++;
229  Event *event = std::get<0>(fData)[sampleIndex];
230  for (size_t j = 0; j < n; j++) {
231  // Copy output matrices.
232  size_t bufferIndex = j * batchSize + i;
233  // Classification
234  if (event->GetNTargets() == 0) {
235  if (n == 1) {
236  // Binary.
237  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
238  } else {
239  // Multiclass.
240  buffer[bufferIndex] = 0.0;
241  if (j == event->GetClass()) {
242  buffer[bufferIndex] = 1.0;
243  }
244  }
245  } else {
246  buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
247  }
248  }
249  }
250 }
251 
252 //______________________________________________________________________________
253 template <>
255  size_t batchSize)
256 {
257  for (size_t i = 0; i < batchSize; i++) {
258  size_t sampleIndex = *sampleIterator++;
259  Event *event = std::get<0>(fData)[sampleIndex];
260  buffer[i] = static_cast<float>(event->GetWeight());
261  }
262 }
263 
264 //______________________________________________________________________________
265 template <>
267  IndexIterator_t sampleIterator, size_t batchSize)
268 {
269  const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
270  size_t n = inputMatrix.GetNcols();
271 
272  for (size_t i = 0; i < batchSize; i++) {
273  size_t sampleIndex = *sampleIterator;
274  for (size_t j = 0; j < n; j++) {
275  size_t bufferIndex = j * batchSize + i;
276  buffer[bufferIndex] = inputMatrix(sampleIndex, j);
277  }
278  sampleIterator++;
279  }
280 }
281 
282 //______________________________________________________________________________
283 template<>
285  TCudaHostBuffer<double> & buffer,
286  IndexIterator_t sampleIterator,
287  size_t batchSize)
288 {
289  const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
290  size_t n = outputMatrix.GetNcols();
291 
292  for (size_t i = 0; i < batchSize; i++) {
293  size_t sampleIndex = *sampleIterator;
294  for (size_t j = 0; j < n; j++) {
295  size_t bufferIndex = j * batchSize + i;
296  buffer[bufferIndex] = outputMatrix(sampleIndex, j);
297  }
298  sampleIterator++;
299  }
300 }
301 
302 //______________________________________________________________________________
303 template <>
305  IndexIterator_t sampleIterator, size_t batchSize)
306 {
307  const TMatrixT<Double_t> &weightMatrix = std::get<2>(fData);
308  for (size_t i = 0; i < batchSize; i++) {
309  buffer[i] = static_cast<double>(weightMatrix(*sampleIterator, 0));
310  sampleIterator++;
311  }
312 }
313 
314 //______________________________________________________________________________
315 template <>
317  size_t batchSize)
318 {
319  Event *event = std::get<0>(fData)[0];
320  size_t n = event->GetNVariables();
321  for (size_t i = 0; i < batchSize; i++) {
322  size_t sampleIndex = * sampleIterator++;
323  event = std::get<0>(fData)[sampleIndex];
324  for (size_t j = 0; j < n; j++) {
325  size_t bufferIndex = j * batchSize + i;
326  buffer[bufferIndex] = event->GetValue(j);
327  }
328  }
329 }
330 
331 //______________________________________________________________________________
332 template<>
334  TCudaHostBuffer<double> & buffer,
335  IndexIterator_t sampleIterator,
336  size_t batchSize)
337 {
338  const DataSetInfo &info = std::get<1>(fData);
339  size_t n = buffer.GetSize() / batchSize;
340 
341  // Copy target(s).
342 
343  for (size_t i = 0; i < batchSize; i++) {
344  size_t sampleIndex = *sampleIterator++;
345  Event *event = std::get<0>(fData)[sampleIndex];
346  for (size_t j = 0; j < n; j++) {
347  // Copy output matrices.
348  size_t bufferIndex = j * batchSize + i;
349  // Classification
350  if (event->GetNTargets() == 0) {
351  // Binary.
352  if (n == 1) {
353  buffer[bufferIndex] = (info.IsSignal(event)) ? 1.0 : 0.0;
354  } else {
355  // Multiclass.
356  buffer[bufferIndex] = 0.0;
357  if (j == event->GetClass()) {
358  buffer[bufferIndex] = 1.0;
359  }
360  }
361  } else {
362  buffer[bufferIndex] = event->GetTarget(j);
363  }
364  }
365  }
366 }
367 
368 //______________________________________________________________________________
369 template <>
371  IndexIterator_t sampleIterator, size_t batchSize)
372 {
373  for (size_t i = 0; i < batchSize; i++) {
374  size_t sampleIndex = *sampleIterator++;
375  Event *event = std::get<0>(fData)[sampleIndex];
376  buffer[i] = static_cast<double>(event->GetWeight());
377  }
378 }
379 
380 // Explicit Instantiations.
381 
382 template class TCudaDeviceBuffer<float>;
383 template class TCudaDeviceBuffer<double>;
384 
385 template class TCudaHostBuffer<float>;
386 template class TCudaHostBuffer<double>;
387 
392 
} // namespace DNN
} // namespace TMVA
std::shared_ptr< AFloat * > fHostPointer
Pointer to the buffer data.
Definition: CudaBuffers.h:48
typename std::vector< size_t >::iterator IndexIterator_t
Definition: DataLoader.h:42
Int_t GetNcols() const
Definition: TMatrixTBase.h:125
TCudaDeviceBuffer.
Definition: CudaBuffers.h:27
size_t GetSize() const
Definition: CudaBuffers.h:81
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:100
void CopyTo(const TCudaHostBuffer< AFloat > &) const
struct TMVA::DNN::TCudaHostBuffer::TDestructor fDestructor
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:47
TCudaHostBuffer.
Definition: CudaBuffers.h:41
Class that contains all the data information.
Definition: DataSetInfo.h:60
struct TMVA::DNN::TCudaDeviceBuffer::TDestructor fDestructor
void CopyFrom(const TCudaHostBuffer< AFloat > &) const
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:68
void operator()(AFloat **devicePointer)
Definition: CudaBuffers.cxx:30
TDataLoader.
Definition: DataLoader.h:79
std::shared_ptr< AFloat * > fDevicePointer
Pointer to the buffer data.
Definition: CudaBuffers.h:103
Abstract ClassifierFactory template that handles arbitrary types.
TCudaDeviceBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Bool_t IsSignal(const Event *ev) const
TCudaHostBuffer GetSubBuffer(size_t offset, size_t size)
Return sub-buffer of the current buffer.
Definition: CudaBuffers.cxx:55
const Int_t n
Definition: legend1.C:16
size_t fOffset
Offset for sub-buffers.
Definition: CudaBuffers.h:45
cudaStream_t fComputeStream
cudaStream for data transfer
Definition: CudaBuffers.h:102