ROOT 6.10/09 Reference Guide
CudaBuffers.cxx
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 07/08/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

////////////////////////////////////////////////////////////////////////
// Implementation of device and host buffers for CUDA architectures.  //
////////////////////////////////////////////////////////////////////////

#include "TMVA/DNN/DataLoader.h"
#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"
#include "TMVA/DNN/Architectures/Cuda/CudaMatrix.h"
#include "cuda_runtime.h"
#include <iostream>

namespace TMVA {
namespace DNN  {

//
// TCudaHostBuffer
//______________________________________________________________________________
template<typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size)
    : fOffset(0), fSize(size), fComputeStream(0), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMallocHost(pointer, size * sizeof(AFloat));
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
template<typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat * () const
{
   return *fHostPointer + fOffset;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset,
                                                              size_t size)
{
   TCudaHostBuffer buffer = *this;
   buffer.fOffset         = offset;
   buffer.fSize           = size;
   return buffer;
}

//
// TCudaDeviceBuffer
//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size)
    : fOffset(0), fSize(size), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   cudaStreamCreate(&fComputeStream);
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size,
                                             cudaStream_t stream)
    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat * devicePointer,
                                             size_t size,
                                             cudaStream_t stream)
    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   *pointer          = devicePointer;
   fDevicePointer    = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset,
                                                                  size_t size)
{
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset           = offset;
   buffer.fSize             = size;
   return buffer;
}

//______________________________________________________________________________
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat * () const
{
   return *fDevicePointer + fOffset;
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaStreamSynchronize(fComputeStream);
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
                   cudaMemcpyHostToDevice, fComputeStream);
}

//______________________________________________________________________________
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
                   cudaMemcpyDeviceToHost, fComputeStream);
   buffer.fComputeStream = fComputeStream;
}

//______________________________________________________________________________
template<>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}
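
// Note on the indexing above: bufferIndex = j * batchSize + i lays the batch
// out in column-major order, so the batchSize values of input variable j are
// contiguous in the host buffer (consistent with the column-major storage
// convention used by cuBLAS on the device). For example, with batchSize = 2
// and n = 3 variables, sample i = 1 of variable j = 2 lands at index
// 2 * 2 + 1 = 5, the last of the 6 buffer elements.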

//______________________________________________________________________________
template<>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = event->GetNVariables();

   // Copy input variables.

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = buffer.GetSize() / batchSize;

   // Copy target(s).

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary.
               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}
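
// Note on the target encoding above (the double-precision specialization
// below is identical): events without regression targets are treated as
// classification data. A single output column is set to 1.0 for class 0 and
// 0.0 otherwise; with several output columns the class index is one-hot
// encoded, e.g. for n = 3 classes an event of class 2 produces {0, 0, 1}.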

//______________________________________________________________________________
template<>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = event->GetNVariables();

   // Copy input variables.

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = buffer.GetSize() / batchSize;

   // Copy target(s).

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         // Classification
         if (event->GetNTargets() == 0) {
            if (n == 1) {
               // Binary.
               buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
            } else {
               // Multiclass.
               buffer[bufferIndex] = 0.0;
               if (j == event->GetClass()) {
                  buffer[bufferIndex] = 1.0;
               }
            }
         } else {
            // Regression.
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}

// Explicit Instantiations.

template class TCudaDeviceBuffer<float>;
template class TCudaDeviceBuffer<double>;

template class TCudaHostBuffer<float>;
template class TCudaHostBuffer<double>;

template class TDataLoader<MatrixInput_t, TCuda<float>>;
template class TDataLoader<TMVAInput_t,   TCuda<float>>;
template class TDataLoader<MatrixInput_t, TCuda<double>>;
template class TDataLoader<TMVAInput_t,   TCuda<double>>;

} // namespace DNN
} // namespace TMVA
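
The classes above give TMVA's CUDA backend reference-counted wrappers around pinned host memory (cudaMallocHost) and device memory (cudaMalloc) that carry a cudaStream_t, so host/device transfers can be queued asynchronously. A minimal usage sketch follows; it relies only on the interface exercised in this file (size constructors, element access on the host buffer, GetSubBuffer, CopyFrom/CopyTo). The function name, buffer size, and header path are illustrative assumptions, not part of the original source.

#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"  // assumed header path
#include "cuda_runtime.h"

void BufferRoundTrip()
{
   using TMVA::DNN::TCudaHostBuffer;
   using TMVA::DNN::TCudaDeviceBuffer;

   const size_t size = 1024;

   // Pinned host memory, filled on the CPU.
   TCudaHostBuffer<float> host(size);
   for (size_t i = 0; i < size; i++) {
      host[i] = static_cast<float>(i);
   }

   // Device allocation with its own compute stream; CopyFrom queues an
   // asynchronous host-to-device transfer on that stream.
   TCudaDeviceBuffer<float> device(size);
   device.CopyFrom(host);

   // Sub-buffers are views into the same allocation: only the offset and
   // size change, no new device memory is allocated.
   TCudaDeviceBuffer<float> firstHalf = device.GetSubBuffer(0, size / 2);
   (void) firstHalf;

   // Asynchronous device-to-host copy back into the pinned buffer, then
   // block until all queued work has finished.
   device.CopyTo(host);
   cudaDeviceSynchronize();
}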