CudaBuffers.cxx
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 07/08/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

////////////////////////////////////////////////////////////////////////
// Implementation of device and host buffers for CUDA architectures.   //
////////////////////////////////////////////////////////////////////////

#include "TMVA/DNN/DataLoader.h"
#include "TMVA/DNN/Architectures/Cuda/CudaBuffers.h"
#include "cuda_runtime.h"
#include <iostream>

namespace TMVA {
namespace DNN {

//
// TCudaHostBuffer
//______________________________________________________________________________
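/// Destructor functor for the pinned host memory: releases the page-locked
/// allocation and the heap-allocated pointer holder.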
template<typename AFloat>
void TCudaHostBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
{
   cudaFreeHost(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
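/// Construct a host buffer of the given size. The memory is allocated with
/// cudaMallocHost (page-locked), so that transfers to the device can be
/// performed asynchronously.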
template<typename AFloat>
TCudaHostBuffer<AFloat>::TCudaHostBuffer(size_t size)
    : fOffset(0), fComputeStream(0), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMallocHost(pointer, size * sizeof(AFloat));
   fHostPointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
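/// Implicit conversion to a raw pointer into the pinned host memory,
/// shifted by the sub-buffer offset.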
template<typename AFloat>
TCudaHostBuffer<AFloat>::operator AFloat * () const
{
   return *fHostPointer + fOffset;
}

//______________________________________________________________________________
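/// Return a sub-buffer that shares ownership of the underlying allocation;
/// only the offset is adjusted, no memory is copied. The size argument is
/// not needed for host buffers and is therefore ignored.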
template<typename AFloat>
TCudaHostBuffer<AFloat> TCudaHostBuffer<AFloat>::GetSubBuffer(size_t offset,
                                                              size_t /*size*/)
{
   TCudaHostBuffer buffer = *this;
   buffer.fOffset = offset;
   return buffer;
}

//
// TCudaDeviceBuffer
//______________________________________________________________________________
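/// Destructor functor for the device memory: releases the allocation with
/// cudaFree and deletes the heap-allocated pointer holder.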
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::TDestructor::operator()(AFloat ** devicePointer)
{
   cudaFree(*devicePointer);
   delete[] devicePointer;
}

//______________________________________________________________________________
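/// Construct a device buffer of the given size and create a dedicated CUDA
/// stream on which transfers involving this buffer are scheduled.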
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size)
    : fOffset(0), fSize(size), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
   cudaStreamCreate(&fComputeStream);
}

//______________________________________________________________________________
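/// Construct a device buffer of the given size that uses an existing CUDA
/// stream for its data transfers instead of creating its own.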
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(size_t size,
                                             cudaStream_t stream)
    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   cudaMalloc(pointer, size * sizeof(AFloat));
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
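/// Wrap an already allocated device pointer. The buffer takes shared
/// ownership: the memory is released with cudaFree once the last copy of
/// the buffer goes out of scope.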
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::TCudaDeviceBuffer(AFloat * devicePointer,
                                             size_t size,
                                             cudaStream_t stream)
    : fOffset(0), fSize(size), fComputeStream(stream), fDestructor()
{
   AFloat ** pointer = new AFloat * [1];
   *pointer       = devicePointer;
   fDevicePointer = std::shared_ptr<AFloat *>(pointer, fDestructor);
}

//______________________________________________________________________________
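/// Return a sub-buffer of the given size starting at the given offset. The
/// sub-buffer shares ownership of the device memory and the compute stream;
/// no memory is copied.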
template<typename AFloat>
TCudaDeviceBuffer<AFloat> TCudaDeviceBuffer<AFloat>::GetSubBuffer(size_t offset,
                                                                  size_t size)
{
   TCudaDeviceBuffer buffer = *this;
   buffer.fOffset = offset;
   buffer.fSize   = size;
   return buffer;
}

//______________________________________________________________________________
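/// Implicit conversion to a raw device pointer, shifted by the sub-buffer
/// offset.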
template<typename AFloat>
TCudaDeviceBuffer<AFloat>::operator AFloat * () const
{
   return *fDevicePointer + fOffset;
}

//______________________________________________________________________________
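/// Copy the contents of the given host buffer to the device. The compute
/// stream is synchronized first, then the transfer is enqueued asynchronously
/// on that stream.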
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyFrom(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaStreamSynchronize(fComputeStream);
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
                   cudaMemcpyHostToDevice, fComputeStream);
}

//______________________________________________________________________________
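/// Copy the contents of the device buffer into the given host buffer. The
/// transfer is asynchronous, so the compute stream has to be synchronized
/// before the host data is read.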
template<typename AFloat>
void TCudaDeviceBuffer<AFloat>::CopyTo(const TCudaHostBuffer<AFloat> &buffer) const
{
   cudaMemcpyAsync(*this, buffer, fSize * sizeof(AFloat),
                   cudaMemcpyDeviceToHost, fComputeStream);
   buffer.fComputeStream = fComputeStream;
}

//______________________________________________________________________________
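/// Copy input samples from the matrix-based data set into the host buffer.
/// Elements are packed in column-major order, bufferIndex = j * batchSize + i,
/// and narrowed from Double_t to float.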
template<>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyInput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(inputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
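/// Copy output (target) values from the matrix-based data set into the host
/// buffer, using the same column-major packing as CopyInput.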
template<>
void TDataLoader<MatrixInput_t, TCuda<float>>::CopyOutput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(outputMatrix(sampleIndex, j));
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
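/// Copy the input variables of the selected events from a TMVA data set into
/// the host buffer, again in column-major order.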
template<>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyInput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = event->GetNVariables();

   // Copy input variables.
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = static_cast<float>(event->GetValue(j));
      }
   }
}

//______________________________________________________________________________
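/// Copy the targets of the selected events into the host buffer. For
/// classification (no regression targets) a single column is filled with 1.0
/// for events of class 0 and 0.0 otherwise; for regression the target values
/// are copied directly.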
template<>
void TDataLoader<TMVAInput_t, TCuda<float>>::CopyOutput(
    TCudaHostBuffer<float> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();

   // Copy target(s).
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            // Classification: encode the class as a single 0/1 flag.
            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
         } else {
            // Regression: copy the target value.
            buffer[bufferIndex] = static_cast<float>(event->GetTarget(j));
         }
      }
   }
}

//______________________________________________________________________________
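/// The double-precision specializations below mirror the float versions
/// above; the only difference is that no narrowing cast is needed.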
template<>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyInput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &inputMatrix = std::get<0>(fData);
   size_t n = inputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = inputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<MatrixInput_t, TCuda<double>>::CopyOutput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   const TMatrixT<Double_t> &outputMatrix = std::get<1>(fData);
   size_t n = outputMatrix.GetNcols();

   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator;
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = outputMatrix(sampleIndex, j);
      }
      sampleIterator++;
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyInput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = event->GetNVariables();

   // Copy input variables.
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         buffer[bufferIndex] = event->GetValue(j);
      }
   }
}

//______________________________________________________________________________
template<>
void TDataLoader<TMVAInput_t, TCuda<double>>::CopyOutput(
    TCudaHostBuffer<double> & buffer,
    IndexIterator_t sampleIterator,
    size_t batchSize)
{
   Event * event = fData.front();
   size_t n = (event->GetNTargets() == 0) ? 1 : event->GetNTargets();

   // Copy target(s).
   for (size_t i = 0; i < batchSize; i++) {
      size_t sampleIndex = *sampleIterator++;
      event = fData[sampleIndex];
      for (size_t j = 0; j < n; j++) {
         size_t bufferIndex = j * batchSize + i;
         if (event->GetNTargets() == 0) {
            // Classification: encode the class as a single 0/1 flag.
            buffer[bufferIndex] = (event->GetClass() == 0) ? 1.0 : 0.0;
         } else {
            // Regression: copy the target value.
            buffer[bufferIndex] = event->GetTarget(j);
         }
      }
   }
}

// Explicit Instantiations.

template class TCudaDeviceBuffer<float>;
template class TCudaDeviceBuffer<double>;

template class TCudaHostBuffer<float>;
template class TCudaHostBuffer<double>;

template class TDataLoader<MatrixInput_t, TCuda<float>>;
template class TDataLoader<TMVAInput_t, TCuda<float>>;
template class TDataLoader<MatrixInput_t, TCuda<double>>;
template class TDataLoader<TMVAInput_t, TCuda<double>>;

} // namespace DNN
} // namespace TMVA
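
// Usage sketch (illustration only, not part of the original source): a
// typical round trip through the buffers defined above. The names host,
// device and n are placeholders.
//
//    TMVA::DNN::TCudaHostBuffer<float>   host(n);
//    for (size_t i = 0; i < n; i++) host[i] = 0.0f;  // fill pinned host memory
//    TMVA::DNN::TCudaDeviceBuffer<float> device(n);  // allocates memory and a stream
//    device.CopyFrom(host);                          // asynchronous H2D transfer
//    // ... run kernels that read from / write to the device buffer ...
//    device.CopyTo(host);                            // asynchronous D2H transfer
//    cudaDeviceSynchronize();                        // wait before reading host data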