Functions | |
template<typename AFloat > | |
__global__ void | AbsoluteSum (AFloat *result, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | AdamUpdate (AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps) |
optimizer kernel functions | |
template<typename AFloat > | |
__global__ void | AdamUpdateFirstMom (AFloat *A, const AFloat *B, int m, int n, AFloat beta) |
template<typename AFloat > | |
__global__ void | AdamUpdateSecondMom (AFloat *A, const AFloat *B, int m, int n, AFloat beta) |
template<typename AFloat > | |
__global__ void | AddBiases (AFloat *A, const AFloat *B, int nRows, int nCols) |
template<typename AFloat > | |
__global__ void | AddL1RegularizationGradients (AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n) |
template<typename AFloat > | |
__global__ void | AddL2RegularizationGradients (AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n) |
template<typename AFloat > | |
__global__ void | AddRowWise (AFloat *W, const AFloat *theta, int m, int n) |
template<typename AFloat > | |
__global__ void | AlmostEquals (bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n) |
template<typename AFloat > | |
__device__ AFloat | AtomicAdd (AFloat *address, AFloat val) |
template<> | |
__device__ double | AtomicAdd (double *address, double val) |
template<> | |
__device__ float | AtomicAdd (float *address, float val) |
__device__ int | calculateDimension (int imgDim, int fltDim, int padding, int stride) |
Calculate the dimension of an output volume, given the sliding parameters and the input shape. | |
template<typename AFloat > | |
__global__ void | ConstAdd (AFloat *A, AFloat beta, int m, int n) |
template<typename AFloat > | |
__global__ void | ConstMult (AFloat *A, AFloat beta, int m, int n) |
template<typename AFloat > | |
__global__ void | CrossEntropy (AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | CrossEntropyGradients (AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | Deflatten (AFloat *A, const AFloat *B, int size, int nRows, int nCols) |
Deflatten a 2D-array into an array of 2D-arrays. | |
template<typename AFloat > | |
__global__ void | DeflattenRM (AFloat *A, const AFloat *B, int size, int nRows, int nCols) |
template<typename AFloat > | |
__global__ void | Downsample (AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols) |
Downsampling kernel used as the forward propagation step of a Max-Pooling layer. | |
template<typename AFloat > | |
__global__ void | Dropout (AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state) |
template<typename AFloat > | |
__global__ void | Flatten (AFloat *A, const AFloat *B, int size, int nRows, int nCols) |
Flatten an array of 2D-arrays into a single 2D-array. | |
template<typename AFloat > | |
__global__ void | FlattenRM (AFloat *A, const AFloat *B, int size, int nRows, int nCols) |
template<typename AFloat > | |
__global__ void | Gauss (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | GaussDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Hadamard (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | IdentityDerivative (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Im2Col (AFloat *A, const AFloat *B, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols, int zeroPaddingHeight, int zeroPaddingWidth) |
A kernel that re-arranges image regions of the input matrix B into column vectors in matrix A. | |
template<typename AFloat > | |
__device__ AFloat | max (AFloat x, AFloat y) |
template<typename AFloat > | |
__global__ void | MaxPoolBackward (AFloat *activationGradientsBackward, const AFloat *activationGradients, const AFloat *indexMatrix, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols) |
Back-propagate the gradients through a max-pooling layer. | |
template<typename AFloat > | |
__global__ void | MeanSquaredError (AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | MeanSquaredErrorGradients (AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | ReciprocalElementWise (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | ReduceMatrix (AFloat *result, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__device__ void | ReduceSum (AFloat *result, AFloat *sdata) |
template<typename AFloat > | |
__device__ void | ReduceSumVertical (AFloat *result, AFloat *sdata, int n) |
template<typename AFloat > | |
__global__ void | Relu (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | ReluDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Reshape (AFloat *A, const AFloat *B, int nRowsA, int nColsA, int nRowsB, int nColsB) |
template<typename AFloat > | |
__global__ void | RotateWeights (AFloat *A, const AFloat *B, int filterDepth, int filterHeight, int filterWidth, int numFilters) |
template<typename AFloat > | |
__global__ void | Sigmoid (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Sigmoid (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SigmoidDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Softmax (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SoftmaxCrossEntropy (AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | SoftmaxCrossEntropyGradients (AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n) |
template<typename AFloat > | |
__global__ void | SoftSign (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SoftSignDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SqrtElementWise (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SquaredSum (AFloat *result, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SquareElementWise (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SumColumns (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SymmetricRelu (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | SymmetricReluDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | Tanh (AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | TanhDerivative (AFloat *B, const AFloat *A, int m, int n) |
template<typename AFloat > | |
__global__ void | UpdateWeights (AFloat *A, const AFloat **B, int batchSize, int nRows, int nCols) |
__global__ void TMVA::DNN::Cuda::AbsoluteSum | ( | AFloat * | result, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 729 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AdamUpdate | ( | AFloat * | A, |
const AFloat * | M, | ||
const AFloat * | V, | ||
int | m, | ||
int | n, | ||
AFloat | alpha, | ||
AFloat | eps | ||
) |
optimizer kernel functions
Definition at line 408 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AdamUpdateFirstMom | ( | AFloat * | A, |
const AFloat * | B, | ||
int | m, | ||
int | n, | ||
AFloat | beta | ||
) |
Definition at line 422 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AdamUpdateSecondMom | ( | AFloat * | A, |
const AFloat * | B, | ||
int | m, | ||
int | n, | ||
AFloat | beta | ||
) |
Definition at line 436 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AddBiases | ( | AFloat * | A, |
const AFloat * | B, | ||
int | nRows, | ||
int | nCols | ||
) |
Definition at line 1118 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AddL1RegularizationGradients | ( | AFloat * | A, |
const AFloat * | B, | ||
AFloat | weightDecay, | ||
int | m, | ||
int | n | ||
) |
Definition at line 767 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AddL2RegularizationGradients | ( | AFloat * | A, |
const AFloat * | B, | ||
AFloat | weightDecay, | ||
int | m, | ||
int | n | ||
) |
Definition at line 784 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AddRowWise | ( | AFloat * | W, |
const AFloat * | theta, | ||
int | m, | ||
int | n | ||
) |
Definition at line 307 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::AlmostEquals | ( | bool * | result, |
const AFloat * | A, | ||
const AFloat * | B, | ||
double | epsilon, | ||
int | m, | ||
int | n | ||
) |
Definition at line 949 of file Kernels.cuh.
__device__ AFloat TMVA::DNN::Cuda::AtomicAdd | ( | AFloat * | address, |
AFloat | val | ||
) |
Definition at line 34 of file Kernels.cuh.
__device__ float TMVA::DNN::Cuda::AtomicAdd | ( | float * | address, |
float | val | ||
) |
Definition at line 48 of file Kernels.cuh.
__device__ int TMVA::DNN::Cuda::calculateDimension | ( | int | imgDim, |
int | fltDim, | ||
int | padding, | ||
int | stride | ||
) |
Calculate the dimension of an output volume, given the sliding parameters and the input shape.
[in] | imgDim | The size of the input tensor in a spatial dimension. |
[in] | fltDim | The size of the sliding filter in the same dimension. |
[in] | padding | Number of zeroes to pad the input with. |
[in] | stride | Number of pixels the kernel is sliding in each iteration. |
Note that no checks are performed to assert validity of the input parameters. We are allowed to assume them valid because those checks have already been performed prior to the invocation of the kernel.
Definition at line 226 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::ConstAdd | ( | AFloat * | A, |
AFloat | beta, | ||
int | m, | ||
int | n | ||
) |
Definition at line 335 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::ConstMult | ( | AFloat * | A, |
AFloat | beta, | ||
int | m, | ||
int | n | ||
) |
Definition at line 349 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::CrossEntropy | ( | AFloat * | result, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 800 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::CrossEntropyGradients | ( | AFloat * | dY, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 831 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Deflatten | ( | AFloat * | A, |
const AFloat * | B, | ||
int | size, | ||
int | nRows, | ||
int | nCols | ||
) |
Deflatten a 2D-array into an array of 2D-arrays.
[out] | A | Output array of 2D-arrays, each of which is column-major. |
[in] | B | Input 2D-array to be split into size parts. |
[in] | size | Number of 2D-arrays in the output. |
[in] | nRows | Number of rows in each matrix of the output. |
[in] | nCols | Number of columns on each matrix of the output. |
A is a pointer to `size` raw TCudaMatrix pointers. Each of those will contain elements saved in column-major order. However, the concatenation is performed row-wise. Each thread writes a single output element by locating the appropriate input index.
Definition at line 1225 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::DeflattenRM | ( | AFloat * | A, |
const AFloat * | B, | ||
int | size, | ||
int | nRows, | ||
int | nCols | ||
) |
Definition at line 1243 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Downsample | ( | AFloat * | output, |
AFloat * | indexMatrix, | ||
const AFloat * | input, | ||
int | depth, | ||
int | imgHeight, | ||
int | imgWidth, | ||
int | fltHeight, | ||
int | fltWidth, | ||
int | strideRows, | ||
int | strideCols | ||
) |
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
[out] | output | The output matrix. Each row corresponds to a slice and each element is the max within a receptive field. |
[out] | indexMatrix | The winning indices matrix. Each element is the index of the max element. |
[in] | input | The input matrix. Each row is a slice. |
[in] | imgHeight | The height of the input. |
[in] | imgWidth | The width of the input. |
[in] | fltHeight | Height of the kernel. |
[in] | fltWidth | Width of the kernel. |
[in] | strideRows | stride size in the horizontal dimension. |
[in] | strideCols | stride size in the vertical dimension. |
Each output element is the maximum of the receptive field. The caller launches one thread per output element in order to eliminate shared write access.
Definition at line 1002 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Dropout | ( | AFloat * | A, |
int | m, | ||
int | n, | ||
AFloat | dropoutProbability, | ||
curandState_t * | state | ||
) |
Definition at line 964 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Flatten | ( | AFloat * | A, |
const AFloat * | B, | ||
int | size, | ||
int | nRows, | ||
int | nCols | ||
) |
Flatten an array of 2D-arrays into a single 2D-array.
[out] | A | Output 2D-array saved in column major order. |
[in] | B | Input array of 2D-arrays. Each element is a matrix to be concatenated. |
[in] | size | Number of 2D-arrays in the input. |
[in] | nRows | Number of rows in each matrix of the input. |
[in] | nCols | Number of columns on each matrix of the input. |
B is a pointer to `size` raw TCudaMatrix pointers. Each of those contains elements saved in column-major order. However, the concatenation is performed row-wise. Each thread writes a single output element by locating the appropriate input index.
Definition at line 1171 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::FlattenRM | ( | AFloat * | A, |
const AFloat * | B, | ||
int | size, | ||
int | nRows, | ||
int | nCols | ||
) |
Definition at line 1191 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Gauss | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 650 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::GaussDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 665 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Hadamard | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 321 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::IdentityDerivative | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 450 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Im2Col | ( | AFloat * | A, |
const AFloat * | B, | ||
int | depth, | ||
int | imgHeight, | ||
int | imgWidth, | ||
int | fltHeight, | ||
int | fltWidth, | ||
int | strideRows, | ||
int | strideCols, | ||
int | zeroPaddingHeight, | ||
int | zeroPaddingWidth | ||
) |
A kernel that re-arranges image regions of the input matrix B into column vectors in matrix A.
[out] | A | The output matrix. Each row corresponds to a receptive field. |
[in] | B | The input matrix. Each row corresponds to a row in the image view. |
[in] | depth | The depth of the input tensor. |
[in] | imgHeight | The height of the input tensor. |
[in] | imgWidth | The width of the input tensor. |
[in] | fltHeight | Height of the filter. |
[in] | fltWidth | Width of the filter. |
[in] | strideRows | stride size in the horizontal dimension. |
[in] | strideCols | stride size in the vertical dimension. |
[in] | zeroPaddingHeight | The padding in the horizontal dimension. |
[in] | zeroPaddingWidth | The padding in the vertical dimension. |
The kernel should be invoked with one thread per output element. Note that matrices A and B have different shapes. Each thread in this kernel is responsible for filling one cell of the output matrix A. It does so by computing the correct element to copy from the input matrix B. We therefore never need to block. When reading this kernel it is important to keep in mind that TCudaMatrix objects are saved in column-major order for compatibility with cuBLAS.
Definition at line 256 of file Kernels.cuh.
__device__ AFloat TMVA::DNN::Cuda::max | ( | AFloat | x, |
AFloat | y | ||
) |
Definition at line 207 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::MaxPoolBackward | ( | AFloat * | activationGradientsBackward, |
const AFloat * | activationGradients, | ||
const AFloat * | indexMatrix, | ||
int | depth, | ||
int | imgHeight, | ||
int | imgWidth, | ||
int | fltHeight, | ||
int | fltWidth, | ||
int | strideRows, | ||
int | strideCols | ||
) |
Back-propagate the gradients through a max-pooling layer.
[out] | activationGradientsBackward | The gradients to be written. One gradient for each neuron at the layer's input. |
[in] | activationGradients | The gradients coming from the next layer. One gradient for each receptive field. |
[in] | indexMatrix | Winning indices. One index for each receptive field. |
[in] | depth | The depth of the input tensor. |
[in] | imgHeight | The height of the input tensor. |
[in] | imgWidth | The width of the input tensor. |
[in] | fltHeight | Height of the filter. |
[in] | fltWidth | Width of the filter. |
[in] | strideRows | stride size in the horizontal dimension. |
[in] | strideCols | stride size in the vertical dimension. |
Definition at line 1059 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::MeanSquaredError | ( | AFloat * | result, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 681 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::MeanSquaredErrorGradients | ( | AFloat * | dY, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 750 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::ReciprocalElementWise | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 363 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::ReduceMatrix | ( | AFloat * | result, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 907 of file Kernels.cuh.
__device__ void TMVA::DNN::Cuda::ReduceSum | ( | AFloat * | result, |
AFloat * | sdata | ||
) |
Definition at line 134 of file Kernels.cuh.
__device__ void TMVA::DNN::Cuda::ReduceSumVertical | ( | AFloat * | result, |
AFloat * | sdata, | ||
int | n | ||
) |
Definition at line 55 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Relu | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 463 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::ReluDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 478 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Reshape | ( | AFloat * | A, |
const AFloat * | B, | ||
int | nRowsA, | ||
int | nColsA, | ||
int | nRowsB, | ||
int | nColsB | ||
) |
Definition at line 1142 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::RotateWeights | ( | AFloat * | A, |
const AFloat * | B, | ||
int | filterDepth, | ||
int | filterHeight, | ||
int | filterWidth, | ||
int | numFilters | ||
) |
Definition at line 1102 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Sigmoid | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 493 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Sigmoid | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 508 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SigmoidDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 524 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Softmax | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 540 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SoftmaxCrossEntropy | ( | AFloat * | result, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 851 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SoftmaxCrossEntropyGradients | ( | AFloat * | dY, |
const AFloat * | Y, | ||
const AFloat * | output, | ||
const AFloat * | weights, | ||
int | m, | ||
int | n | ||
) |
Definition at line 882 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SoftSign | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 619 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SoftSignDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 634 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SqrtElementWise | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 391 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SquaredSum | ( | AFloat * | result, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 707 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SquareElementWise | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 377 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SumColumns | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 927 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SymmetricRelu | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 590 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::SymmetricReluDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 604 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::Tanh | ( | AFloat * | A, |
int | m, | ||
int | n | ||
) |
Definition at line 559 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::TanhDerivative | ( | AFloat * | B, |
const AFloat * | A, | ||
int | m, | ||
int | n | ||
) |
Definition at line 574 of file Kernels.cuh.
__global__ void TMVA::DNN::Cuda::UpdateWeights | ( | AFloat * | A, |
const AFloat ** | B, | ||
int | batchSize, | ||
int | nRows, | ||
int | nCols | ||
) |
Definition at line 1128 of file Kernels.cuh.