17#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
18#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
30template<
typename AFloat>
36 unsigned long long int*
address_as_ull = (
unsigned long long int*)address;
54template<
typename AFloat>
65 if ((
blockDim.y > 512) && (i < 512)) {
72 if ((
blockDim.y > 256) && (i < 256)) {
78 if ((
blockDim.y > 128) && (i < 128)) {
133template<
typename AFloat>
206template<
typename AFloat>
255template<
typename AFloat>
306template<
typename AFloat>
308 const AFloat * theta,
315 if ((i <
m) && (
j <
n))
320template<
typename AFloat>
329 if ((i <
m) && (
j <
n))
334template<
typename AFloat>
342 if ((i <
m) && (
j <
n)) {
348template<
typename AFloat>
356 if ((i <
m) && (
j <
n)) {
362template<
typename AFloat>
370 if ((i <
m) && (
j <
n)) {
376template<
typename AFloat>
384 if ((i <
m) && (
j <
n)) {
390template<
typename AFloat>
398 if ((i <
m) && (
j <
n)) {
407template<
typename AFloat>
409 int m,
int n, AFloat alpha, AFloat eps)
415 if ((i <
m) && (
j <
n)) {
421template<
typename AFloat>
423 int m,
int n, AFloat beta)
429 if ((i <
m) && (
j <
n)) {
435template<
typename AFloat>
437 int m,
int n, AFloat beta)
443 if ((i <
m) && (
j <
n)) {
449template<
typename AFloat>
457 if ((i <
m) && (
j <
n))
462template<
typename AFloat>
470 if ((i <
m) && (
j <
n)) {
472 A[
index] = (
x < 0.0) ? 0.0 :
x;
477template<
typename AFloat>
479 const AFloat * A,
int m,
int n)
485 if ((i <
m) && (
j <
n)) {
487 B[
index] = (
x < 0.0) ? 0.0 : 1.0;
492template<
typename AFloat>
500 if ((i <
m) && (
j <
n)) {
501 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
507template<
typename AFloat>
516 if ((i <
m) && (
j <
n)) {
517 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
523template<
typename AFloat>
532 if ((i <
m) && (
j <
n)) {
533 AFloat sig = 1.0 / (1.0 + exp(-A[
index]));
534 B[
index] = sig * (1.0 - sig);
539template<
typename AFloat>
548 for (
int j = 0;
j <
n;
j++) {
549 sum += exp(A[i +
j *
n]);
551 for (
int j = 0;
j <
n;
j++) {
552 B[i +
j *
n] = exp(A[i *
n +
j]) /
sum;
558template<
typename AFloat>
566 if ((i <
m) && (
j <
n)) {
567 AFloat t = ::tanh(A[
index]);
573template<
typename AFloat>
582 if ((i <
m) && (
j <
n)) {
583 AFloat t = ::tanh(A[
index]);
589template<
typename AFloat>
597 if ((i <
m) && (
j <
n)) {
603template<
typename AFloat>
612 if ((i <
m) && (
j <
n)) {
618template<
typename AFloat>
626 if ((i <
m) && (
j <
n)) {
633template<
typename AFloat>
642 if ((i <
m) && (
j <
n)) {
643 AFloat
x = 1.0 + fabs(A[
index]);
649template<
typename AFloat>
657 if ((i <
m) && (
j <
n)) {
664template<
typename AFloat>
673 if ((i <
m) && (
j <
n)) {
675 B[
index] = - 2.0 *
x * exp(-
x *
x);
680template<
typename AFloat>
684 const AFloat * weights,
694 if ((i <
m) && (
j <
n)) {
695 AFloat
w = weights[i];
696 AFloat norm = 1 / ((AFloat) (
m *
n));
706template<
typename AFloat>
718 if ((i <
m) && (
j <
n)) {
728template<
typename AFloat>
740 if ((i <
m) && (
j <
n)) {
749template<
typename AFloat>
753 const AFloat * weights,
760 if ((i <
m) && (
j <
n)) {
766template<
typename AFloat>
776 if ((i <
m) && (
j <
n)) {
777 AFloat
sign = (B[
index] < 0.0) ? -1.0 : 1.0;
783template<
typename AFloat>
793 if ((i <
m) && (
j <
n)) {
799template<
typename AFloat>
803 const AFloat * weights,
813 if ((i <
m) && (
j <
n)) {
814 AFloat norm = 1 / ((AFloat) (
m *
n));
816 AFloat
lr = std::log(1. + exp(-
x));
817 if (
x < -75.)
lr = -
x;
818 else if (
x > 75.)
lr = exp(-
x);
830template<
typename AFloat>
834 const AFloat * weights,
841 if ((i <
m) && (
j <
n)) {
842 AFloat norm = 1 / ((AFloat) (
m *
n));
845 dY[
index] = weights[i] * norm * (sig -
y);
850template<
typename AFloat>
854 const AFloat * weights,
861 AFloat norm = 1.0 / ((AFloat)
m);
866 for (
int j = 0;
j <
n;
j++) {
869 for (
int j = 0;
j <
n;
j++) {
881template<
typename AFloat>
885 const AFloat * weights,
889 AFloat norm = 1.0 / ((AFloat)
m);
894 for (
int j = 0;
j <
n;
j++) {
898 for (
int j = 0;
j <
n;
j++) {
900 dY[i +
j *
m] *= weights[i] * norm;
906template<
typename AFloat>
917 if ((i <
m) && (
j <
n))
926template<
typename AFloat>
939 if ((i <
m) && (
j <
n)) {
948template<
typename AFloat>
954 if (i >=
m ||
j >=
n)
return;
963template<
typename AFloat>
972 if ((i <
m) && (
j <
n)) {
1001template<
typename AFloat>
1058template<
typename AFloat>
1063 int strideRows,
int strideCols)
1088 if (row >=
height || col >=
width || col < 0 || row < 0)
continue;
1101template<
typename AFloat>
1110 int jump = filterHeight * filterWidth;
1117template<
typename AFloat>
1124 A[i +
j *
nRows] += B[i];
1127template<
typename AFloat>
1135 for (
size_t event = 0;
event < batchSize;
event++) {
1141template<
typename AFloat>
1170template<
typename AFloat>
1190template<
typename AFloat>
1224template<
typename AFloat>
1242template<
typename AFloat>
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void input
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t height
static constexpr int BlockSize
__global__ void SymmetricRelu(AFloat *A, int m, int n)
__global__ void UpdateWeights(AFloat *A, const AFloat **B, int batchSize, int nRows, int nCols)
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
Calculate the dimension of an output volume, given the sliding parameters and the input shape.
__global__ void SigmoidDerivative(AFloat *B, const AFloat *A, int m, int n)
__device__ AFloat AtomicAdd(AFloat *address, AFloat val)
__global__ void Dropout(AFloat *A, int m, int n, AFloat dropoutProbability, curandState_t *state)
__global__ void SoftmaxCrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void SumColumns(AFloat *B, const AFloat *A, int m, int n)
__global__ void IdentityDerivative(AFloat *A, int m, int n)
__global__ void SqrtElementWise(AFloat *A, int m, int n)
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
__global__ void SoftmaxCrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void AddL1RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__device__ void ReduceSumVertical(AFloat *result, AFloat *sdata, int n)
__global__ void MeanSquaredErrorGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void Relu(AFloat *A, int m, int n)
__global__ void ReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void AbsoluteSum(AFloat *result, const AFloat *A, int m, int n)
__global__ void AddL2RegularizationGradients(AFloat *A, const AFloat *B, AFloat weightDecay, int m, int n)
__device__ AFloat max(AFloat x, AFloat y)
__global__ void AddRowWise(AFloat *W, const AFloat *theta, int m, int n)
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
__global__ void GaussDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Deflatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Deflatten a 2D-array into an array of 2D-arrays.
__global__ void Flatten(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
Flatten an array of 2D-arrays into a single 2D-array.
__global__ void CrossEntropy(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void Softmax(AFloat *B, const AFloat *A, int m, int n)
__global__ void RotateWeights(AFloat *A, const AFloat *B, int filterDepth, int filterHeight, int filterWidth, int numFilters)
__global__ void TanhDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void CrossEntropyGradients(AFloat *dY, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void ReduceMatrix(AFloat *result, const AFloat *A, int m, int n)
__global__ void Im2Col(AFloat *A, const AFloat *B, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols, int zeroPaddingHeight, int zeroPaddingWidth)
A kernel that re-arranges image regions of the input matrix \B, into column vectors in matrix \A.
__global__ void DeflattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
__global__ void SymmetricReluDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void MeanSquaredError(AFloat *result, const AFloat *Y, const AFloat *output, const AFloat *weights, int m, int n)
__global__ void SquareElementWise(AFloat *A, int m, int n)
__global__ void SoftSignDerivative(AFloat *B, const AFloat *A, int m, int n)
__global__ void Reshape(AFloat *A, const AFloat *B, int nRowsA, int nColsA, int nRowsB, int nColsB)
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
__global__ void AlmostEquals(bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n)
__global__ void FlattenRM(AFloat *A, const AFloat *B, int size, int nRows, int nCols)
__global__ void SquaredSum(AFloat *result, const AFloat *A, int m, int n)
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
__device__ void ReduceSum(AFloat *result, AFloat *sdata)
__global__ void MaxPoolBackward(AFloat *activationGradientsBackward, const AFloat *activationGradients, const AFloat *indexMatrix, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Back-propagate the gradients through a max-pooling layer.
__global__ void Downsample(AFloat *output, AFloat *indexMatrix, const AFloat *input, int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void AddBiases(AFloat *A, const AFloat *B, int nRows, int nCols)
std::shared_ptr< std::function< double(double)> > Tanh
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
std::shared_ptr< std::function< double(double)> > Gauss
std::shared_ptr< std::function< double(double)> > Sigmoid
std::shared_ptr< std::function< double(double)> > SoftSign
create variable transformations
static uint64_t sum(uint64_t i)