#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
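
// Headers required by the kernels in this file (reconstructed; the exact set of
// includes in the original header may differ): the TDevice helper provides the
// BlockSize constant used for shared-memory buffers, and curand_kernel.h
// provides curandState_t / curand_uniform used by the Dropout kernel.
#include "TMVA/DNN/Architectures/Cuda/Device.h"
#include "cuda.h"
#include "curand_kernel.h"
#include "math.h"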

template<typename AFloat>
__device__ AFloat AtomicAdd(AFloat* address, AFloat val);

template<>
__device__ double AtomicAdd(double* address, double val)
{
   unsigned long long int* address_as_ull = (unsigned long long int*) address;
   unsigned long long int old = *address_as_ull, assumed;
   do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val + __longlong_as_double(assumed)));
   } while (assumed != old);
   return __longlong_as_double(old);
}

template<>
__device__ float AtomicAdd(float* address, float val)
{
   return atomicAdd(address, val);
}
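
// Note: the double overload emulates an atomic add with an atomicCAS loop,
// since hardware atomicAdd for double precision is only available on devices
// of compute capability 6.0 and higher; the float overload simply forwards to
// the native intrinsic.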

template<typename AFloat>
__device__ void ReduceSumVertical(AFloat *result,
                                  AFloat * sdata,
                                  int n)
{
   // i, j are the row and column of this thread inside the block.
   int i = threadIdx.y;
   int j = threadIdx.x;
   int index = i * blockDim.x + j;

   __syncthreads();
   if ((blockDim.y > 512) && (i < 512)) {
      if ((i + 512) < blockDim.y) {
         sdata[index] += sdata[index + 512 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 256) && (i < 256)) {
      if ((i + 256) < blockDim.y) {
         sdata[index] += sdata[index + 256 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 128) && (i < 128)) {
      if ((i + 128) < blockDim.y) {
         sdata[index] += sdata[index + 128 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 64) && (i < 64)) {
      if ((i + 64) < blockDim.y) {
         sdata[index] += sdata[index + 64 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 32) && (i < 32)) {
      if ((i + 32) < blockDim.y) {
         sdata[index] += sdata[index + 32 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 16) && (i < 16)) {
      if ((i + 16) < blockDim.y) {
         sdata[index] += sdata[index + 16 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 8) && (i < 8)) {
      if ((i + 8) < blockDim.y) {
         sdata[index] += sdata[index + 8 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 4) && (i < 4)) {
      if ((i + 4) < blockDim.y) {
         sdata[index] += sdata[index + 4 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 2) && (i < 2)) {
      if ((i + 2) < blockDim.y) {
         sdata[index] += sdata[index + 2 * blockDim.x];
      }
   }

   __syncthreads();
   if ((blockDim.y > 1) && (i < 1)) {
      if ((i + 1) < blockDim.y) {
         sdata[index] += sdata[index + 1 * blockDim.x];
      }
   }

   // Row 0 now holds the column sums of the block; accumulate them into the
   // global result.
   __syncthreads();
   if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) {
      AtomicAdd(result + j, sdata[index]);
   }
   __syncthreads();
}

template<typename AFloat>
__device__ void ReduceSum(AFloat *result, AFloat * sdata)
{
   int tid = threadIdx.x + threadIdx.y * blockDim.x;
   int blockSize = blockDim.x * blockDim.y;

   // Tree reduction over the flattened thread block: halve the active range
   // each step (512, 256, ..., 1).
   for (int stride = 512; stride >= 1; stride /= 2) {
      __syncthreads();
      if ((blockSize > stride) && (tid < stride) && ((tid + stride) < blockSize)) {
         sdata[tid] += sdata[tid + stride];
      }
   }

   // Thread 0 accumulates the block sum into the global result.
   __syncthreads();
   if (tid == 0) {
      AtomicAdd(result, sdata[0]);
   }
}
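
// Usage note (assumption, not stated in the original header): both reduction
// helpers accumulate per-block partial sums into global memory with AtomicAdd,
// so the memory `result` points to has to be zero-initialized before the
// calling kernel is launched; otherwise sums accumulate across successive calls.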

template<typename AFloat>
__device__ AFloat max(AFloat x, AFloat y)
{
   return (x > y) ? x : y;
}

/// Calculate the dimension of an output volume, given the sliding parameters
/// and the input shape.
__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
{
   return ((imgDim - fltDim + 2 * padding) / stride) + 1;
}
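
// Example (illustrative): a 32 x 32 input convolved with a 5 x 5 filter, stride 1
// and zero padding 2 keeps its spatial size, since ((32 - 5 + 2 * 2) / 1) + 1 = 32.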

/// A kernel that re-arranges image regions of the input matrix B into column
/// vectors in matrix A.
template<typename AFloat>
__global__ void Im2Col(AFloat * A, const AFloat * B,
                       int depth, int imgHeight, int imgWidth,
                       int fltHeight, int fltWidth,
                       int strideRows, int strideCols,
                       int zeroPaddingHeight, int zeroPaddingWidth)
{
   // Row and column of the output matrix A written by this thread.
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // Number of columns and rows of the output matrix A.
   int NLocalViewPixels = fltHeight * fltWidth * depth;
   int NLocalViews = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols) *
                     calculateDimension(imgHeight, fltHeight, zeroPaddingHeight, strideRows);

   if (i >= NLocalViews || j >= NLocalViewPixels) return;

   int index = j * NLocalViews + i;

   int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols);

   // Slice, row and column of the input matrix B read by this thread.
   int bz = j / (fltHeight * fltWidth);
   int by = (i / numSlidesPerRow) * strideRows - zeroPaddingHeight + (j - bz * fltHeight * fltWidth) / fltWidth;
   int bx = (i % numSlidesPerRow) * strideCols - zeroPaddingWidth + (j - bz * fltHeight * fltWidth) % fltWidth;

   if (bx < 0 || by < 0 || bx >= imgWidth || by >= imgHeight) {
      // The referenced pixel lies in the zero padding.
      A[index] = 0;
   } else {
      A[index] = B[(bx + by * imgWidth) * depth + bz];
   }
}
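
// Shape note (derived from the kernel above): A is an NLocalViews x NLocalViewPixels
// matrix stored column-major, so the convolution can subsequently be carried out
// as a single matrix multiplication between A and the filter matrix.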

template<typename AFloat>
__global__ void AddRowWise(AFloat * W,
                           const AFloat * theta,
                           int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n))
      W[index] += theta[j];
}
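
// Convention note: matrices are stored column-major with m rows, so the flat
// element index used by the element-wise kernels below is index = j * m + i
// for row i and column j.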

template<typename AFloat>
__global__ void Hadamard(AFloat * B,
                         const AFloat * A,
                         int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n))
      B[index] *= A[index];
}

template<typename AFloat>
__global__ void ConstAdd(AFloat * A, AFloat beta, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = A[index] + beta;
   }
}

template<typename AFloat>
__global__ void ConstMult(AFloat * A, AFloat beta, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = A[index] * beta;
   }
}

template<typename AFloat>
__global__ void ReciprocalElementWise(AFloat * A, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = 1.0 / A[index];
   }
}

template<typename AFloat>
__global__ void SquareElementWise(AFloat * A, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = A[index] * A[index];
   }
}

template<typename AFloat>
__global__ void SqrtElementWise(AFloat * A, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = sqrt(A[index]);
   }
}

/// optimizer kernel functions

template<typename AFloat>
__global__ void AdamUpdate(AFloat * A,
                           const AFloat * M,
                           const AFloat * V,
                           int m, int n, AFloat alpha, AFloat eps)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = A[index] - alpha * M[index] / (sqrt(V[index]) + eps);
   }
}
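
// The update above is the Adam step theta <- theta - alpha * m / (sqrt(v) + eps),
// with M and V holding the first- and second-moment estimates maintained by the
// two kernels below; any bias correction is assumed to be applied by the
// host-side optimizer (an assumption, not stated in this header).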

template<typename AFloat>
__global__ void AdamUpdateFirstMom(AFloat * A, const AFloat * B,
                                   int m, int n, AFloat beta)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = beta * A[index] + (1.0 - beta) * B[index];
   }
}

template<typename AFloat>
__global__ void AdamUpdateSecondMom(AFloat * A, const AFloat * B,
                                    int m, int n, AFloat beta)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = beta * A[index] + (1.0 - beta) * B[index] * B[index];
   }
}

template<typename AFloat>
__global__ void IdentityDerivative(AFloat * A,
                                   int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n))
      A[index] = 1.0;
}

template<typename AFloat>
__global__ void Relu(AFloat * A,
                     int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      A[index] = (x < 0.0) ? 0.0 : x;
   }
}

template<typename AFloat>
__global__ void ReluDerivative(AFloat * B,
                               const AFloat * A,
                               int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      B[index] = (x < 0.0) ? 0.0 : 1.0;
   }
}

template<typename AFloat>
__global__ void Sigmoid(AFloat * A,
                        int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
      A[index] = sig;
   }
}

template<typename AFloat>
__global__ void Sigmoid(AFloat * B,
                        const AFloat * A,
                        int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
      B[index] = sig;
   }
}

template<typename AFloat>
__global__ void SigmoidDerivative(AFloat * B,
                                  const AFloat * A,
                                  int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat sig = 1.0 / (1.0 + exp(-A[index]));
      B[index] = sig * (1.0 - sig);
   }
}
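
// The derivative uses the identity sigma'(x) = sigma(x) * (1 - sigma(x)), so the
// sigmoid is re-evaluated from the input matrix A rather than read from a stored
// activation.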

template<typename AFloat>
__global__ void Softmax(AFloat * B,
                        const AFloat * A,
                        int m, int n)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;

   if (i < m) {
      AFloat sum = 0.0;
      for (int j = 0; j < n; j++) {
         sum += exp(A[i + j * n]);
      }
      for (int j = 0; j < n; j++) {
         B[i + j * n] = exp(A[i * n + j]) / sum;
      }
   }
}

template<typename AFloat>
__global__ void Tanh(AFloat * A,
                     int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat t = ::tanh(A[index]);
      A[index] = t;
   }
}

template<typename AFloat>
__global__ void TanhDerivative(AFloat * B,
                               const AFloat * A,
                               int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat t = ::tanh(A[index]);
      B[index] = 1 - t * t;
   }
}

template<typename AFloat>
__global__ void SymmetricRelu(AFloat * A,
                              int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] = abs(A[index]);
   }
}

template<typename AFloat>
__global__ void SymmetricReluDerivative(AFloat * B,
                                        const AFloat * A,
                                        int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      B[index] = (A[index] < 0.0) ? -1.0 : 1.0;
   }
}

template<typename AFloat>
__global__ void SoftSign(AFloat * A,
                         int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      A[index] = x / (1.0 + fabs(x));
   }
}

template<typename AFloat>
__global__ void SoftSignDerivative(AFloat * B,
                                   const AFloat * A,
                                   int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = 1.0 + fabs(A[index]);
      B[index] = 1.0 / (x * x);
   }
}

template<typename AFloat>
__global__ void Gauss(AFloat * A,
                      int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      A[index] = exp(- x * x);
   }
}

template<typename AFloat>
__global__ void GaussDerivative(AFloat * B,
                                const AFloat * A,
                                int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat x = A[index];
      B[index] = - 2.0 * x * exp(- x * x);
   }
}

template<typename AFloat>
__global__ void MeanSquaredError(AFloat * result,
                                 const AFloat * Y,
                                 const AFloat * output,
                                 const AFloat * weights,
                                 int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      AFloat w    = weights[i];
      AFloat norm = 1 / ((AFloat) (m * n));
      AFloat e    = Y[index] - output[index];
      sdata[tid]  = w * norm * e * e;
   } else {
      sdata[tid] = 0.0;
   }

   ReduceSum(result, sdata);
}
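
// Pattern note: each loss kernel fills one shared-memory slot per thread (zero
// for out-of-range threads) and then calls ReduceSum, so `result` ends up
// holding the sum of all per-block partial sums.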

template<typename AFloat>
__global__ void SquaredSum(AFloat * result, const AFloat * A, int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;
   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      AFloat e = A[index];
      sdata[tid] = e * e;
   } else {
      sdata[tid] = 0.0;
   }
   ReduceSum(result, sdata);
}

template<typename AFloat>
__global__ void AbsoluteSum(AFloat * result, const AFloat * A, int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;
   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      sdata[tid] = abs(A[index]);
   } else {
      sdata[tid] = 0.0;
   }
   ReduceSum(result, sdata);
}

template<typename AFloat>
__global__ void MeanSquaredErrorGradients(AFloat * dY, const AFloat * Y, const AFloat * output,
                                          const AFloat * weights, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat norm = 1 / ((AFloat) (m * n));
      dY[index] = 2.0 * weights[i] * norm * (output[index] - Y[index]);
   }
}

template<typename AFloat>
__global__ void AddL1RegularizationGradients(AFloat * A, const AFloat * B,
                                             AFloat weightDecay, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat sign = (B[index] < 0.0) ? -1.0 : 1.0;
      A[index] += sign * weightDecay;
   }
}

template<typename AFloat>
__global__ void AddL2RegularizationGradients(AFloat * A, const AFloat * B,
                                             AFloat weightDecay, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      A[index] += 2.0 * weightDecay * B[index];
   }
}

template<typename AFloat>
__global__ void CrossEntropy(AFloat * result,
                             const AFloat * Y,
                             const AFloat * output,
                             const AFloat * weights,
                             int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = blockDim.x * threadIdx.y + threadIdx.x;
   int index = j * m + i;

   __shared__ AFloat sdata[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      AFloat norm = 1 / ((AFloat) (m * n));
      AFloat sig  = 1.0 / (1.0 + exp(-output[index]));
      if (Y[index] == 0.0)
         sdata[tid] = -weights[i] * norm * log(1.0 - sig);
      else if (Y[index] == 1.0)
         sdata[tid] = -weights[i] * norm * log(sig);
      else {
         AFloat ce = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig);
         sdata[tid] = -weights[i] * norm * ce;
      }
   } else {
      sdata[tid] = 0.0;
   }

   ReduceSum(result, sdata);
}
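
// The summed quantity is the weighted binary cross entropy
//    -weights[i] / (m * n) * ( y * log(sig) + (1 - y) * log(1 - sig) ),
// with the y == 0 and y == 1 cases handled separately, presumably to avoid
// evaluating 0 * log(0) terms that would otherwise produce NaNs.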

template<typename AFloat>
__global__ void CrossEntropyGradients(AFloat * dY,
                                      const AFloat * Y,
                                      const AFloat * output,
                                      const AFloat * weights,
                                      int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int index = j * m + i;

   if ((i < m) && (j < n)) {
      AFloat norm = 1 / ((AFloat) (m * n));
      AFloat y    = Y[index];
      AFloat sig  = 1.0 / (1.0 + exp(-output[index]));
      dY[index] = weights[i] * norm * (sig - y);
   }
}

template<typename AFloat>
__global__ void SoftmaxCrossEntropy(AFloat * result,
                                    const AFloat * Y,
                                    const AFloat * output,
                                    const AFloat * weights,
                                    int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int tid = threadIdx.y;

   __shared__ AFloat sdata[TDevice::BlockSize];
   AFloat norm = 1.0 / ((AFloat) m);

   sdata[tid] = 0.0;
   if (i < m) {
      AFloat sum = 0.0;
      for (int j = 0; j < n; j++) {
         sum += exp(output[i + j * m]);
      }
      for (int j = 0; j < n; j++) {
         sdata[tid] += Y[i + j * m] * log(exp(output[i + j * m]) / sum);
      }
      sdata[tid] *= -weights[i] * norm;
   }

   ReduceSum(result, sdata);
}

template<typename AFloat>
__global__ void SoftmaxCrossEntropyGradients(AFloat * dY,
                                             const AFloat * Y,
                                             const AFloat * output,
                                             const AFloat * weights,
                                             int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   AFloat norm = 1.0 / ((AFloat) m);

   if (i < m) {
      AFloat sum  = 0.0;
      AFloat sumY = 0.0;
      for (int j = 0; j < n; j++) {
         sum  += exp(output[i + j * m]);
         sumY += Y[i + j * m];
      }
      for (int j = 0; j < n; j++) {
         dY[i + j * m]  = sumY * exp(output[i + j * m]) / sum - Y[i + j * m];
         dY[i + j * m] *= weights[i] * norm;
      }
   }
}

template<typename AFloat>
__global__ void ReduceMatrix(AFloat *result,
                             const AFloat *A,
                             int m, int n)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = threadIdx.y * blockDim.x + threadIdx.x;
   int index = j * m + i;

   __shared__ AFloat smem[TDevice::BlockSize];
   if ((i < m) && (j < n))
      smem[tid] = A[index];
   else
      smem[tid] = 0.0;

   ReduceSum(result, smem);
}

template<typename AFloat>
__global__ void SumColumns(AFloat *B,
                           const AFloat *A,
                           int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   int matrixIndex = j * m + i;
   int blockIndex  = blockDim.x * threadIdx.y + threadIdx.x;

   __shared__ AFloat smem[TDevice::BlockSize];

   if ((i < m) && (j < n)) {
      smem[blockIndex] = A[matrixIndex];
   } else {
      smem[blockIndex] = 0.0;
   }

   ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n);
}

template<typename AFloat>
__global__ void AlmostEquals(bool * result, const AFloat * A, const AFloat * B,
                             double epsilon, int m, int n)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   if (i >= m || j >= n) return;

   int matrixIndex = j * m + i;

   // Concurrent writes are benign: every writer stores the same value (false).
   if (fabs(A[matrixIndex] - B[matrixIndex]) > epsilon) result[0] = false;
}

template<typename AFloat>
__global__ void Dropout(AFloat *A,
                        int m, int n,
                        AFloat dropoutProbability,
                        curandState_t *state)
{
   int i   = blockDim.y * blockIdx.y + threadIdx.y;
   int j   = blockDim.x * blockIdx.x + threadIdx.x;
   int tid = i * gridDim.x + j;

   if ((i < m) && (j < n)) {
      float r = curand_uniform(state + tid);
      if (r > dropoutProbability) {
         A[j * m + i] = 0.0;
      } else {
         A[j * m + i] /= dropoutProbability;
      }
   }
}
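
// Semantics note (a reading of the kernel above): dropoutProbability acts as the
// probability of keeping a unit; dropped entries are zeroed and the surviving
// entries are rescaled by 1 / dropoutProbability ("inverted dropout"), so no
// rescaling is needed at inference time.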

/// Downsampling kernel used as the forward propagation step of a Max-Pooling layer.
template<typename AFloat>
__global__ void Downsample(AFloat * output, AFloat * indexMatrix, const AFloat * input,
                           int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth,
                           int strideRows, int strideCols)
{
   // The row of the output matrix (the slice of the image).
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   // The column of the output matrix (the local view).
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   // Number of local views; pooling layers use no zero padding.
   int NLocalViews = calculateDimension(imgWidth, fltWidth, 0, strideCols) *
                     calculateDimension(imgHeight, fltHeight, 0, strideRows);

   if (i >= depth || j >= NLocalViews) return;

   int outputIndex = j * depth + i;
   int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, 0, strideCols);

   int rowMin = (j / numSlidesPerRow) * strideRows; // First row of the pooling window.
   int colMin = (j % numSlidesPerRow) * strideCols; // First column of the pooling window.
   int bz = i;                                      // Slice of the input read by this thread.

   AFloat value = 0;
   AFloat maxIndex = 0;
   bool first = true; // The first element always initializes the running maximum.

   for (size_t by = rowMin; by < rowMin + fltHeight; by++) {
      for (size_t bx = colMin; bx < colMin + fltWidth; bx++) {
         int inputIndex = (bx + by * imgWidth) * depth + bz;
         if (first || input[inputIndex] > value) {
            first = false;
            maxIndex = bx + by * imgWidth;
            value = input[inputIndex];
         }
      }
   }
   indexMatrix[outputIndex] = maxIndex;
   output[outputIndex] = value;
}
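
// indexMatrix records, for every pooled output element, the flattened spatial
// position (bx + by * imgWidth) of the winning input pixel; MaxPoolBackward
// below uses it to route gradients back only to those winners.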

/// Back-propagate the gradients through a max-pooling layer.
template<typename AFloat>
__global__ void MaxPoolBackward(AFloat * activationGradientsBackward,
                                const AFloat * activationGradients,
                                const AFloat * indexMatrix,
                                int depth, int imgHeight, int imgWidth,
                                int fltHeight, int fltWidth,
                                int strideRows, int strideCols)
{
   int slice = blockDim.y * blockIdx.y + threadIdx.y; // row of the gradient matrices.
   int j     = blockDim.x * blockIdx.x + threadIdx.x; // column of the gradient matrices.

   if (slice >= depth || j >= imgHeight * imgWidth) return;

   // Spatial dimensions of the pooled output.
   int height = calculateDimension(imgHeight, fltHeight, 0, strideRows);
   int width  = calculateDimension(imgWidth, fltWidth, 0, strideCols);

   // The pixel of the backward (input) image this thread is responsible for.
   int backRow = j % imgHeight;
   int backCol = j / imgHeight;

   // First pooling window, in output coordinates, that can contain this pixel.
   int nextRowMin = floor((backRow - fltHeight) / (AFloat) strideRows) + 1;
   int nextColMin = floor((backCol - fltWidth) / (AFloat) strideCols) + 1;

   int outputIndex = 0;
   AFloat grad = 0;

   // Visit every window containing the pixel and add the incoming gradient
   // whenever this pixel was the maximum of that window.
   for (int row = nextRowMin; row <= nextRowMin + fltHeight - strideRows; row++) {
      for (int col = nextColMin; col <= nextColMin + fltWidth - strideCols; col++) {
         if (row >= height || col >= width || col < 0 || row < 0) continue;

         outputIndex = (row * width + col) * depth + slice;

         if (indexMatrix[outputIndex] == backCol + backRow * imgWidth) {
            grad += activationGradients[outputIndex];
         }
      }
   }
   activationGradientsBackward[(backCol + backRow * imgWidth) * depth + slice] = grad;
}

template<typename AFloat>
__global__ void RotateWeights(AFloat * A,
                              const AFloat * B,
                              int filterDepth,
                              int filterHeight,
                              int filterWidth,
                              int numFilters)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   if (i >= numFilters || j > filterDepth * filterHeight * filterWidth) return;

   int jump = filterHeight * filterWidth;
   int row  = j / jump;
   int col  = i * jump + jump - j % jump - 1;

   A[col * filterDepth + row] = B[j * numFilters + i];
}

template<typename AFloat>
__global__ void AddBiases(AFloat * A,
                          const AFloat * B,
                          int nRows,
                          int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   if (i >= nRows || j >= nCols) return;

   A[i + j * nRows] += B[i];
}

template<typename AFloat>
__global__ void UpdateWeights(AFloat * A,
                              const AFloat ** B,
                              int batchSize,
                              int nRows,
                              int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   if (i >= nRows || j >= nCols) return;

   for (size_t event = 0; event < batchSize; event++) {
      size_t index = i * nCols + j;
      A[index] += B[event][index];
   }
}

template<typename AFloat>
__global__ void Reshape(AFloat * A, const AFloat * B,
                        int nRowsA, int nColsA, int nRowsB, int nColsB)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;
   if (i >= nRowsA || j >= nColsA) return;

   // Address of the element in the column-major output matrix A.
   size_t indexA = j * nRowsA + i;

   // Linear element number when the matrix is traversed row-wise, and the
   // position of that element in the column-major input matrix B.
   size_t nElem = i * nColsA + j;
   size_t indexB = (nElem % nColsB) * nRowsB + nElem / nColsB;

   A[indexA] = B[indexB];
}

/// Flatten an array of 2D-arrays into a single 2D-array.
template<typename AFloat>
__global__ void Flatten(AFloat * A,
                        const AFloat * B,
                        int size,
                        int nRows,
                        int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   int nColsA = nRows * nCols;
   if (i >= size || j >= nColsA) return;

   // Matrix i, row `row` and column `col` of the input tensor B (column-major
   // within each matrix).
   int row = j / nCols;
   int col = j % nCols;
   AFloat element = B[ i * nColsA + col * nRows + row ];

   A[j * size + i] = element;
}

template<typename AFloat>
__global__ void FlattenRM(AFloat * A,
                          const AFloat * B,
                          int size,
                          int nRows,
                          int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   int nColsA = nRows * nCols;
   if (i >= size || j >= nColsA) return;

   // Same as Flatten, but each matrix of B is stored row-major.
   int row = j / nCols;
   int col = j % nCols;
   AFloat element = B[ i * nColsA + row * nCols + col ];

   A[j * size + i] = element;
}
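
// The RM ("row-major") variants differ from Flatten/Deflatten only in how the
// per-event matrix is traversed: elements are addressed with row * nCols + col
// instead of col * nRows + row, matching tensors whose per-event storage is
// row-major.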

/// Deflatten a 2D-array into an array of 2D-arrays.
template<typename AFloat>
__global__ void Deflatten(AFloat * A,
                          const AFloat * B,
                          int size,
                          int nRows,
                          int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   int nColsB = nRows * nCols;
   if (i >= size || j >= nColsB) return;

   AFloat element = B[j * size + i];

   int row = j / nCols;
   int col = j % nCols;
   A[ i * nColsB + col * nRows + row ] = element;
}

template<typename AFloat>
__global__ void DeflattenRM(AFloat * A,
                            const AFloat * B,
                            int size,
                            int nRows,
                            int nCols)
{
   int i = blockDim.y * blockIdx.y + threadIdx.y;
   int j = blockDim.x * blockIdx.x + threadIdx.x;

   int nColsB = nRows * nCols;
   if (i >= size || j >= nColsB) return;

   AFloat element = B[j * size + i];

   int row = j / nCols;
   int col = j % nCols;
   A[ i * nColsB + row * nCols + col ] = element;
}
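
// Launch sketch (illustrative only; the names deviceA, blockDimX and blockDimY
// and the grid computation are assumptions, not part of this header). The
// element-wise kernels above expect a 2D launch covering an m x n column-major
// matrix, with x spanning columns and y spanning rows:
//
//    dim3 block(blockDimX, blockDimY);
//    dim3 grid((n + block.x - 1) / block.x, (m + block.y - 1) / block.y);
//    Relu<<<grid, block>>>(deviceA, m, n);
//
// In TMVA this configuration is normally produced by the TDevice helpers
// declared in the Device.h header included above.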