ROOT Reference Guide
Kernels.cuh
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Simon Pfreundschuh 13/07/16
3
4/*************************************************************************
5 * Copyright (C) 2016, Simon Pfreundschuh *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12/////////////////////////////////////////////////////////////////////////
13// Implementation of the device kernels for the CUDA implementation of //
14// the low-level interface. //
15/////////////////////////////////////////////////////////////////////////
16
17#ifndef TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
18#define TMVA_DNN_ARCHITECTURES_CUDA_KERNELS
19
20#include "TMVA/DNN/Architectures/Cuda/Device.h"
21#include "curand_kernel.h"
22#include "cuda.h"
23#include "math.h"
24
25namespace TMVA {
26namespace DNN {
27namespace Cuda {
28
29//____________________________________________________________________________
30template<typename AFloat>
31__device__ AFloat AtomicAdd(AFloat* address, AFloat val);
32
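// Specialization for double: devices of compute capability below 6.0 have no native
// double-precision atomicAdd, so the addition is emulated with the atomicCAS loop
// recommended by the CUDA programming guide.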
33template<>
34__device__ double AtomicAdd(double* address, double val)
35{
36 unsigned long long int* address_as_ull = (unsigned long long int*)address;
37 unsigned long long int old = *address_as_ull, assumed;
38 do {
39 assumed = old;
40 old = atomicCAS(address_as_ull, assumed,
41 __double_as_longlong(val +
42 __longlong_as_double(assumed)));
43 } while (assumed != old);
44 return __longlong_as_double(old);
45}
46
47template<>
48__device__ float AtomicAdd(float* address, float val)
49{
50 return atomicAdd(address, val);
51}
52
53//____________________________________________________________________________
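// Sums the columns of the blockDim.x x blockDim.y shared-memory tile sdata with an
// unrolled binary tree reduction: at each step the upper half of the active rows is
// folded onto the lower half, until row 0 holds the per-column partial sums, which are
// then accumulated atomically into result. Every thread of the block must reach this
// call, since it contains __syncthreads() barriers.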
54template<typename AFloat>
55__device__ void ReduceSumVertical(AFloat *result,
56 AFloat * sdata,
57 int n)
58{
59 // i,j are the row and column indices of the thread within its block.
60 int i = threadIdx.y;
61 int j = threadIdx.x;
62 int index = i * blockDim.x + j;
63
64 __syncthreads();
65 if ((blockDim.y > 512) && (i < 512)) {
66 if ((i + 512) < blockDim.y) {
67 sdata[index] += sdata[index + 512 * blockDim.x];
68 }
69 }
70
71 __syncthreads();
72 if ((blockDim.y > 256) && (i < 256)) {
73 if ((i + 256) < blockDim.y) {
74 sdata[index] += sdata[index + 256 * blockDim.x];
75 }
76 }
77 __syncthreads();
78 if ((blockDim.y > 128) && (i < 128)) {
79 if ((i + 128) < blockDim.y) {
80 sdata[index] += sdata[index + 128 * blockDim.x];
81 }
82 }
83 __syncthreads();
84 if ((blockDim.y > 64) && (i < 64)) {
85 if ((i + 64) < blockDim.y) {
86 sdata[index] += sdata[index + 64 * blockDim.x];
87 }
88 }
89 __syncthreads();
90 if ((blockDim.y > 32) && (i < 32)) {
91 if ((i + 32) < blockDim.y) {
92 sdata[index] += sdata[index + 32 * blockDim.x];
93 }
94 }
95 __syncthreads();
96 if ((blockDim.y > 16) && (i < 16)) {
97 if ((i + 16) < blockDim.y) {
98 sdata[index] += sdata[index + 16 * blockDim.x];
99 }
100 }
101 __syncthreads();
102 if ((blockDim.y > 8) && (i < 8)) {
103 if ((i + 8) < blockDim.y) {
104 sdata[index] += sdata[index + 8 * blockDim.x];
105 }
106 }
107 __syncthreads();
108 if ((blockDim.y > 4) && (i < 4)) {
109 if ((i + 4) < blockDim.y) {
110 sdata[index] += sdata[index + 4 * blockDim.x];
111 }
112 }
113 __syncthreads();
114 if ((blockDim.y > 2) && (i < 2)) {
115 if ((i + 2) < blockDim.y) {
116 sdata[index] += sdata[index + 2 * blockDim.x];
117 }
118 }
119 __syncthreads();
120 if ((blockDim.y > 1) && (i < 1)) {
121 if ((i + 1) < blockDim.y) {
122 sdata[index] += sdata[index + 1 * blockDim.x];
123 }
124 }
125 __syncthreads();
126 if ((i == 0) && ((blockIdx.x * blockDim.x + threadIdx.x) < n)) {
127 AtomicAdd(result + j, sdata[index]);
128 }
129 __syncthreads();
130}
131
132//____________________________________________________________________________
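// Reduces the TDevice::BlockSize shared-memory elements in sdata to a single value with
// the same unrolled tree reduction as above; thread 0 of the block then atomically adds
// the block-local sum to *result. As with ReduceSumVertical, every thread of the block
// must reach this call because of the __syncthreads() barriers.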
133template<typename AFloat>
134__device__ void ReduceSum(AFloat *result, AFloat * sdata)
135{
136 int tid = threadIdx.x + threadIdx.y * blockDim.x;
137
138 __syncthreads();
139 if ((TDevice::BlockSize > 512) && (tid < 512)) {
140 if ((tid + 512) < TDevice::BlockSize) {
141 sdata[tid] += sdata[tid + 512];
142 }
143 }
144
145 __syncthreads();
146 if ((TDevice::BlockSize > 256) && (tid < 256)) {
147 if ((tid + 256) < TDevice::BlockSize) {
148 sdata[tid] += sdata[tid + 256];
149 }
150 }
151 __syncthreads();
152 if ((TDevice::BlockSize > 128) && (tid < 128)) {
153 if ((tid + 128) < TDevice::BlockSize) {
154 sdata[tid] += sdata[tid + 128];
155 }
156 }
157 __syncthreads();
158 if ((TDevice::BlockSize > 64) && (tid < 64)) {
159 if ((tid + 64) < TDevice::BlockSize) {
160 sdata[tid] += sdata[tid + 64];
161 }
162 }
163 __syncthreads();
164 if ((TDevice::BlockSize > 32) && (tid < 32)) {
165 if ((tid + 32) < TDevice::BlockSize) {
166 sdata[tid] += sdata[tid + 32];
167 }
168 }
169 __syncthreads();
170 if ((TDevice::BlockSize > 16) && (tid < 16)) {
171 if ((tid + 16) < TDevice::BlockSize) {
172 sdata[tid] += sdata[tid + 16];
173 }
174 }
175 __syncthreads();
176 if ((TDevice::BlockSize > 8) && (tid < 8)) {
177 if ((tid + 8) < TDevice::BlockSize) {
178 sdata[tid] += sdata[tid + 8];
179 }
180 }
181 __syncthreads();
182 if ((TDevice::BlockSize > 4) && (tid < 4)) {
183 if ((tid + 4) < TDevice::BlockSize) {
184 sdata[tid] += sdata[tid + 4];
185 }
186 }
187 __syncthreads();
188 if ((TDevice::BlockSize > 2) && (tid < 2)) {
189 if ((tid + 2) < TDevice::BlockSize) {
190 sdata[tid] += sdata[tid + 2];
191 }
192 }
193 __syncthreads();
194 if ((TDevice::BlockSize > 1) && (tid < 1)) {
195 if ((tid + 1) < TDevice::BlockSize) {
196 sdata[tid] += sdata[tid + 1];
197 }
198 }
199 if (tid == 0) {
200 AtomicAdd(result, sdata[0]);
201 }
202
203 __syncthreads();
204}
205
206template<typename AFloat>
207__device__ AFloat max(AFloat x, AFloat y)
208{
209 if (x < y) return y;
210 return x;
211}
212
213////////////////////////////////////////////////////////////////////////////////////
214/// \brief Calculate the dimension of an output volume, given the sliding parameters
215/// and the input shape.
216/// \param[in] imgDim The size of the input tensor in a spatial dimension.
217/// \param[in] fltDim The size of the sliding filter in the same dimension.
218/// \param[in] padding Number of zeroes to pad the input with.
219/// \param[in] stride Number of pixels the kernel is sliding in each iteration.
220/// \returns The output dimension.
221///
222/// Note that no checks are performed to assert validity of the input parameters.
223/// We are allowed to assume them valid because those checks have already been
224/// performed prior to the invocation of the kernel.
225////////////////////////////////////////////////////////////////////////////////////
226__device__ int calculateDimension(int imgDim, int fltDim, int padding, int stride)
227{
228 // Parameters passed at this point are guaranteed to be valid - skip checks.
229 return ((imgDim - fltDim + 2 * padding) / stride) + 1;
230}
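// Worked example: imgDim = 32, fltDim = 5, padding = 2, stride = 1 gives
// ((32 - 5 + 2 * 2) / 1) + 1 = 32, i.e. a "same"-size output, while the same filter
// with stride 2 gives ((32 - 5 + 4) / 2) + 1 = 16 because of the integer division.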
231
232////////////////////////////////////////////////////////////////////////////////////
233/// \brief A kernel that re-arranges image regions of the input matrix B into
234/// rows of the output matrix A.
235///
236/// \param[out] A The output matrix. Each row corresponds to a receptive field.
237/// \param[in] B The input matrix. Each row corresponds to a row in the image view.
238/// \param[in] depth The depth of the input tensor.
239/// \param[in] imgHeight The height of the input tensor.
240/// \param[in] imgWidth The width of the input tensor.
241/// \param[in] fltHeight Height of the filter.
242/// \param[in] fltWidth Width of the filter.
243/// \param[in] strideRows Stride size along the rows (height) of the image.
244/// \param[in] strideCols Stride size along the columns (width) of the image.
245/// \param[in] zeroPaddingHeight Zero padding along the height of the image.
246/// \param[in] zeroPaddingWidth Zero padding along the width of the image.
247///
248/// The kernel should be invoked with one thread per output element. Note that
249/// matrices \A and \B have different shapes. Each thread in this kernel is
250/// responsible for filling one cell of the output matrix \A. It does so by computing
251/// the correct element to copy from the input matrix \B. We therefore never need to
252/// block. When reading this kernel it is important to keep in mind that TCudaMatrix
253/// objects are saved in column major order for compatibility with cuBLAS.
254////////////////////////////////////////////////////////////////////////////////////
255template<typename AFloat>
256__global__ void Im2Col(AFloat * A,
257 const AFloat * B,
258 int depth,
259 int imgHeight,
260 int imgWidth,
261 int fltHeight,
262 int fltWidth,
263 int strideRows,
264 int strideCols,
265 int zeroPaddingHeight,
266 int zeroPaddingWidth)
267{
268 // The row of the output matrix.
269 int i = blockDim.y * blockIdx.y + threadIdx.y;
270
271 // The column of the output matrix.
272 int j = blockDim.x * blockIdx.x + threadIdx.x;
273
274 // Number of columns in matrix A.
275 int NLocalViewPixels = fltHeight * fltWidth * depth;
276
277 // Number of rows in matrix A.
278 int NLocalViews = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols) *
279 calculateDimension(imgHeight, fltHeight, zeroPaddingHeight, strideRows);
280
281 if (i >= NLocalViews || j >= NLocalViewPixels) return;
282
283 int index = j * NLocalViews + i;
284
285 int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, zeroPaddingWidth, strideCols);
286
287 // Which image channel of B?
288 int bz = j / (fltHeight * fltWidth);
289
290 // Which row in matrix B?
291 int by = (i / numSlidesPerRow) * strideRows - zeroPaddingHeight + (j - bz * fltHeight * fltWidth) / fltWidth;
292
293 // Which column in matrix B?
294 int bx = (i % numSlidesPerRow) * strideCols - zeroPaddingWidth + (j - bz * fltHeight * fltWidth) % fltWidth;
295
296 if (bx < 0 || by < 0 || bx >= imgWidth || by >= imgHeight) {
297 // This is a padding element.
298 A[index] = 0;
299 }
300 else {
301 A[index] = B[(bx + by * imgWidth) * depth + bz];
302 }
303}
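// Launch sketch (illustrative only; the host-side variables below are assumptions, not
// part of this header). One thread is needed per element of A, i.e. per
// (local view, local view pixel) pair:
//
//    int nLocalViews      = outputHeight * outputWidth;   // from calculateDimension
//    int nLocalViewPixels = depth * fltHeight * fltWidth;
//    dim3 block(16, 16);                                   // any 2-D block shape works
//    dim3 grid((nLocalViewPixels + block.x - 1) / block.x,
//              (nLocalViews      + block.y - 1) / block.y);
//    Im2Col<<<grid, block>>>(A, B, depth, imgHeight, imgWidth, fltHeight, fltWidth,
//                            strideRows, strideCols, zeroPaddingHeight, zeroPaddingWidth);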
304
305//____________________________________________________________________________
306template<typename AFloat>
307__global__ void AddRowWise(AFloat * W,
308 const AFloat * theta,
309 int m, int n)
310{
311 int i = blockDim.y * blockIdx.y + threadIdx.y;
312 int j = blockDim.x * blockIdx.x + threadIdx.x;
313 int index = j * m + i;
314
315 if ((i < m) && (j < n))
316 W[index] += theta[j];
317}
318
319//____________________________________________________________________________
320template<typename AFloat>
321__global__ void Hadamard(AFloat * B,
322 const AFloat * A,
323 int m, int n)
324{
325 int i = blockDim.y * blockIdx.y + threadIdx.y;
326 int j = blockDim.x * blockIdx.x + threadIdx.x;
327 int index = j * m + i;
328
329 if ((i < m) && (j < n))
330 B[index] *= A[index];
331}
332
333//____________________________________________________________________________
334template<typename AFloat>
335__global__ void ConstAdd(AFloat * A, AFloat beta,
336 int m, int n)
337{
338 int i = blockDim.y * blockIdx.y + threadIdx.y;
339 int j = blockDim.x * blockIdx.x + threadIdx.x;
340 int index = j * m + i;
341
342 if ((i < m) && (j < n)) {
343 A[index] = A[index] + beta;
344 }
345}
346
347//____________________________________________________________________________
348template<typename AFloat>
349__global__ void ConstMult(AFloat * A, AFloat beta,
350 int m, int n)
351{
352 int i = blockDim.y * blockIdx.y + threadIdx.y;
353 int j = blockDim.x * blockIdx.x + threadIdx.x;
354 int index = j * m + i;
355
356 if ((i < m) && (j < n)) {
357 A[index] = A[index] * beta;
358 }
359}
360
361//____________________________________________________________________________
362template<typename AFloat>
363__global__ void ReciprocalElementWise(AFloat * A,
364 int m, int n)
365{
366 int i = blockDim.y * blockIdx.y + threadIdx.y;
367 int j = blockDim.x * blockIdx.x + threadIdx.x;
368 int index = j * m + i;
369
370 if ((i < m) && (j < n)) {
371 A[index] = 1.0 / A[index];
372 }
373}
374
375//____________________________________________________________________________
376template<typename AFloat>
377__global__ void SquareElementWise(AFloat * A,
378 int m, int n)
379{
380 int i = blockDim.y * blockIdx.y + threadIdx.y;
381 int j = blockDim.x * blockIdx.x + threadIdx.x;
382 int index = j * m + i;
383
384 if ((i < m) && (j < n)) {
385 A[index] = A[index] * A[index];
386 }
387}
388
389//____________________________________________________________________________
390template<typename AFloat>
391__global__ void SqrtElementWise(AFloat * A,
392 int m, int n)
393{
394 int i = blockDim.y * blockIdx.y + threadIdx.y;
395 int j = blockDim.x * blockIdx.x + threadIdx.x;
396 int index = j * m + i;
397
398 if ((i < m) && (j < n)) {
399 A[index] = sqrt(A[index]);
400 }
401}
402
403
404/// optimizer kernel functions
405
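// The three Adam kernels below apply the element-wise update rules
//    m <- beta1 * m + (1 - beta1) * g          (AdamUpdateFirstMom, beta = beta1)
//    v <- beta2 * v + (1 - beta2) * g * g      (AdamUpdateSecondMom, beta = beta2)
//    w <- w - alpha * m / (sqrt(v) + eps)      (AdamUpdate)
// where g is the current gradient and alpha and eps are supplied by the caller.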
406//____________________________________________________________________________
407template<typename AFloat>
408__global__ void AdamUpdate(AFloat * A, const AFloat * M, const AFloat * V,
409 int m, int n, AFloat alpha, AFloat eps)
410{
411 int i = blockDim.y * blockIdx.y + threadIdx.y;
412 int j = blockDim.x * blockIdx.x + threadIdx.x;
413 int index = j * m + i;
414
415 if ((i < m) && (j < n)) {
416 A[index] = A[index] - alpha * M[index]/( sqrt(V[index]) + eps);
417 }
418}
419
420//____________________________________________________________________________
421template<typename AFloat>
422__global__ void AdamUpdateFirstMom(AFloat * A, const AFloat * B,
423 int m, int n, AFloat beta)
424{
425 int i = blockDim.y * blockIdx.y + threadIdx.y;
426 int j = blockDim.x * blockIdx.x + threadIdx.x;
427 int index = j * m + i;
428
429 if ((i < m) && (j < n)) {
430 A[index] = beta * A[index] + (1.-beta) * B[index];
431 }
432}
433
434//____________________________________________________________________________
435template<typename AFloat>
436__global__ void AdamUpdateSecondMom(AFloat * A, const AFloat * B,
437 int m, int n, AFloat beta)
438{
439 int i = blockDim.y * blockIdx.y + threadIdx.y;
440 int j = blockDim.x * blockIdx.x + threadIdx.x;
441 int index = j * m + i;
442
443 if ((i < m) && (j < n)) {
444 A[index] = beta * A[index] + (1.-beta) * B[index] * B[index];
445 }
446}
447
448//____________________________________________________________________________
449template<typename AFloat>
450__global__ void IdentityDerivative(AFloat * A,
451 int m, int n)
452{
453 int i = blockDim.y * blockIdx.y + threadIdx.y;
454 int j = blockDim.x * blockIdx.x + threadIdx.x;
455 int index = j * m + i;
456
457 if ((i < m) && (j < n))
458 A[index] = 1.0;
459}
460
461//____________________________________________________________________________
462template<typename AFloat>
463__global__ void Relu(AFloat * A,
464 int m, int n)
465{
466 int i = blockDim.y * blockIdx.y + threadIdx.y;
467 int j = blockDim.x * blockIdx.x + threadIdx.x;
468 int index = j * m + i;
469
470 if ((i < m) && (j < n)) {
471 AFloat x = A[index];
472 A[index] = (x < 0.0) ? 0.0 : x;
473 }
474}
475
476//____________________________________________________________________________
477template<typename AFloat>
478__global__ void ReluDerivative(AFloat * B,
479 const AFloat * A, int m, int n)
480{
481 int i = blockDim.y * blockIdx.y + threadIdx.y;
482 int j = blockDim.x * blockIdx.x + threadIdx.x;
483 int index = j * m + i;
484
485 if ((i < m) && (j < n)) {
486 AFloat x = A[index];
487 B[index] = (x < 0.0) ? 0.0 : 1.0;
488 }
489}
490
491//____________________________________________________________________________
492template<typename AFloat>
493__global__ void Sigmoid(AFloat * A,
494 int m, int n)
495{
496 int i = blockDim.y * blockIdx.y + threadIdx.y;
497 int j = blockDim.x * blockIdx.x + threadIdx.x;
498 int index = j * m + i;
499
500 if ((i < m) && (j < n)) {
501 AFloat sig = 1.0 / (1.0 + exp(-A[index]));
502 A[index] = sig;
503 }
504}
505
506//____________________________________________________________________________
507template<typename AFloat>
508__global__ void Sigmoid(AFloat * B,
509 const AFloat * A,
510 int m, int n)
511{
512 int i = blockDim.y * blockIdx.y + threadIdx.y;
513 int j = blockDim.x * blockIdx.x + threadIdx.x;
514 int index = j * m + i;
515
516 if ((i < m) && (j < n)) {
517 AFloat sig = 1.0 / (1.0 + exp(-A[index]));
518 B[index] = sig;
519 }
520}
521
522//____________________________________________________________________________
523template<typename AFloat>
524__global__ void SigmoidDerivative(AFloat * B,
525 const AFloat * A,
526 int m, int n)
527{
528 int i = blockDim.y * blockIdx.y + threadIdx.y;
529 int j = blockDim.x * blockIdx.x + threadIdx.x;
530 int index = j * m + i;
531
532 if ((i < m) && (j < n)) {
533 AFloat sig = 1.0 / (1.0 + exp(-A[index]));
534 B[index] = sig * (1.0 - sig);
535 }
536}
537
538//____________________________________________________________________________
539template<typename AFloat>
540__global__ void Softmax(AFloat * B,
541 const AFloat * A,
542 int m, int n)
543{
544 int i = blockDim.x * blockIdx.x + threadIdx.x;
545
546 if (i < m) {
547 AFloat sum = 0.0;
548 for (int j = 0; j < n; j++) {
549 sum += exp(A[i + j * m]);
550 }
551 for (int j = 0; j < n; j++) {
552 B[i + j * m] = exp(A[i + j * m]) / sum;
553 }
554 }
555}
556
557//____________________________________________________________________________
558template<typename AFloat>
559__global__ void Tanh(AFloat * A,
560 int m, int n)
561{
562 int i = blockDim.y * blockIdx.y + threadIdx.y;
563 int j = blockDim.x * blockIdx.x + threadIdx.x;
564 int index = j * m + i;
565
566 if ((i < m) && (j < n)) {
567 AFloat t = ::tanh(A[index]);
568 A[index] = t;
569 }
570}
571
572//____________________________________________________________________________
573template<typename AFloat>
574__global__ void TanhDerivative(AFloat * B,
575 const AFloat * A,
576 int m, int n)
577{
578 int i = blockDim.y * blockIdx.y + threadIdx.y;
579 int j = blockDim.x * blockIdx.x + threadIdx.x;
580 int index = j * m + i;
581
582 if ((i < m) && (j < n)) {
583 AFloat t = ::tanh(A[index]);
584 B[index] = 1 - t*t;
585 }
586}
587
588//____________________________________________________________________________
589template<typename AFloat>
590__global__ void SymmetricRelu(AFloat * A,
591 int m, int n)
592{
593 int i = blockDim.y * blockIdx.y + threadIdx.y;
594 int j = blockDim.x * blockIdx.x + threadIdx.x;
595 int index = j * m + i;
596
597 if ((i < m) && (j < n)) {
598 A[index] = abs(A[index]);
599 }
600}
601
602//____________________________________________________________________________
603template<typename AFloat>
604__global__ void SymmetricReluDerivative(AFloat * B,
605 const AFloat * A,
606 int m, int n)
607{
608 int i = blockDim.y * blockIdx.y + threadIdx.y;
609 int j = blockDim.x * blockIdx.x + threadIdx.x;
610 int index = j * m + i;
611
612 if ((i < m) && (j < n)) {
613 B[index] = (A[index] < 0.0) ? -1.0 : 1.0;
614 }
615}
616
617//____________________________________________________________________________
618template<typename AFloat>
619__global__ void SoftSign(AFloat * A,
620 int m, int n)
621{
622 int i = blockDim.y * blockIdx.y + threadIdx.y;
623 int j = blockDim.x * blockIdx.x + threadIdx.x;
624 int index = j * m + i;
625
626 if ((i < m) && (j < n)) {
627 AFloat x = A[index];
628 A[index] = x / (1.0 + abs(x));
629 }
630}
631
632//____________________________________________________________________________
633template<typename AFloat>
634__global__ void SoftSignDerivative(AFloat * B,
635 const AFloat * A,
636 int m, int n)
637{
638 int i = blockDim.y * blockIdx.y + threadIdx.y;
639 int j = blockDim.x * blockIdx.x + threadIdx.x;
640 int index = j * m + i;
641
642 if ((i < m) && (j < n)) {
643 AFloat x = 1.0 + fabs(A[index]);
644 B[index] = 1 / (x * x);
645 }
646}
647
648//____________________________________________________________________________
649template<typename AFloat>
650__global__ void Gauss(AFloat * A,
651 int m, int n)
652{
653 int i = blockDim.y * blockIdx.y + threadIdx.y;
654 int j = blockDim.x * blockIdx.x + threadIdx.x;
655 int index = j * m + i;
656
657 if ((i < m) && (j < n)) {
658 AFloat x = A[index];
659 A[index] = exp(- x * x);
660 }
661}
662
663//____________________________________________________________________________
664template<typename AFloat>
665__global__ void GaussDerivative(AFloat * B,
666 const AFloat * A,
667 int m, int n)
668{
669 int i = blockDim.y * blockIdx.y + threadIdx.y;
670 int j = blockDim.x * blockIdx.x + threadIdx.x;
671 int index = j * m + i;
672
673 if ((i < m) && (j < n)) {
674 AFloat x = A[index];
675 B[index] = - 2.0 * x * exp(- x * x);
676 }
677}
678
679//____________________________________________________________________________
680template<typename AFloat>
681__global__ void MeanSquaredError(AFloat * result,
682 const AFloat * Y,
683 const AFloat * output,
684 const AFloat * weights,
685 int m, int n)
686{
687 int i = blockDim.y * blockIdx.y + threadIdx.y;
688 int j = blockDim.x * blockIdx.x + threadIdx.x;
689 int tid = blockDim.x * threadIdx.y + threadIdx.x;
690 int index = j * m + i;
691
692 __shared__ AFloat sdata[TDevice::BlockSize];
693
694 if ((i < m) && (j < n)) {
695 AFloat w = weights[i];
696 AFloat norm = 1 / ((AFloat) (m * n));
697 AFloat e = Y[index] - output[index];
698 sdata[tid] = w * norm * e * e;
699 } else {
700 sdata[tid] = 0.0;
701 }
702 ReduceSum(result, sdata);
703}
704
705//____________________________________________________________________________
706template<typename AFloat>
707__global__ void SquaredSum(AFloat * result,
708 const AFloat * A,
709 int m, int n)
710{
711 int i = blockDim.y * blockIdx.y + threadIdx.y;
712 int j = blockDim.x * blockIdx.x + threadIdx.x;
713 int tid = blockDim.x * threadIdx.y + threadIdx.x;
714 int index = j * m + i;
715
716 __shared__ AFloat sdata[TDevice::BlockSize];
717
718 if ((i < m) && (j < n)) {
719 AFloat e = A[index];
720 sdata[tid] = e * e;
721 } else {
722 sdata[tid] = 0.0;
723 }
724 ReduceSum(result, sdata);
725}
726
727//____________________________________________________________________________
728template<typename AFloat>
729__global__ void AbsoluteSum(AFloat * result,
730 const AFloat * A,
731 int m, int n)
732{
733 int i = blockDim.y * blockIdx.y + threadIdx.y;
734 int j = blockDim.x * blockIdx.x + threadIdx.x;
735 int tid = blockDim.x * threadIdx.y + threadIdx.x;
736 int index = j * m + i;
737
738 __shared__ AFloat sdata[TDevice::BlockSize];
739
740 if ((i < m) && (j < n)) {
741 sdata[tid] = abs(A[index]);
742 } else {
743 sdata[tid] = 0.0;
744 }
745 ReduceSum(result, sdata);
746}
747
748//____________________________________________________________________________
749template<typename AFloat>
750__global__ void MeanSquaredErrorGradients(AFloat * dY,
751 const AFloat * Y,
752 const AFloat * output,
753 const AFloat * weights,
754 int m, int n)
755{
756 int i = blockDim.y * blockIdx.y + threadIdx.y;
757 int j = blockDim.x * blockIdx.x + threadIdx.x;
758 int index = j * m + i;
759
760 if ((i < m) && (j < n)) {
761 dY[index] = weights[i] * 2.0 / ((AFloat) (m * n)) * (output[index] - Y[index]);
762 }
763}
764
765//____________________________________________________________________________
766template<typename AFloat>
767__global__ void AddL1RegularizationGradients(AFloat * A,
768 const AFloat * B,
769 AFloat weightDecay,
770 int m, int n)
771{
772 int i = blockDim.y * blockIdx.y + threadIdx.y;
773 int j = blockDim.x * blockIdx.x + threadIdx.x;
774 int index = j * m + i;
775
776 if ((i < m) && (j < n)) {
777 AFloat sign = (B[index] < 0.0) ? -1.0 : 1.0;
778 A[index] += sign * weightDecay;
779 }
780}
781
782//____________________________________________________________________________
783template<typename AFloat>
784__global__ void AddL2RegularizationGradients(AFloat * A,
785 const AFloat * B,
786 AFloat weightDecay,
787 int m, int n)
788{
789 int i = blockDim.y * blockIdx.y + threadIdx.y;
790 int j = blockDim.x * blockIdx.x + threadIdx.x;
791 int index = j * m + i;
792
793 if ((i < m) && (j < n)) {
794 A[index] += 2.0 * weightDecay * B[index];
795 }
796}
797
798//____________________________________________________________________________
799template<typename AFloat>
800__global__ void CrossEntropy(AFloat * result,
801 const AFloat * Y,
802 const AFloat * output,
803 const AFloat * weights,
804 int m, int n)
805{
806 int i = blockDim.y * blockIdx.y + threadIdx.y;
807 int j = blockDim.x * blockIdx.x + threadIdx.x;
808 int tid = blockDim.x * threadIdx.y + threadIdx.x;
809 int index = j * m + i;
810
811 __shared__ AFloat sdata[TDevice::BlockSize];
812
813 if ((i < m) && (j < n)) {
814 AFloat norm = 1 / ((AFloat) (m * n));
815 AFloat sig = 1.0 / (1.0 + exp(-output[index]));
816 if (Y[index] == 0)
817 sdata[tid] = -weights[i] * norm * log(1.0 - sig);
818 else if (Y[index] == 1.0)
819 sdata[tid] = -weights[i] * norm * log(sig);
820 else {
821 AFloat ce = Y[index] * log(sig) + (1.0 - Y[index]) * log(1.0 - sig);
822 sdata[tid] = -weights[i] * norm * ce;
823 }
824 } else {
825 sdata[tid] = 0.0;
826 }
827
828 ReduceSum(result, sdata);
829}
830
831//____________________________________________________________________________
832template<typename AFloat>
833__global__ void CrossEntropyGradients(AFloat * dY,
834 const AFloat * Y,
835 const AFloat * output,
836 const AFloat * weights,
837 int m, int n)
838{
839 int i = blockDim.y * blockIdx.y + threadIdx.y;
840 int j = blockDim.x * blockIdx.x + threadIdx.x;
841 int index = j * m + i;
842
843 if ((i < m) && (j < n)) {
844 AFloat norm = 1 / ((AFloat) (m * n));
845 AFloat y = Y[index];
846 AFloat sig = 1.0 / (1.0 + exp(-output[index]));
847 dY[index] = weights[i] * norm * (sig - y);
848 }
849}
850
851//____________________________________________________________________________
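// Each row i of the output matrix contributes
//    -weights[i] / m * sum_j Y(i,j) * log( exp(output(i,j)) / sum_k exp(output(i,k)) )
// i.e. the cross entropy between row i of Y and the softmax of the corresponding row of
// output; the per-row terms are then reduced into *result.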
852template<typename AFloat>
853__global__ void SoftmaxCrossEntropy(AFloat * result,
854 const AFloat * Y,
855 const AFloat * output,
856 const AFloat * weights,
857 int m, int n)
858{
859 int i = blockDim.y * blockIdx.y + threadIdx.y;
860 int tid = threadIdx.y;
861
862 __shared__ AFloat sdata[TDevice::BlockSize];
863 AFloat norm = 1.0 / ((AFloat) m);
864
865 sdata[tid] = 0.0;
866 if (i < m) {
867 AFloat sum = 0.0;
868 for (int j = 0; j < n; j++) {
869 sum += exp(output[i + j * m]);
870 }
871 for (int j = 0; j < n; j++) {
872 sdata[tid] += Y[i + j * m] * log(exp(output[i + j * m]) / sum);
873 }
874 sdata[tid] *= -weights[i] * norm;
875 } else {
876 sdata[tid] = 0.0;
877 }
878
879 ReduceSum(result, sdata);
880}
881
882//____________________________________________________________________________
883template<typename AFloat>
884__global__ void SoftmaxCrossEntropyGradients(AFloat * dY,
885 const AFloat * Y,
886 const AFloat * output,
887 const AFloat * weights,
888 int m, int n)
889{
890 int i = blockDim.y * blockIdx.y + threadIdx.y;
891 AFloat norm = 1.0 / ((AFloat) m);
892
893 if (i < m) {
894 AFloat sum = 0.0;
895 AFloat sumY = 0.0;
896 for (int j = 0; j < n; j++) {
897 sum += exp(output[i + j * m]);
898 sumY += Y[i + j * m];
899 }
900 for (int j = 0; j < n; j++) {
901 dY[i + j * m] = sumY * exp(output[i + j * m]) / sum - Y[i + j * m];
902 dY[i + j * m] *= weights[i] * norm;
903 }
904 }
905}
906
907//____________________________________________________________________________
908template<typename AFloat>
909__global__ void ReduceMatrix(AFloat *result,
910 const AFloat *A,
911 int m, int n)
912{
913 int i = blockDim.y * blockIdx.y + threadIdx.y;
914 int j = blockDim.x * blockIdx.x + threadIdx.x;
915 int tid = threadIdx.y * blockDim.x + threadIdx.x;
916 int index = j * m + i;
917
918 __shared__ AFloat smem[TDevice::BlockSize];
919 if ((i < m) && (j < n))
920 smem[tid] = A[index];
921 else
922 smem[tid] = 0.0;
923
924 ReduceSum(result, smem);
925}
926
927//____________________________________________________________________________
928template<typename AFloat>
929__global__ void SumColumns(AFloat *B,
930 const AFloat *A,
931 int m, int n)
932{
933 int i = blockDim.y * blockIdx.y + threadIdx.y;
934 int j = blockDim.x * blockIdx.x + threadIdx.x;
935 int matrixIndex = j * m + i;
936 int blockIndex = blockDim.x * threadIdx.y + threadIdx.x;
937
938
939 __shared__ AFloat smem[TDevice::BlockSize];
940
941 if ((i < m) && (j < n)) {
942 smem[blockIndex] = A[matrixIndex];
943 } else {
944 smem[blockIndex] = 0.0;
945 }
946
947 ReduceSumVertical(B + blockDim.x * blockIdx.x, smem, n);
948}
949
950template<typename AFloat>
951__global__ void AlmostEquals(bool * result, const AFloat * A, const AFloat * B, double epsilon, int m, int n)
952{
953 int i = blockDim.y * blockIdx.y + threadIdx.y;
954 int j = blockDim.x * blockIdx.x + threadIdx.x;
955
956 if (i >= m || j >= n) return;
957 int matrixIndex = j * m + i;
958
959 // Benign race condition: several threads may write result[0] concurrently, but they
960 // all write the same value (false), so the outcome does not depend on the ordering.
961 if(fabs(A[matrixIndex] - B[matrixIndex]) > epsilon) result[0] = false;
962}
963
964//____________________________________________________________________________
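// Inverted-dropout kernel: an element is kept with probability dropoutProbability and,
// when kept, is rescaled by 1 / dropoutProbability so that the expected value of the
// activation is unchanged; elements drawing r > dropoutProbability are zeroed.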
965template<typename AFloat>
966__global__ void Dropout(AFloat *A,
967 int m, int n,
968 AFloat dropoutProbability,
969 curandState_t *state)
970{
971 int i = blockDim.y * blockIdx.y + threadIdx.y;
972 int j = blockDim.x * blockIdx.x + threadIdx.x;
973 int tid = i * gridDim.x + j;
974 if ((i < m) && (j < n)) {
975 float r = curand_uniform(state + tid);
976 if (r > dropoutProbability) {
977 A[j * m + i] = 0.0;
978 } else {
979 A[j * m + i] /= dropoutProbability;
980 }
981 }
982}
983
984//____________________________________________________________________________
985//////////////////////////////////////////////////////////////////////////////////////////////
986/// \brief Downsampling kernel used as the forward propagation step of a
987/// Max-Pooling layer.
988///
989/// \param[out] output The output matrix. Each row corresponds to a slice and each element
990/// is the max within a receptive field.
991/// \param[out] indexMatrix The winning indices matrix. Each element is the index of the max element.
992/// \param[in] input The input matrix. Each row is a slice.
993/// \param[in] imgHeight The height of the input.
994/// \param[in] imgWidth The width of the input.
995/// \param[in] fltHeight Height of the kernel.
996/// \param[in] fltWidth Width of the kernel.
997/// \param[in] strideRows Stride size along the rows (height) of the input.
998/// \param[in] strideCols Stride size along the columns (width) of the input.
999///
1000/// Each output element is the maximum of the receptive field. The caller launches one thread
1001/// per output element in order to eliminate shared write access.
1002///////////////////////////////////////////////////////////////////////////////////////////////
1003template<typename AFloat>
1004__global__ void Downsample(AFloat * output, AFloat * indexMatrix, const AFloat * input, int depth, int imgHeight,
1005 int imgWidth, int fltHeight, int fltWidth, int strideRows, int strideCols)
1006{
1007 // The row of the output matrix.
1008 int i = blockDim.y * blockIdx.y + threadIdx.y;
1009
1010 // The column of the output matrix.
1011 int j = blockDim.x * blockIdx.x + threadIdx.x;
1012
1013 // Number of columns in matrix A.
1014 int NLocalViews = calculateDimension(imgWidth, fltWidth, 0, strideCols) *
1015 calculateDimension(imgHeight, fltHeight, 0, strideRows);
1016
1017 if (i >= depth || j >= NLocalViews) return;
1018
1019 int outputIndex = j * depth + i;
1020
1021 int numSlidesPerRow = calculateDimension(imgWidth, fltWidth, 0, strideCols);
1022
1023 int rowMin = (j / numSlidesPerRow) * strideRows; // First row of the input slice this thread should look at.
1024 int colMin = (j % numSlidesPerRow) * strideCols; // First column of the input slice this thread should look at.
1025 int bz = i; // Slice of the input this thread should look at.
1026
1027 AFloat value = 0;
1028 AFloat maxIndex = 0;
1029 bool first = true; // The first element should write to `value` no matter what.
1030
1031 for (size_t by = rowMin; by < rowMin + fltHeight; by++) {
1032 for (size_t bx = colMin; bx < colMin + fltWidth; bx++) {
1033 int inputIndex = (bx + by * imgWidth) * depth + bz;
1034 if (input[inputIndex] > value || first) {
1035 first = false;
1036 maxIndex = bx + by * imgWidth;
1037 value = input[inputIndex];
1038 }
1039 }
1040 }
1041 indexMatrix[outputIndex] = maxIndex;
1042 output[outputIndex] = value;
1043
1044}
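// Worked example: a 28x28 slice pooled with a 2x2 window and strides of 2 gives
// calculateDimension(28, 2, 0, 2) = 14 slides per row and per column, so each of the
// `depth` rows of the output holds 14 * 14 = 196 maxima, and indexMatrix stores for each
// of them the flat position bx + by * imgWidth of the winning pixel inside its slice.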
1045
1046/////////////////////////////////////////////////////////////////////////////////////////////////
1047/// \brief Back-propagate the gradients through a max-pooling layer.
1048///
1049/// \param[out] activationGradientsBackward The gradients to be written. One gradient for each neuron at the layer's input.
1050/// \param[in] activationGradients The gradients coming from the next layer. One gradient for each receptive field.
1051/// \param[in] indexMatrix Winning indices. One index for each receptive field.
1052/// \param[in] depth The depth of the input tensor.
1053/// \param[in] imgHeight The height of the input tensor.
1054/// \param[in] imgWidth The width of the input tensor.
1055/// \param[in] fltHeight Height of the filter.
1056/// \param[in] fltWidth Width of the filter.
1057/// \param[in] strideRows Stride size along the rows (height) of the input.
1058/// \param[in] strideCols Stride size along the columns (width) of the input.
1059/////////////////////////////////////////////////////////////////////////////////////////////////
1060template<typename AFloat>
1061__global__ void MaxPoolBackward(AFloat * activationGradientsBackward,
1062 const AFloat * activationGradients,
1063 const AFloat * indexMatrix,
1064 int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth,
1065 int strideRows, int strideCols)
1066{
1067 int slice = blockDim.y * blockIdx.y + threadIdx.y; // row of the gradientsBackward matrix.
1068 int j = blockDim.x * blockIdx.x + threadIdx.x; // column of the gradientsBackward matrix.
1069
1070 if (slice >= depth || j >= imgHeight * imgWidth) return;
1071
1072 int height = calculateDimension(imgHeight, fltHeight, 0, strideRows);
1073 int width = calculateDimension(imgWidth, fltWidth, 0, strideCols);
1074
1075 // Which gradientsBackward element should this thread write to?
1076 int backRow = j % imgHeight;
1077 int backCol = j / imgHeight;
1078
1079 // Which gradient and indexMatrix elements should this thread read?
1080 int nextRowMin = floor((backRow - fltHeight) / (AFloat) strideRows) + 1;
1081 int nextColMin = floor((backCol - fltWidth) / (AFloat) strideCols) + 1;
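// nextRowMin / nextColMin are the indices of the first pooling window (along rows and
// columns respectively) whose receptive field can still contain (backRow, backCol):
// window r covers rows [r * strideRows, r * strideRows + fltHeight - 1], so the smallest
// candidate is r = ceil((backRow - fltHeight + 1) / strideRows), computed here via floor.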
1082
1083 int outputIndex = 0;
1084 AFloat grad = 0;
1085
1086 // Iterate over all output elements that were the outcome of receptive fields I was part of.
1087 for (int row = nextRowMin; row <= nextRowMin + fltHeight - strideRows; row++) {
1088 for (int col = nextColMin; col <= nextColMin + fltWidth - strideCols; col++) {
1089
1090 if (row >= height || col >= width || col < 0 || row < 0) continue;
1091
1092 outputIndex = (row * width + col) * depth + slice;
1093
1094 // Was I the winning index within this receptive field?
1095 if (indexMatrix[outputIndex] == backCol + backRow * imgWidth) {
1096 grad += activationGradients[outputIndex];
1097 }
1098 }
1099 }
1100 activationGradientsBackward[(backCol + backRow * imgWidth) * depth + slice] = grad;
1101}
1102
1103template<typename AFloat>
1104__global__ void RotateWeights(AFloat * A, const AFloat * B, int filterDepth, int filterHeight, int filterWidth,
1105 int numFilters)
1106{
1107 int i = blockDim.y * blockIdx.y + threadIdx.y;
1108 int j = blockDim.x * blockIdx.x + threadIdx.x;
1109
1110 if (i >= numFilters || j >= filterDepth * filterHeight * filterWidth) return;
1111
1112 int jump = filterHeight * filterWidth;
1113 int row = j / jump;
1114 int col = i * jump + jump - j % jump - 1;
1115
1116 A[col * filterDepth + row] = B[j * numFilters + i];
1117}
1118
1119template<typename AFloat>
1120__global__ void AddBiases(AFloat * A, const AFloat * B, int nRows, int nCols)
1121{
1122 int i = blockDim.y * blockIdx.y + threadIdx.y;
1123 int j = blockDim.x * blockIdx.x + threadIdx.x;
1124 if (i >= nRows || j >= nCols) return;
1125
1126 A[i + j * nRows] += B[i];
1127}
1128
1129template<typename AFloat>
1130__global__ void UpdateWeights(AFloat * A, const AFloat ** B, int batchSize, int nRows, int nCols)
1131{
1132 int i = blockDim.y * blockIdx.y + threadIdx.y;
1133 int j = blockDim.x * blockIdx.x + threadIdx.x;
1134
1135 if (i >= nRows || j >= nCols) return;
1136
1137 for (size_t event = 0; event < batchSize; event++) {
1138 size_t index = i * nCols + j;
1139 A[index] += B[event][index];
1140 }
1141}
1142
1143template<typename AFloat>
1144__global__ void Reshape(AFloat * A, const AFloat * B, int nRowsA, int nColsA, int nRowsB, int nColsB)
1145{
1146 int i = blockDim.y * blockIdx.y + threadIdx.y;
1147 int j = blockDim.x * blockIdx.x + threadIdx.x;
1148 if (i >= nRowsA || j >= nColsA) return;
1149
1150 size_t indexA = j * nRowsA + i;
1151
1152 size_t nElem = i * nColsA + j;
1153 size_t indexB = (nElem % nColsB) * nRowsB + nElem / nColsB;
1154
1155 A[indexA] = B[indexB];
1156}
1157
1158////////////////////////////////////////////////////////////////////////////////
1159/// \brief Flatten an array of 2D-arrays into a single 2D-array.
1160///
1161/// \param[out] A Output 2D-array saved in column major order.
1162/// \param[in] B Input array of 2D-arrays. Each element is a matrix to be concatenated.
1163/// \param[in] size Number of 2D-arrays in the input.
1164/// \param[in] nRows Number of rows in each matrix of the input.
1165/// \param[in] nCols Number of columns in each matrix of the input.
1166///
1167/// B points to the data of `size` matrices stored contiguously. Each of those matrices
1168/// holds its elements in column-major order. However, the concatenation is performed
1169/// row-wise. Each thread writes a single output element by locating the
1170/// appropriate input index.
1171//////////////////////////////////////////////////////////////////////////////////
1172template<typename AFloat>
1173__global__ void Flatten(AFloat * A, const AFloat *B, int size, int nRows, int nCols)
1174{
1175 int i = blockDim.y * blockIdx.y + threadIdx.y;
1176 int j = blockDim.x * blockIdx.x + threadIdx.x;
1177
1178 int nColsA = nRows * nCols;
1179 if (i >= size || j >= nColsA) return;
1180
1181 // Get a transposed view on matrix B[i].
1182 int row = j / nCols;
1183 int col = j % nCols;
1184 // AFloat element = B[i][col * nRows + row];
1185 AFloat element = B[ i * nColsA + col * nRows + row ];
1186
1187 size_t index = j * size + i;
1188 A[index] = element;
1189}
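// Worked example (size = 2, nRows = 2, nCols = 3): the i-th input chunk holds the 2x3
// matrix B_i in column-major order, and row i of the 2x6 output A becomes
// { B_i(0,0), B_i(0,1), B_i(0,2), B_i(1,0), B_i(1,1), B_i(1,2) },
// i.e. the row-wise flattening of B_i, while A itself is again stored column-major.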
1190
1191// Row-major version of Flatten (keeps rows before columns in memory); used by cuDNN.
1192template<typename AFloat>
1193__global__ void FlattenRM(AFloat * A, const AFloat *B, int size, int nRows, int nCols)
1194{
1195 int i = blockDim.y * blockIdx.y + threadIdx.y;
1196 int j = blockDim.x * blockIdx.x + threadIdx.x;
1197
1198 int nColsA = nRows * nCols;
1199 if (i >= size || j >= nColsA) return;
1200
1201 // Get a transposed view on matrix B[i].
1202 int row = j / nCols;
1203 int col = j % nCols;
1204 // AFloat element = B[i][col * nRows + row];
1205 AFloat element = B[ i * nColsA + row * nCols + col ];
1206
1207 size_t index = j * size + i;
1208 A[index] = element;
1209}
1210
1211
1212////////////////////////////////////////////////////////////////////////////////
1213/// \brief Deflatten a 2D-array into an array of 2D-arrays.
1214///
1215/// \param[out] A Output array of 2D-arrays, each of which is column-major.
1216/// \param[in] B Input 2D-array to be split into `size` parts.
1217/// \param[in] size Number of 2D-arrays in the output.
1218/// \param[in] nRows Number of rows in each matrix of the output.
1219/// \param[in] nCols Number of columns in each matrix of the output.
1220///
1221/// A points to the data of `size` matrices stored contiguously. Each of those matrices
1222/// will hold its elements in column-major order. However, the concatenation
1223/// is performed row-wise. Each thread writes a single output element
1224/// by locating the appropriate input index.
1225//////////////////////////////////////////////////////////////////////////////////
1226template<typename AFloat>
1227__global__ void Deflatten(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
1228{
1229 int i = blockDim.y * blockIdx.y + threadIdx.y;
1230 int j = blockDim.x * blockIdx.x + threadIdx.x;
1231
1232 int nColsB = nRows * nCols;
1233 if (i >= size || j >= nColsB) return;
1234
1235 AFloat element = B[j * size + i];
1236
1237 // Get a transposed view on matrix A[i].
1238 int row = j / nCols;
1239 int col = j % nCols;
1240 A[ i * nColsB + col * nRows + row] = element;
1241}
1242
1243// Row-major version of Deflatten (used by cuDNN).
1244template<typename AFloat>
1245__global__ void DeflattenRM(AFloat * A, const AFloat * B, int size, int nRows, int nCols)
1246{
1247 int i = blockDim.y * blockIdx.y + threadIdx.y;
1248 int j = blockDim.x * blockIdx.x + threadIdx.x;
1249
1250 int nColsB = nRows * nCols;
1251 if (i >= size || j >= nColsB) return;
1252
1253 AFloat element = B[j * size + i];
1254
1255 // Get a transposed view on matrix A[i].
1256 int row = j / nCols;
1257 int col = j % nCols;
1258 A[ i * nColsB + row * nCols + col] = element;
1259}
1260
1261} // namespace Cuda
1262} // namespace DNN
1263} // namespace TMVA
1264
1265#endif