36 float alpha = 1.0, beta = 0.0;
43 CUBLAS_OP_N, CUBLAS_OP_N,
48 C.GetDataPointer(),
m);
50 C.SetComputeStream(s);
63 double alpha = 1.0, beta = 0.0;
70 CUBLAS_OP_N, CUBLAS_OP_N,
75 C.GetDataPointer(),
m);
77 C.SetComputeStream(s);
85 float alpha,
float beta)
98 CUBLAS_OP_T, CUBLAS_OP_N,
103 C.GetDataPointer(),
m);
105 C.SetComputeStream(s);
112 double alpha,
double beta)
125 CUBLAS_OP_T, CUBLAS_OP_N,
130 C.GetDataPointer(),
m);
132 C.SetComputeStream(s);
136template<
typename AFloat>
150template<
typename AFloat>
171template<
typename AFloat>
191 float alpha,
float beta)
215 double alpha,
double beta)
242 float alpha = 1.0, beta = 0.0;
265 double alpha = 1.0, beta = 0.0;
290template<
typename AFloat>
294 Fatal(
"AlmostEquals",
"The passed matrices have unequal shapes.");
302 cudaMalloc((
void**) &dResult,
sizeof(
bool));
303 cudaMemset(dResult, 1,
sizeof(
bool));
309 cudaMemcpy(&result, dResult,
sizeof(
bool), cudaMemcpyDeviceToHost);
342template<
typename AFloat>
351 ScaleAdd(B_m, A_m, alpha);
356template<
typename AFloat>
370template<
typename AFloat>
384template<
typename AFloat>
397template<
typename AFloat>
410template<
typename AFloat>
424template<
typename AFloat>
440template<
typename AFloat>
454template<
typename AFloat>
void Fatal(const char *location, const char *msgfmt,...)
Use this function in case of a fatal error. It will abort the program.
static AFloat GetDeviceReturn()
Transfer the value in the device return buffer to the host.
void SetComputeStream(cudaStream_t stream)
cudaStream_t GetComputeStream() const
size_t GetNoElements() const
static AFloat * GetDeviceReturnPointer()
Return device pointer to the device return buffer.
const cublasHandle_t & GetCublasHandle() const
static void ResetDeviceReturn(AFloat value=0.0)
Set the return buffer on the device to the specified value.
const AFloat * GetDataPointer() const
static AFloat * GetOnes()
TCudaTensor< AFloat > At(size_t i) const
cudaStream_t GetComputeStream() const
const AFloat * GetDataPointer() const
size_t GetFirstStride() const
void SetComputeStream(cudaStream_t stream)
size_t GetFirstSize() const
static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon=0.1)
Check two matrices for equality, taking floating point arithmetic errors into account.
static void SqrtElementWise(Matrix_t &A)
Take the square root of each element of the matrix A and write the result into A.
static void SumRows(Matrix_t &B, const Matrix_t &A)
extra functions defined only for CPU architecture !!!
static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B)
Standard multiplication of two matrices A and B with the result being written into C.
static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
Adam updates.
static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha=1.0, Scalar_t beta=0.)
Sum the columns of the (m x n) matrix A and write the results into B.
static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void Hadamard(Tensor_t &A, const Tensor_t &B)
In-place Hadamard (element-wise) product of matrices A and B with the result being written into A.
static void ReciprocalElementWise(Matrix_t &A)
Take the reciprocal of each element of the matrix A and write the result into A.
static void SquareElementWise(Matrix_t &A)
Square each element of the matrix A and write the result into A.
static Scalar_t Sum(const Matrix_t &A)
Compute the sum of all elements in A.
static void ConstMult(Matrix_t &A, Scalar_t beta)
Multiply all the elements of matrix A by the constant beta and write the result into A.
static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void ConstAdd(Matrix_t &A, Scalar_t beta)
Add the constant beta to all the elements of matrix A and write the result into A.
static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha=1.0, Scalar_t beta=0.)
Matrix multiplication of two matrices A and B^T (transposed) with the result being written into C.
static void ScaleAdd(Matrix_t &A, const Matrix_t &B, Scalar_t beta=1.0)
Adds the elements of matrix B, scaled by beta, to the elements of matrix A.
static dim3 BlockDims2D()
static dim3 GridDims2D(int nrows, int ncols)
__global__ void SqrtElementWise(AFloat *A, int m, int n)
__global__ void AdamUpdate(AFloat *A, const AFloat *M, const AFloat *V, int m, int n, AFloat alpha, AFloat eps)
optimizer kernel functions
__global__ void ConstMult(AFloat *A, AFloat beta, int m, int n)
__global__ void ReduceMatrix(AFloat *result, const AFloat *A, int m, int n)
__global__ void ConstAdd(AFloat *A, AFloat beta, int m, int n)
__global__ void SquareElementWise(AFloat *A, int m, int n)
__global__ void Hadamard(AFloat *B, const AFloat *A, int m, int n)
__global__ void AlmostEquals(bool *result, const AFloat *A, const AFloat *B, double epsilon, int m, int n)
__global__ void AdamUpdateFirstMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
__global__ void ReciprocalElementWise(AFloat *A, int m, int n)
__global__ void AdamUpdateSecondMom(AFloat *A, const AFloat *B, int m, int n, AFloat beta)
create variable transformations