25template<
typename AFloat>
// --- Vanilla-RNN backward pass (fragment) ---
// NOTE(review): this is a doxygen listing extraction: the original file's line
// numbers are fused into the code text and the function declaration plus
// several body lines are elided from this view. Comments below describe only
// the operations that are visible.
// `df` is presumably the activation derivative already combined with the
// incoming gradient -- TODO confirm against the full source.
// Gradient w.r.t. the layer input: input_gradient = df * weights_input.
// The empty-matrix guard skips the computation when no input gradient is
// requested (e.g. the first layer).
49 if (input_gradient.GetNoElements() > 0) {
50 Multiply(input_gradient, df, weights_input);
// Gradient w.r.t. the previous hidden state: df * weights_state.
54 if (state_gradients_backward.GetNoElements() > 0) {
55 Multiply(state_gradients_backward, df, weights_state);
// Input-weight gradients: df^T * input, with the trailing (1., 1.) arguments
// presumably alpha/beta scale factors so results accumulate -- TODO confirm.
63 if (input_weight_gradients.GetNoElements() > 0) {
64 TransposeMultiply(input_weight_gradients, df, input, 1. , 1.);
// State-weight gradients: df^T * state, accumulated the same way.
67 if (state_weight_gradients.GetNoElements() > 0) {
68 TransposeMultiply(state_weight_gradients, df, state, 1. , 1. );
// Bias gradients: column sums of df, accumulated.
72 if (bias_gradients.GetNoElements() > 0) {
73 SumColumns(bias_gradients, df, 1., 1.);
// The computed input gradient is also returned by reference.
76 return input_gradient;
80template <
typename Scalar_t>
// --- LSTM backward pass (fragment) ---
// NOTE(review): doxygen listing extraction -- original line numbers are fused
// into the code text, and the declaration plus several lines (including the
// declaration of the scratch matrix `tmpInp` and the initial fill of
// `cell_gradient`) are elided from this view.
// Scratch buffer for assembling the previous-state gradient.
121 TCpuMatrix<Scalar_t> tmpState(state_gradients_backward.GetNrows(), state_gradients_backward.GetNcols());
// Cell-state gradient: `cell_gradient` (presumably pre-loaded with the
// derivative of tanh(cell) on an elided line -- TODO confirm) is masked by the
// output gate and the incoming state gradient, then the gradient carried back
// from the next time step is added in.
129 Hadamard(cell_gradient, fOutput);
130 Hadamard(cell_gradient, state_gradients_backward);
131 ScaleAdd(cell_gradient, cell_gradients_backward);
// Propagate the cell gradient to the previous time step through the forget
// gate: dc_prev = dc (element-wise *) f.
132 Copy(cell_gradients_backward, cell_gradient);
133 Hadamard(cell_gradients_backward, fForget);
// Candidate gradient: dc * input-gate activation * candidate derivative (dc
// here meaning cell_gradient; `dc` the parameter is the activation derivative).
136 Copy(candidate_gradient, cell_gradient);
137 Hadamard(candidate_gradient, fInput);
138 Hadamard(candidate_gradient, dc);
// Input-gate gradient: cell_gradient * candidate activation * di.
141 Copy(input_gate_gradient, cell_gradient);
142 Hadamard(input_gate_gradient, fCandidate);
143 Hadamard(input_gate_gradient, di);
// Forget-gate gradient: cell_gradient * previous cell activations * df.
146 Copy(forget_gradient, cell_gradient);
147 Hadamard(forget_gradient, precCellActivations);
148 Hadamard(forget_gradient, df);
// Output-gate gradient: tanh(cell) * incoming state gradient * dout.
151 Copy(output_gradient, cell_tanh);
152 Hadamard(output_gradient, state_gradients_backward);
153 Hadamard(output_gradient, dout);
// Input gradient: sum over the four gates of (gate gradient * input weights).
// `tmpInp` is declared on an elided line.
156 Multiply(tmpInp, input_gate_gradient, weights_input);
157 Copy(input_gradient, tmpInp);
158 Multiply(tmpInp, forget_gradient, weights_forget);
159 ScaleAdd(input_gradient, tmpInp);
160 Multiply(tmpInp, candidate_gradient, weights_candidate);
161 ScaleAdd(input_gradient, tmpInp);
162 Multiply(tmpInp, output_gradient, weights_output);
163 ScaleAdd(input_gradient, tmpInp);
// Previous-state gradient: sum over the four gates of (gate gradient *
// state weights). This overwrites state_gradients_backward in place.
166 Multiply(tmpState, input_gate_gradient, weights_input_state);
167 Copy(state_gradients_backward, tmpState);
168 Multiply(tmpState, forget_gradient, weights_forget_state);
169 ScaleAdd(state_gradients_backward, tmpState);
170 Multiply(tmpState, candidate_gradient, weights_candidate_state);
171 ScaleAdd(state_gradients_backward, tmpState);
172 Multiply(tmpState, output_gradient, weights_output_state);
173 ScaleAdd(state_gradients_backward, tmpState);
// Input-weight gradients: gate_gradient^T * input, with (1., 1.) presumably
// alpha/beta so the result accumulates over time steps -- TODO confirm.
176 TransposeMultiply(input_weight_gradients, input_gate_gradient, input, 1. , 1.);
177 TransposeMultiply(forget_weight_gradients, forget_gradient, input, 1. , 1.);
178 TransposeMultiply(candidate_weight_gradients, candidate_gradient, input, 1. , 1.);
179 TransposeMultiply(output_weight_gradients, output_gradient, input, 1. , 1.);
// State-weight gradients: gate_gradient^T * previous state activations.
182 TransposeMultiply(input_state_weight_gradients, input_gate_gradient, precStateActivations, 1. , 1. );
183 TransposeMultiply(forget_state_weight_gradients, forget_gradient, precStateActivations, 1. , 1. );
184 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, precStateActivations, 1. , 1. );
185 TransposeMultiply(output_state_weight_gradients, output_gradient, precStateActivations, 1. , 1. );
// Bias gradients: accumulated column sums of each gate gradient.
188 SumColumns(input_bias_gradients, input_gate_gradient, 1., 1.);
189 SumColumns(forget_bias_gradients, forget_gradient, 1., 1.);
190 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
191 SumColumns(output_bias_gradients, output_gradient, 1., 1.);
193 return input_gradient;
198template <
typename Scalar_t>
// --- GRU backward pass (fragment) ---
// NOTE(review): doxygen listing extraction -- original line numbers are fused
// into the code text; the declaration and several lines (declarations of
// `tmpMul`, `tmpInp`, `temp`, `term`, `var`, `tempvar`, some loop-closing
// braces, and some initializations of `term`) are elided from this view.
// Dimensions of the gate matrices (orientation not visible here -- TODO
// confirm rows/cols meaning against the full source).
228 int r = fUpdate.GetNrows(),
c = fUpdate.GetNcols();
// Reset-gate gradient, first factor: (1 - update gate) element-wise.
230 Copy(reset_gradient, fUpdate);
231 for (
size_t j = 0; j < (size_t)reset_gradient.
GetNcols(); j++) {
232 for (
size_t i = 0; i < (size_t)reset_gradient.
GetNrows(); i++) {
233 reset_gradient(i, j) = 1 - reset_gradient(i, j);
// ... then masked by the candidate derivative and the incoming gradient.
236 Hadamard(reset_gradient, dc);
237 Hadamard(reset_gradient, state_gradients_backward);
// The candidate path differs between the two GRU variants selected by
// `resetGateAfter` (presumably the cuDNN "reset after" convention vs the
// standard one -- TODO confirm which branch is which).
240 if (!resetGateAfter) {
// Variant A: multiply by the candidate state weights first, then mask by
// the previous state activations.
243 Multiply(tmpMul, reset_gradient, weights_candidate_state);
244 Hadamard(tmpMul, precStateActivations);
// Variant B: previous state times transposed weights, masked by the
// partially-built reset gradient.
247 MultiplyTranspose(tmpMul, precStateActivations, weights_candidate_state);
248 Hadamard(tmpMul, reset_gradient);
// Finish with the reset-gate activation derivative.
250 Hadamard(tmpMul, dr);
251 Copy(reset_gradient, tmpMul);
// Update-gate gradient: (h_prev - candidate) * du * incoming gradient.
255 Copy(update_gradient, precStateActivations);
256 for (
size_t j = 0; j < (size_t)update_gradient.
GetNcols(); j++) {
257 for (
size_t i = 0; i < (size_t)update_gradient.
GetNrows(); i++) {
258 update_gradient(i, j) = update_gradient(i, j) - fCandidate(i, j);
261 Hadamard(update_gradient, du);
262 Hadamard(update_gradient, state_gradients_backward);
// Candidate gradient: (1 - update gate) * dc * incoming gradient.
266 Copy(candidate_gradient, fUpdate);
267 for (
size_t j = 0; j < (size_t)candidate_gradient.
GetNcols(); j++) {
268 for (
size_t i = 0; i < (size_t)candidate_gradient.
GetNrows(); i++) {
269 candidate_gradient(i, j) = 1 - candidate_gradient(i, j);
272 Hadamard(candidate_gradient, dc);
273 Hadamard(candidate_gradient, state_gradients_backward);
// --- Gradient w.r.t. the previous state ---
// `temp` keeps a copy of the incoming state gradient for reuse below.
278 Copy(temp, state_gradients_backward);
// Direct term: `term` is initialized on an elided line (presumably the
// update gate -- TODO confirm) and masked by the saved gradient.
281 Hadamard(term, temp);
282 Copy(state_gradients_backward, term);
// Term through the update gate, built from the previous state activations;
// intermediate lines (including one presumably copying `var` back into
// `term` between lines 289 and 291) are elided.
285 Copy(term, precStateActivations);
287 Hadamard(term, temp);
289 Multiply(var, term, weights_update_state);
291 ScaleAdd(state_gradients_backward, term);
// Term through the update gate from the candidate side: uses -candidate.
294 Copy(term, fCandidate);
295 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
296 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
297 term(i, j) = -term(i, j);
301 Hadamard(term, temp);
302 Multiply(var, term, weights_update_state);
304 ScaleAdd(state_gradients_backward, term);
// Term through the candidate: (1 - term), where `term` was re-initialized
// on an elided line before line 308.
308 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
309 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
310 term(i, j) = 1 - term(i, j);
314 Hadamard(term, temp);
// Variant-dependent application of the reset gate around the candidate
// state-weight multiplication.
316 if (!resetGateAfter) {
319 Multiply(var, term, weights_candidate_state);
320 Hadamard(var, fReset);
324 Hadamard(term, fReset);
325 Multiply(var, term, weights_candidate_state);
329 ScaleAdd(state_gradients_backward, term);
// Term through the reset gate; `term` again re-initialized on an elided
// line before line 333.
333 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
334 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
335 term(i, j) = 1 - term(i, j);
340 Hadamard(term, temp);
341 if (!resetGateAfter) {
344 Multiply(var, term, weights_candidate_state);
345 Hadamard(var, precStateActivations);
348 MultiplyTranspose(var, precStateActivations, weights_candidate_state);
// Propagate through the reset-gate state weights and accumulate.
352 Multiply(term, var, weights_reset_state);
353 ScaleAdd(state_gradients_backward, term);
// Input gradient: sum over the three gates of (gate gradient * weights).
357 Multiply(tmpInp, reset_gradient, weights_reset);
358 Copy(input_gradient, tmpInp);
359 Multiply(tmpInp, update_gradient, weights_update);
360 ScaleAdd(input_gradient, tmpInp);
361 Multiply(tmpInp, candidate_gradient, weights_candidate);
362 ScaleAdd(input_gradient, tmpInp);
// Input-weight gradients: gate_gradient^T * input, accumulated (alpha/beta
// presumably -- TODO confirm).
365 TransposeMultiply(reset_weight_gradients, reset_gradient, input, 1., 1.);
366 TransposeMultiply(update_weight_gradients, update_gradient, input, 1., 1.);
367 TransposeMultiply(candidate_weight_gradients, candidate_gradient, input, 1., 1.);
// State-weight gradients for reset and update gates.
370 TransposeMultiply(reset_state_weight_gradients, reset_gradient, precStateActivations, 1., 1.);
371 TransposeMultiply(update_state_weight_gradients, update_gradient, precStateActivations, 1., 1.);
// Candidate state-weight gradient depends on the GRU variant: the reset
// gate multiplies either the previous state (variant A) or the candidate
// gradient (variant B) before the transposed product.
376 if (!resetGateAfter) {
378 Copy(tempvar, precStateActivations);
379 Hadamard(tempvar, fReset);
380 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, tempvar, 1., 1.);
384 Copy(tempvar, candidate_gradient);
385 Hadamard(tempvar, fReset);
386 TransposeMultiply(candidate_state_weight_gradients, tempvar, precStateActivations, 1., 1.);
// Bias gradients: accumulated column sums of each gate gradient.
390 SumColumns(reset_bias_gradients, reset_gradient, 1., 1.);
391 SumColumns(update_bias_gradients, update_gradient, 1., 1.);
392 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
394 return input_gradient;
static Matrix_t & LSTMLayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &cell_gradients_backward, TCpuMatrix< Scalar_t > &input_weight_gradients, TCpuMatrix< Scalar_t > &forget_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &output_weight_gradients, TCpuMatrix< Scalar_t > &input_state_weight_gradients, TCpuMatrix< Scalar_t > &forget_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &output_state_weight_gradients, TCpuMatrix< Scalar_t > &input_bias_gradients, TCpuMatrix< Scalar_t > &forget_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &output_bias_gradients, TCpuMatrix< Scalar_t > &di, TCpuMatrix< Scalar_t > &df, TCpuMatrix< Scalar_t > &dc, TCpuMatrix< Scalar_t > &dout, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &precCellActivations, const TCpuMatrix< Scalar_t > &fInput, const TCpuMatrix< Scalar_t > &fForget, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &fOutput, const TCpuMatrix< Scalar_t > &weights_input, const TCpuMatrix< Scalar_t > &weights_forget, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_output, const TCpuMatrix< Scalar_t > &weights_input_state, const TCpuMatrix< Scalar_t > &weights_forget_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &weights_output_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, TCpuMatrix< Scalar_t > &cell_gradient, TCpuMatrix< Scalar_t > &cell_tanh)
Backward pass for the LSTM network layer.
static Matrix_t & RecurrentLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients, Matrix_t &bias_gradients, Matrix_t &df, const Matrix_t &state, const Matrix_t &weights_input, const Matrix_t &weights_state, const Matrix_t &input, Matrix_t &input_gradient)
Backward pass for the vanilla recurrent (RNN) network layer.
static Matrix_t & GRULayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &reset_weight_gradients, TCpuMatrix< Scalar_t > &update_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &reset_state_weight_gradients, TCpuMatrix< Scalar_t > &update_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &reset_bias_gradients, TCpuMatrix< Scalar_t > &update_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &dr, TCpuMatrix< Scalar_t > &du, TCpuMatrix< Scalar_t > &dc, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &fReset, const TCpuMatrix< Scalar_t > &fUpdate, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &weights_reset, const TCpuMatrix< Scalar_t > &weights_update, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_reset_state, const TCpuMatrix< Scalar_t > &weights_update_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, bool resetGateAfter)
Backward pass for the GRU network layer.
create variable transformations