43 if (input_gradient.GetNoElements() > 0) {
44 Multiply(input_gradient, df, weights_input);
48 if (state_gradients_backward.GetNoElements() > 0) {
49 Multiply(state_gradients_backward, df, weights_state);
57 if (input_weight_gradients.GetNoElements() > 0) {
61 if (state_weight_gradients.GetNoElements() > 0) {
66 if (bias_gradients.GetNoElements() > 0) {
70 return input_gradient;
115 TCpuMatrix<Scalar_t> tmpState(state_gradients_backward.GetNrows(), state_gradients_backward.GetNcols());
124 Hadamard(cell_gradient, state_gradients_backward);
125 ScaleAdd(cell_gradient, cell_gradients_backward);
126 Copy(cell_gradients_backward, cell_gradient);
127 Hadamard(cell_gradients_backward, fForget);
130 Copy(candidate_gradient, cell_gradient);
131 Hadamard(candidate_gradient, fInput);
135 Copy(input_gate_gradient, cell_gradient);
136 Hadamard(input_gate_gradient, fCandidate);
140 Copy(forget_gradient, cell_gradient);
141 Hadamard(forget_gradient, precCellActivations);
145 Copy(output_gradient, cell_tanh);
146 Hadamard(output_gradient, state_gradients_backward);
150 Multiply(tmpInp, input_gate_gradient, weights_input);
151 Copy(input_gradient, tmpInp);
152 Multiply(tmpInp, forget_gradient, weights_forget);
154 Multiply(tmpInp, candidate_gradient, weights_candidate);
156 Multiply(tmpInp, output_gradient, weights_output);
160 Multiply(tmpState, input_gate_gradient, weights_input_state);
161 Copy(state_gradients_backward, tmpState);
162 Multiply(tmpState, forget_gradient, weights_forget_state);
163 ScaleAdd(state_gradients_backward, tmpState);
164 Multiply(tmpState, candidate_gradient, weights_candidate_state);
165 ScaleAdd(state_gradients_backward, tmpState);
166 Multiply(tmpState, output_gradient, weights_output_state);
167 ScaleAdd(state_gradients_backward, tmpState);
172 TransposeMultiply(candidate_weight_gradients, candidate_gradient, input, 1. , 1.);
176 TransposeMultiply(input_state_weight_gradients, input_gate_gradient, precStateActivations, 1. , 1. );
177 TransposeMultiply(forget_state_weight_gradients, forget_gradient, precStateActivations, 1. , 1. );
178 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, precStateActivations, 1. , 1. );
179 TransposeMultiply(output_state_weight_gradients, output_gradient, precStateActivations, 1. , 1. );
182 SumColumns(input_bias_gradients, input_gate_gradient, 1., 1.);
183 SumColumns(forget_bias_gradients, forget_gradient, 1., 1.);
184 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
185 SumColumns(output_bias_gradients, output_gradient, 1., 1.);
187 return input_gradient;
222 int r = fUpdate.GetNrows(),
c = fUpdate.GetNcols();
224 Copy(reset_gradient, fUpdate);
225 for (
size_t j = 0; j < (size_t)reset_gradient.
GetNcols(); j++) {
226 for (
size_t i = 0; i < (size_t)reset_gradient.
GetNrows(); i++) {
227 reset_gradient(i, j) = 1 - reset_gradient(i, j);
231 Hadamard(reset_gradient, state_gradients_backward);
234 if (!resetGateAfter) {
237 Multiply(tmpMul, reset_gradient, weights_candidate_state);
238 Hadamard(tmpMul, precStateActivations);
245 Copy(reset_gradient, tmpMul);
249 Copy(update_gradient, precStateActivations);
250 for (
size_t j = 0; j < (size_t)update_gradient.
GetNcols(); j++) {
251 for (
size_t i = 0; i < (size_t)update_gradient.
GetNrows(); i++) {
252 update_gradient(i, j) = update_gradient(i, j) - fCandidate(i, j);
256 Hadamard(update_gradient, state_gradients_backward);
260 Copy(candidate_gradient, fUpdate);
261 for (
size_t j = 0; j < (size_t)candidate_gradient.
GetNcols(); j++) {
262 for (
size_t i = 0; i < (size_t)candidate_gradient.
GetNrows(); i++) {
263 candidate_gradient(i, j) = 1 - candidate_gradient(i, j);
267 Hadamard(candidate_gradient, state_gradients_backward);
272 Copy(temp, state_gradients_backward);
276 Copy(state_gradients_backward, term);
279 Copy(term, precStateActivations);
283 Multiply(var, term, weights_update_state);
285 ScaleAdd(state_gradients_backward, term);
288 Copy(term, fCandidate);
289 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
290 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
291 term(i, j) = -term(i, j);
296 Multiply(var, term, weights_update_state);
298 ScaleAdd(state_gradients_backward, term);
302 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
303 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
304 term(i, j) = 1 - term(i, j);
310 if (!resetGateAfter) {
313 Multiply(var, term, weights_candidate_state);
319 Multiply(var, term, weights_candidate_state);
323 ScaleAdd(state_gradients_backward, term);
327 for (
size_t j = 0; j < (size_t)term.
GetNcols(); j++) {
328 for (
size_t i = 0; i < (size_t)term.
GetNrows(); i++) {
329 term(i, j) = 1 - term(i, j);
335 if (!resetGateAfter) {
338 Multiply(var, term, weights_candidate_state);
339 Hadamard(var, precStateActivations);
346 Multiply(term, var, weights_reset_state);
347 ScaleAdd(state_gradients_backward, term);
351 Multiply(tmpInp, reset_gradient, weights_reset);
352 Copy(input_gradient, tmpInp);
353 Multiply(tmpInp, update_gradient, weights_update);
355 Multiply(tmpInp, candidate_gradient, weights_candidate);
361 TransposeMultiply(candidate_weight_gradients, candidate_gradient, input, 1., 1.);
364 TransposeMultiply(reset_state_weight_gradients, reset_gradient, precStateActivations, 1., 1.);
365 TransposeMultiply(update_state_weight_gradients, update_gradient, precStateActivations, 1., 1.);
370 if (!resetGateAfter) {
372 Copy(tempvar, precStateActivations);
374 TransposeMultiply(candidate_state_weight_gradients, candidate_gradient, tempvar, 1., 1.);
378 Copy(tempvar, candidate_gradient);
380 TransposeMultiply(candidate_state_weight_gradients, tempvar, precStateActivations, 1., 1.);
384 SumColumns(reset_bias_gradients, reset_gradient, 1., 1.);
385 SumColumns(update_bias_gradients, update_gradient, 1., 1.);
386 SumColumns(candidate_bias_gradients, candidate_gradient, 1., 1.);
388 return input_gradient;
static Matrix_t & LSTMLayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &cell_gradients_backward, TCpuMatrix< Scalar_t > &input_weight_gradients, TCpuMatrix< Scalar_t > &forget_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &output_weight_gradients, TCpuMatrix< Scalar_t > &input_state_weight_gradients, TCpuMatrix< Scalar_t > &forget_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &output_state_weight_gradients, TCpuMatrix< Scalar_t > &input_bias_gradients, TCpuMatrix< Scalar_t > &forget_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &output_bias_gradients, TCpuMatrix< Scalar_t > &di, TCpuMatrix< Scalar_t > &df, TCpuMatrix< Scalar_t > &dc, TCpuMatrix< Scalar_t > &dout, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &precCellActivations, const TCpuMatrix< Scalar_t > &fInput, const TCpuMatrix< Scalar_t > &fForget, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &fOutput, const TCpuMatrix< Scalar_t > &weights_input, const TCpuMatrix< Scalar_t > &weights_forget, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_output, const TCpuMatrix< Scalar_t > &weights_input_state, const TCpuMatrix< Scalar_t > &weights_forget_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &weights_output_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, TCpuMatrix< Scalar_t > &cell_gradient, TCpuMatrix< Scalar_t > &cell_tanh)
Backward pass for the LSTM network.
static Matrix_t & GRULayerBackward(TCpuMatrix< Scalar_t > &state_gradients_backward, TCpuMatrix< Scalar_t > &reset_weight_gradients, TCpuMatrix< Scalar_t > &update_weight_gradients, TCpuMatrix< Scalar_t > &candidate_weight_gradients, TCpuMatrix< Scalar_t > &reset_state_weight_gradients, TCpuMatrix< Scalar_t > &update_state_weight_gradients, TCpuMatrix< Scalar_t > &candidate_state_weight_gradients, TCpuMatrix< Scalar_t > &reset_bias_gradients, TCpuMatrix< Scalar_t > &update_bias_gradients, TCpuMatrix< Scalar_t > &candidate_bias_gradients, TCpuMatrix< Scalar_t > &dr, TCpuMatrix< Scalar_t > &du, TCpuMatrix< Scalar_t > &dc, const TCpuMatrix< Scalar_t > &precStateActivations, const TCpuMatrix< Scalar_t > &fReset, const TCpuMatrix< Scalar_t > &fUpdate, const TCpuMatrix< Scalar_t > &fCandidate, const TCpuMatrix< Scalar_t > &weights_reset, const TCpuMatrix< Scalar_t > &weights_update, const TCpuMatrix< Scalar_t > &weights_candidate, const TCpuMatrix< Scalar_t > &weights_reset_state, const TCpuMatrix< Scalar_t > &weights_update_state, const TCpuMatrix< Scalar_t > &weights_candidate_state, const TCpuMatrix< Scalar_t > &input, TCpuMatrix< Scalar_t > &input_gradient, bool resetGateAfter)
Backward pass for the GRU network.