Commit 45b2c06a authored by Davis King

Made the layer's backward() function take a copy of the output tensor

from that layer so that it can be reused in any computations. Most layers
won't use it, but for some backward computations having it allows them to
be implemented more efficiently.
parent 2a94e4d9
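
To illustrate the point of the change, here is a minimal, hypothetical sketch (not part of this commit) of a sigmoid-style activation layer written against the new signature. Because the sigmoid's derivative is s*(1-s) where s is the forward output, backward() can compute the data gradient directly from the computed_output tensor it now receives, without re-reading the layer's input:

    // Hypothetical layer member functions, assuming the usual dlib layer
    // interface shown in the diff below (std::exp comes from <cmath>).
    template <typename SUBNET>
    void forward(const SUBNET& sub, resizable_tensor& output)
    {
        output.copy_size(sub.get_output());
        const float* in = sub.get_output().host();
        float* out = output.host();
        for (size_t i = 0; i < output.size(); ++i)
            out[i] = 1.0f/(1.0f + std::exp(-in[i]));
    }

    template <typename SUBNET>
    void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        // The derivative of the sigmoid is s*(1-s), so only the forward
        // output is needed here.  Per the spec further down, the data
        // gradient is stored into sub.get_gradient_input().
        const float* out = computed_output.host();
        const float* grad = gradient_input.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < computed_output.size(); ++i)
            out_grad[i] = grad[i]*out[i]*(1.0f - out[i]);
    }
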
@@ -365,7 +365,7 @@ namespace dlib
 dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
 params_grad.copy_size(details.get_layer_params());
 params_grad = 0;
-details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
 // Don't try to adjust the parameters if this layer doesn't have any.
 if (params_grad.size() != 0)
     solvers.top()(details, static_cast<const tensor&>(params_grad));
@@ -601,7 +601,7 @@ namespace dlib
 subnet_wrapper wsub(x, grad_final_ignored);
 params_grad.copy_size(details.get_layer_params());
 params_grad = 0;
-details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
 // Don't try to adjust the parameters if this layer doesn't have any.
 if (params_grad.size() != 0)
     solvers.top()(details, static_cast<const tensor&>(params_grad));
@@ -1602,7 +1602,7 @@ namespace dlib
 random_noise.copy_size(l.get_layer_params());
 randomize_parameters(random_noise, 5, rnd);
 params_grad = random_noise;
-l.backward(input_grad, subnetwork, params_grad);
+l.backward(output, input_grad, subnetwork, params_grad);
 running_stats<double> rs_param, rs_data;
...
@@ -36,7 +36,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     // TODO
 }
@@ -89,7 +89,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     // compute the gradient of the parameters.
     params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
@@ -153,7 +153,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor&, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     const float* grad = gradient_input.host();
     const float* in = sub.get_output().host();
...
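
With the new first argument available, an element-wise activation like the one above could, as a hedged sketch (not part of this diff), take its mask from computed_output instead of sub.get_output(); for relu the two are interchangeable since an output element is positive exactly when the corresponding input is positive:

    // Hypothetical alternative body using computed_output for the relu mask.
    template <typename SUBNET>
    void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        const float* grad = gradient_input.host();
        const float* out = computed_output.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < computed_output.size(); ++i)
            out_grad[i] = (out[i] > 0) ? grad[i] : 0;
    }
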
@@ -168,6 +168,7 @@ namespace dlib
 template <typename SUBNET>
 void backward(
+    const tensor& computed_output,
     const tensor& gradient_input,
     SUBNET& sub,
     tensor& params_grad
@@ -176,7 +177,8 @@ namespace dlib
 requires
     - SUBNET implements the SUBNET interface defined at the top of this file.
     - setup() has been called.
-    - gradient_input has the same dimensions as the output of forward(sub,output).
+    - computed_output is the tensor resulting from calling forward(sub,computed_output).
+    - have_same_dimensions(gradient_input, computed_output)
     - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true
     - have_same_dimensions(params_grad, get_layer_params()) == true
 ensures
@@ -185,8 +187,9 @@ namespace dlib
     These gradients are stored into #sub and #params_grad, respectively. To be
     precise, the gradients are taken of a function f(sub,get_layer_params())
     which is defined thusly:
-        - let OUT be the output of forward(sub,OUT).
-        - let f(sub,get_layer_params()) == dot(OUT, gradient_input)
+        - Recalling that computed_output is a function of sub and get_layer_params()
+          since it is the result of calling forward(sub,computed_output):
+          let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
     Then we define the following gradient vectors:
         - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with
           respect to get_layer_params().
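
As a concrete (hypothetical) instance of that definition: for a parameterless layer whose forward() simply copies its input, computed_output equals sub.get_output(), so f(sub,get_layer_params()) == dot(sub.get_output(), gradient_input), PARAMETER_GRADIENT is empty, and DATA_GRADIENT is just gradient_input. Such a layer's backward() would reduce to:

    // Hypothetical identity layer: DATA_GRADIENT == gradient_input.
    template <typename SUBNET>
    void backward(const tensor& /*computed_output*/, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        const float* grad = gradient_input.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < gradient_input.size(); ++i)
            out_grad[i] = grad[i];
    }
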
@@ -272,7 +275,7 @@ namespace dlib
 template <typename SUBNET> void setup (const SUBNET& sub);
 template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
 const tensor& get_layer_params() const;
 tensor& get_layer_params();
 /*!
@@ -309,7 +312,7 @@ namespace dlib
 template <typename SUBNET> void setup (const SUBNET& sub);
 template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
 const tensor& get_layer_params() const;
 tensor& get_layer_params();
 /*!
...