Commit 45b2c06a authored by Davis King

Made the layer's backward() function take a copy of the output tensor

from that layer so that it can be reused in any computations. Most layers
won't use it, but for some backward computations having it allows them to
be implemented more efficiently.
parent 2a94e4d9
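
To illustrate the point of the change, here is a minimal, hypothetical sketch (not part of this commit) of a sigmoid-style activation layer written against the new signature. Because the sigmoid's derivative is s*(1-s) where s is the forward output, backward() can compute the data gradient directly from the computed_output tensor it now receives, without re-reading the layer's input:

    // Hypothetical layer member functions, assuming the usual dlib layer
    // interface shown in the diff below (std::exp comes from <cmath>).
    template <typename SUBNET>
    void forward(const SUBNET& sub, resizable_tensor& output)
    {
        output.copy_size(sub.get_output());
        const float* in = sub.get_output().host();
        float* out = output.host();
        for (size_t i = 0; i < output.size(); ++i)
            out[i] = 1.0f/(1.0f + std::exp(-in[i]));
    }

    template <typename SUBNET>
    void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        // The derivative of the sigmoid is s*(1-s), so only the forward
        // output is needed here.  Per the spec further down, the data
        // gradient is stored into sub.get_gradient_input().
        const float* out = computed_output.host();
        const float* grad = gradient_input.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < computed_output.size(); ++i)
            out_grad[i] = grad[i]*out[i]*(1.0f - out[i]);
    }
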
@@ -365,7 +365,7 @@ namespace dlib
 dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
 params_grad.copy_size(details.get_layer_params());
 params_grad = 0;
-details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
 // Don't try to adjust the parameters if this layer doesn't have any.
 if (params_grad.size() != 0)
     solvers.top()(details, static_cast<const tensor&>(params_grad));
@@ -601,7 +601,7 @@ namespace dlib
 subnet_wrapper wsub(x, grad_final_ignored);
 params_grad.copy_size(details.get_layer_params());
 params_grad = 0;
-details.backward(get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
+details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
 // Don't try to adjust the parameters if this layer doesn't have any.
 if (params_grad.size() != 0)
     solvers.top()(details, static_cast<const tensor&>(params_grad));
@@ -1602,7 +1602,7 @@ namespace dlib
 random_noise.copy_size(l.get_layer_params());
 randomize_parameters(random_noise, 5, rnd);
 params_grad = random_noise;
-l.backward(input_grad, subnetwork, params_grad);
+l.backward(output, input_grad, subnetwork, params_grad);
 running_stats<double> rs_param, rs_data;
...
@@ -36,7 +36,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     // TODO
 }
@@ -89,7 +89,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     // compute the gradient of the parameters.
     params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
@@ -153,7 +153,7 @@ namespace dlib
 }
 template <typename SUBNET>
-void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+void backward(const tensor&, const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
 {
     const float* grad = gradient_input.host();
     const float* in = sub.get_output().host();
...
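
With the new first argument available, an element-wise activation like the one above could, as a hedged sketch (not part of this diff), take its mask from computed_output instead of sub.get_output(); for relu the two are interchangeable since an output element is positive exactly when the corresponding input is positive:

    // Hypothetical alternative body using computed_output for the relu mask.
    template <typename SUBNET>
    void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        const float* grad = gradient_input.host();
        const float* out = computed_output.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < computed_output.size(); ++i)
            out_grad[i] = (out[i] > 0) ? grad[i] : 0;
    }
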
@@ -168,6 +168,7 @@ namespace dlib
 template <typename SUBNET>
 void backward(
+    const tensor& computed_output,
     const tensor& gradient_input,
     SUBNET& sub,
     tensor& params_grad
@@ -176,7 +177,8 @@ namespace dlib
 requires
     - SUBNET implements the SUBNET interface defined at the top of this file.
     - setup() has been called.
-    - gradient_input has the same dimensions as the output of forward(sub,output).
+    - computed_output is the tensor resulting from calling forward(sub,computed_output).
+    - have_same_dimensions(gradient_input, computed_output)
     - have_same_dimensions(sub.get_gradient_input(), sub.get_output()) == true
     - have_same_dimensions(params_grad, get_layer_params()) == true
 ensures
@@ -185,8 +187,9 @@ namespace dlib
     These gradients are stored into #sub and #params_grad, respectively. To be
     precise, the gradients are taken of a function f(sub,get_layer_params())
     which is defined thusly:
-        - let OUT be the output of forward(sub,OUT).
-        - let f(sub,get_layer_params()) == dot(OUT, gradient_input)
+        - Recalling that computed_output is a function of sub and get_layer_params()
+          since it is the result of calling forward(sub,computed_output):
+          let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
     Then we define the following gradient vectors:
         - PARAMETER_GRADIENT == gradient of f(sub,get_layer_params()) with
           respect to get_layer_params().
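
As a concrete (hypothetical) instance of that definition: for a parameterless layer whose forward() simply copies its input, computed_output equals sub.get_output(), so f(sub,get_layer_params()) == dot(sub.get_output(), gradient_input), PARAMETER_GRADIENT is empty, and DATA_GRADIENT is just gradient_input. Such a layer's backward() would reduce to:

    // Hypothetical identity layer: DATA_GRADIENT == gradient_input.
    template <typename SUBNET>
    void backward(const tensor& /*computed_output*/, const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
    {
        const float* grad = gradient_input.host();
        float* out_grad = sub.get_gradient_input().host();
        for (size_t i = 0; i < gradient_input.size(); ++i)
            out_grad[i] = grad[i];
    }
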
@@ -272,7 +275,7 @@ namespace dlib
 template <typename SUBNET> void setup (const SUBNET& sub);
 template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
 const tensor& get_layer_params() const;
 tensor& get_layer_params();
 /*!
@@ -309,7 +312,7 @@ namespace dlib
 template <typename SUBNET> void setup (const SUBNET& sub);
 template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
-template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+template <typename SUBNET> void backward(const tensor& computed_output, const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
 const tensor& get_layer_params() const;
 tensor& get_layer_params();
 /*!
...