Commit c1433b3d authored by Davis King

Upgrade the layer interface so that you can implement layers that operate
in-place.

parent 69490292
@@ -61,7 +61,171 @@ namespace dlib
return std::make_tuple(std::get<indices>(item)...);
}
template <typename T> struct alwaysbool { typedef bool type; };
resizable_tensor& rt();
// The significance of a layer's backward method requiring forward's outputs is
// that such a layer can't have an in-place layer stacked on top of it because
// in-place layers overwrite the output of the layer they sit on top of.
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto backward_requires_forward_output(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto has_inplace_backward(
layer_type& layer,
SUBNET& sub
) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
constexpr auto is_inplace_layer(
layer_type& layer,
const SUBNET& sub
) -> typename alwaysbool<decltype(layer.forward(sub,rt()))>::type
{
return false;
}
template <typename layer_type, typename SUBNET>
constexpr auto is_inplace_layer(
layer_type& layer,
const SUBNET& sub
) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
{
return true;
}
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& computed_output,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
{
layer.backward(computed_output,gradient_input,sub,params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& ,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward(gradient_input,sub,params_grad))
{
layer.backward(gradient_input,sub,params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_backward(
layer_type& layer,
const tensor& computed_output,
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
{
layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
tensor& data_output
) -> decltype(layer.forward(sub,rt()))
{
// This overload of call_layer_forward() is here because this template
// naturally gets instantiated but only on code paths that never get executed.
// So rather than writing a bunch of hard to read template magic around call
// sites we just have this overload that doesn't do anything (and an assert to
// make sure that's the case).
DLIB_CASSERT(false, "This should never happen");
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
resizable_tensor& data_output
) -> decltype(layer.forward(sub,data_output))
{
layer.forward(sub,data_output);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
tensor& data_output
) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
{
layer.forward_inplace(sub.get_output(),data_output);
}
template <typename layer_type, typename SUBNET>
auto call_layer_forward(
layer_type& layer,
const SUBNET& sub,
resizable_tensor& data_output
) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
{
if (!have_same_dimensions(data_output, sub.get_output()))
data_output.copy_size(sub.get_output());
layer.forward_inplace(sub.get_output(),data_output);
}
} // end namespace impl
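The overload tricks above can be dense on a first read; the following is a minimal standalone sketch of the same idiom, separate from the commit (the names tensor_stub, subnet_stub, and needs_forward_output are invented for illustration). The trailing return type names an expression, so an overload silently drops out of overload resolution whenever that expression does not compile, and the surviving constexpr overload reports which backward() signature a layer provides.
#include <iostream>
// Stand-ins for dlib's tensor and subnet types (sketch only, not dlib code).
struct tensor_stub {};
tensor_stub& ts();   // declared only; used inside unevaluated decltype expressions
template <typename T> struct alwaysbool { typedef bool type; };
struct subnet_stub {};
// A layer whose backward() wants the computed output.
struct layer_wants_output
{
    void backward(const tensor_stub&, const tensor_stub&, subnet_stub&, tensor_stub&) {}
};
// A layer whose backward() omits the computed output.
struct layer_skips_output
{
    void backward(const tensor_stub&, subnet_stub&, tensor_stub&) {}
};
// Selected only if layer.backward(out, grad, sub, params) compiles.
template <typename layer_type, typename SUBNET>
constexpr auto needs_forward_output(layer_type& layer, SUBNET& sub)
    -> typename alwaysbool<decltype(layer.backward(ts(), ts(), sub, ts()))>::type
{
    return true;
}
// Selected only if layer.backward(grad, sub, params) compiles.
template <typename layer_type, typename SUBNET>
constexpr auto needs_forward_output(layer_type& layer, SUBNET& sub)
    -> typename alwaysbool<decltype(layer.backward(ts(), sub, ts()))>::type
{
    return false;
}
int main()
{
    layer_wants_output a;
    layer_skips_output b;
    subnet_stub s;
    std::cout << needs_forward_output(a, s) << " "   // prints 1
              << needs_forward_output(b, s) << "\n"; // prints 0
}
Both needs_forward_output overloads have the same parameter list; only the decltype in the trailing return type decides which one survives, so the choice happens entirely at compile time with no runtime cost.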
template <typename Head, typename... Tail>
std::tuple<Tail...> tuple_tail(
@@ -162,7 +326,7 @@ namespace dlib
namespace dimpl
{
template <typename T, bool is_first = true, typename enabled=void>
class subnet_wrapper
{
/*!
@@ -173,6 +337,13 @@ namespace dlib
objects to the layer callbacks those callbacks won't be able to
interact with the subnetworks in a way other than specified
by the SUBNET interface spec.
We also allow the top layer of a subnet_wrapper stack to call the
private_get_output() and private_get_gradient_input() functions. This
way, layers that have had their output/gradient overwritten by in-place
layers can only be accessed from the in-place layers that sit directly
on top of them since those in-place layers are the only layers that
know how to interact with them properly.
!*/
public:
@@ -185,7 +356,31 @@ namespace dlib
};
template <typename T>
class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
{
public:
subnet_wrapper(const subnet_wrapper&) = delete;
subnet_wrapper& operator=(const subnet_wrapper&) = delete;
typedef T wrapped_type;
const static size_t num_layers = T::num_layers;
subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}
const tensor& get_output() const { return l.private_get_output(); }
tensor& get_gradient_input() { return l.private_get_gradient_input(); }
const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }
private:
T& l;
subnet_wrapper<typename T::subnet_type,false> subnetwork;
};
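As an aside, here is a tiny standalone sketch of the access pattern the comment above describes (not dlib code; wrapper, layer_like, and the getter name are made up): the wrapped class keeps its buffer behind a private getter and befriends the wrapper template, so only the wrapper sitting directly on top can reach it, while ordinary user code cannot.
#include <cstdio>
template <typename T> class wrapper;   // forward declaration so layer_like can befriend it
class layer_like
{
private:
    float buffer = 42;                                  // imagine this is the cached output tensor
    float& private_get_output() { return buffer; }      // only friends may call this
    template <typename T> friend class wrapper;
};
template <typename T>
class wrapper
{
public:
    explicit wrapper(T& l_) : l(l_) {}
    // The wrapper is a friend of layer_like, so it can forward to the private getter.
    float& get_output() { return l.private_get_output(); }
private:
    T& l;
};
int main()
{
    layer_like l;
    wrapper<layer_like> w(l);
    std::printf("%g\n", w.get_output());   // prints 42
    // l.private_get_output();             // error: private, only the wrapper may call it
}
dlib's real subnet_wrapper does the same thing via the private_get_output()/private_get_gradient_input() members and the friend declaration added to add_layer below.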
template <typename T>
class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
{
public:
@@ -231,8 +426,11 @@ namespace dlib
add_layer(
):
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
add_layer(const add_layer&) = default;
@@ -242,6 +440,8 @@ namespace dlib
template <typename T, typename U, typename E>
friend class add_layer;
template <typename T, bool is_first, typename E>
friend class dimpl::subnet_wrapper;
// Allow copying networks from one to another as long as their corresponding
// layers can be constructed from each other.
@@ -253,9 +453,12 @@ namespace dlib
details(item.layer_details()),
this_layer_setup_called(item.this_layer_setup_called),
gradient_input_is_stale(item.gradient_input_is_stale),
get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
x_grad(item.x_grad),
cached_output(item.cached_output)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T>
@@ -266,8 +469,11 @@ namespace dlib
details(layer_det),
subnetwork(std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T>
@@ -278,8 +484,11 @@ namespace dlib
details(std::move(layer_det)),
subnetwork(std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T, typename ...U>
@@ -290,8 +499,11 @@ namespace dlib
details(std::get<0>(layer_det)),
subnetwork(tuple_tail(layer_det),std::forward<T>(args)...),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{
if (this_layer_operates_inplace())
subnetwork.disable_output_and_gradient_getters();
}
template <typename ...T, typename ...U>
@@ -343,21 +555,54 @@ namespace dlib
details.setup(wsub);
this_layer_setup_called = true;
}
if (this_layer_operates_inplace())
impl::call_layer_forward(details, wsub, private_get_output());
else
impl::call_layer_forward(details, wsub, cached_output);
gradient_input_is_stale = true;
return private_get_output();
}
private:
tensor& private_get_output() const
{
if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
return subnetwork.private_get_output();
else
return const_cast<resizable_tensor&>(cached_output);
}
tensor& private_get_gradient_input()
{
if (this_layer_operates_inplace())
{
return subnetwork.private_get_gradient_input();
}
else
{
if (gradient_input_is_stale)
{
gradient_input_is_stale = false;
x_grad.copy_size(private_get_output());
x_grad = 0;
}
return x_grad;
}
}
void disable_output_and_gradient_getters (
) { get_output_and_gradient_input_disabled = true; }
public:
const tensor& get_output() const
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
return private_get_output();
}
tensor& get_gradient_input()
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
return private_get_gradient_input();
} }
template <typename solver_type>
@@ -365,11 +610,13 @@ namespace dlib
{
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any. // Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0) if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad)); solvers.top()(details, static_cast<const tensor&>(params_grad));
subnetwork.update(x, solvers.pop()); subnetwork.update(x, solvers.pop());
gradient_input_is_stale = true;
} }
const subnet_type& subnet() const { return subnetwork; }
@@ -396,6 +643,7 @@ namespace dlib
serialize(item.details, out);
serialize(item.this_layer_setup_called, out);
serialize(item.gradient_input_is_stale, out);
serialize(item.get_output_and_gradient_input_disabled, out);
serialize(item.x_grad, out);
serialize(item.cached_output, out);
}
@@ -410,18 +658,34 @@ namespace dlib
deserialize(item.details, in);
deserialize(item.this_layer_setup_called, in);
deserialize(item.gradient_input_is_stale, in);
deserialize(item.get_output_and_gradient_input_disabled, in);
deserialize(item.x_grad, in);
deserialize(item.cached_output, in);
}
private:
bool this_layer_operates_inplace(
)
{
// This layer can run in-place if it's an in-place capable layer and also if
// the layer it's on top of doesn't need its own output tensor (since in-place
// layers overwrite that tensor)
return impl::is_inplace_layer(details, subnetwork) && !subnetwork.this_layer_requires_forward_output();
}
bool this_layer_requires_forward_output(
)
{
return impl::backward_requires_forward_output(details, subnetwork);
}
void swap(add_layer& item)
{
std::swap(subnetwork,item.subnetwork);
std::swap(details, item.details);
std::swap(this_layer_setup_called, item.this_layer_setup_called);
std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
std::swap(x_grad, item.x_grad);
std::swap(cached_output, item.cached_output);
}
@@ -431,6 +695,10 @@ namespace dlib
LAYER_DETAILS details;
bool this_layer_setup_called;
bool gradient_input_is_stale;
bool get_output_and_gradient_input_disabled;
// Note that if this_layer_operates_inplace()==true then x_grad and cached_output
// are not used at all. Instead, this layer uses these variables from the lower
// layer.
resizable_tensor x_grad;
resizable_tensor cached_output;
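As a small illustration of the note above (a standalone sketch, not dlib code): when a layer runs in-place its output is simply a reference to the lower layer's buffer, so no separate x_grad or cached_output storage is ever touched.
#include <cstdio>
#include <vector>
int main()
{
    // Buffer owned by the lower layer (think: its cached_output).
    std::vector<float> lower_output = {-1.0f, 2.0f, -3.0f, 4.0f};
    // An in-place layer stacked on top does not allocate its own output;
    // it works directly in the lower layer's buffer.
    std::vector<float>& top_output = lower_output;
    for (float& v : top_output)
        v = v > 0 ? v : 0;   // e.g. relu, overwriting in place
    // Same storage, no extra memory was used for the top layer's output.
    std::printf("same buffer: %s\n",
                lower_output.data() == top_output.data() ? "yes" : "no");
    for (float v : lower_output)
        std::printf("%g ", v);   // prints 0 2 0 4
    std::printf("\n");
}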
@@ -461,7 +729,8 @@ namespace dlib
add_layer(
):
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(const add_layer&) = default;
@@ -471,6 +740,8 @@ namespace dlib
template <typename T, typename U, typename E>
friend class add_layer;
template <typename T, bool is_first, typename E>
friend class dimpl::subnet_wrapper;
// Allow copying networks from one to another as long as their corresponding
// layers can be constructed from each other.
@@ -482,6 +753,7 @@ namespace dlib
details(item.layer_details()),
this_layer_setup_called(item.this_layer_setup_called),
gradient_input_is_stale(item.gradient_input_is_stale),
get_output_and_gradient_input_disabled(false),
x_grad(item.x_grad),
cached_output(item.cached_output)
{
@@ -492,7 +764,8 @@ namespace dlib
) :
details(layer_det),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
@@ -500,7 +773,8 @@ namespace dlib
) :
details(std::move(layer_det)),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
@@ -510,7 +784,8 @@ namespace dlib
details(std::move(layer_det)),
input_layer(std::move(il)),
this_layer_setup_called(false),
gradient_input_is_stale(true),
get_output_and_gradient_input_disabled(false)
{}
add_layer(
@@ -577,33 +852,50 @@ namespace dlib
details.setup(wsub);
this_layer_setup_called = true;
}
impl::call_layer_forward(details, wsub, cached_output);
gradient_input_is_stale = true;
return private_get_output();
}
private:
tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
tensor& private_get_gradient_input()
{
if (gradient_input_is_stale)
{
gradient_input_is_stale = false;
x_grad.copy_size(private_get_output());
x_grad = 0;
}
return x_grad;
}
void disable_output_and_gradient_getters (
) { get_output_and_gradient_input_disabled = true; }
public:
const tensor& get_output() const
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
return private_get_output();
}
tensor& get_gradient_input()
{
if (get_output_and_gradient_input_disabled)
throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
return private_get_gradient_input();
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
{
subnet_wrapper wsub(x, grad_final_ignored);
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
gradient_input_is_stale = true;
}
const subnet_type& subnet() const { return input_layer; }
@@ -630,6 +922,7 @@ namespace dlib
serialize(item.details, out);
serialize(item.this_layer_setup_called, out);
serialize(item.gradient_input_is_stale, out);
serialize(item.get_output_and_gradient_input_disabled, out);
serialize(item.x_grad, out);
serialize(item.cached_output, out);
}
@@ -644,12 +937,20 @@ namespace dlib
deserialize(item.details, in);
deserialize(item.this_layer_setup_called, in);
deserialize(item.gradient_input_is_stale, in);
deserialize(item.get_output_and_gradient_input_disabled, in);
deserialize(item.x_grad, in);
deserialize(item.cached_output, in);
}
private:
bool this_layer_requires_forward_output(
)
{
subnet_wrapper wsub(grad_final_ignored, grad_final_ignored);
return impl::backward_requires_forward_output(details, wsub);
}
class subnet_wrapper
{
public:
@@ -685,6 +986,7 @@ namespace dlib
std::swap(details, item.details);
std::swap(this_layer_setup_called, item.this_layer_setup_called);
std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
std::swap(x_grad, item.x_grad);
std::swap(cached_output, item.cached_output);
}
@@ -693,6 +995,7 @@ namespace dlib
LAYER_DETAILS details;
bool this_layer_setup_called;
bool gradient_input_is_stale;
bool get_output_and_gradient_input_disabled;
resizable_tensor x_grad;
resizable_tensor cached_output;
@@ -1493,10 +1796,13 @@ namespace dlib
}
tensor& get_mutable_output() { return output; }
const tensor& get_output() const { return output; }
const tensor& private_get_output() const { return get_output(); }
const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }
tensor& get_gradient_input() { return gradient_input; }
tensor& private_get_gradient_input() { return get_gradient_input(); }
test_layer_subnet& subnet() { init_sub(); return *subnetwork; }
@@ -1578,7 +1884,7 @@ namespace dlib
// (since we do a lazy layer creation thing based on calls to subnet() inside
// test_layer_subnet).
l.setup(subnetwork);
impl::call_layer_forward(l, subnetwork, output);
resizable_tensor input_grad;
input_grad.copy_size(output);
@@ -1605,11 +1911,71 @@ namespace dlib
// comparing them to a central differences approximation.
resizable_tensor params_grad;
params_grad.copy_size(l.get_layer_params());
// But first, set the params grad to something crazy so that it's very obvious if
// it doesn't get fully assigned.
params_grad = std::numeric_limits<float>::infinity();
impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);
static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
"Layer not defined correctly. forward and backward methods must either both be in-place or both out-of-place. ");
// Make sure the outputs of forward() and backward() are the same when they are run
// in in-place mode.
if (impl::is_inplace_layer(l, subnetwork))
{
test_layer_subnet subnetwork2(rnd);
layer_details_type ll(l);
ll.setup(subnetwork2);
resizable_tensor ip_out;
impl::call_layer_forward(ll, subnetwork2, ip_out);
impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
if (forward_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
return layer_test_results(sout.str());
}
resizable_tensor params_grad;
params_grad.copy_size(ll.get_layer_params());
params_grad = std::numeric_limits<float>::infinity();
resizable_tensor input_grad;
input_grad.copy_size(ip_out);
fill_with_gassuan_random_numbers(input_grad, rnd);
resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
params_grad1 = params_grad;
params_grad2 = params_grad;
// Now call backward() and make sure it works as well.
subnetwork2.get_gradient_input() = 9999;
impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
data_grad1 = subnetwork2.get_gradient_input();
subnetwork2.get_gradient_input() = mat(input_grad);
impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
data_grad2 = subnetwork2.get_gradient_input();
if (params_grad.size() != 0)
{
const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
if (backward_param_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
return layer_test_results(sout.str());
}
}
const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
if (backward_data_error > 0.00001)
{
using namespace std;
sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
return layer_test_results(sout.str());
}
}
// ==================================================================
// first validate the way the parameter gradients are computed
@@ -1622,9 +1988,10 @@ namespace dlib
eps = base_eps;
const float oldval = l1.get_layer_params().host()[i];
l1.get_layer_params().host()[i] = oldval+eps;
impl::call_layer_forward(l1, subnetwork, out2);
l1.get_layer_params().host()[i] = oldval-eps;
impl::call_layer_forward(l1, subnetwork, out3);
l1.get_layer_params().host()[i] = oldval;
// Compute a reference derivative via a central differences approximation and
// compare it to the one output by the layer and make sure they match.
@@ -1635,8 +2002,8 @@ namespace dlib
{
using namespace std;
sout << "Gradient error in parameter #" << i <<". Relative error: "<< relative_error << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
return layer_test_results(sout.str());
}
@@ -1651,21 +2018,24 @@ namespace dlib
if (eps == 0)
eps = base_eps;
subnetwork.get_output_element(i) = oldval+eps;
impl::call_layer_forward(l, subnetwork, out2);
subnetwork.get_output_element(i) = oldval-eps;
impl::call_layer_forward(l, subnetwork, out3);
subnetwork.get_output_element(i) = oldval;
// Compute a reference derivative via a central differences approximation and
// compare it to the one output by the layer and make sure they match.
double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
double output_derivative = subnetwork.get_gradient_input_element(i);
if (!impl::is_inplace_layer(l,subnetwork))
output_derivative -= initial_gradient_input[i];
double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
if (std::abs(relative_error) > 0.01)
{
using namespace std;
sout << "Gradient error in data variable #" << i <<". Relative error: "<< relative_error << endl;
sout << "expected derivative: " << reference_derivative << endl;
sout << "output derivative: " << output_derivative << endl;
return layer_test_results(sout.str());
}
}
......
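For readers unfamiliar with the test above, here is a minimal standalone sketch of the central-differences idea it relies on (not dlib code; a one-variable toy function stands in for a layer):
#include <cmath>
#include <cstdio>
int main()
{
    auto f  = [](double x) { return x*x*x; };   // toy "layer": f(x) = x^3
    auto df = [](double x) { return 3*x*x; };   // its analytic derivative
    const double x = 0.7;
    const double eps = 1e-4;
    // Central differences: slope estimated from f evaluated at x+eps and x-eps.
    const double reference_derivative = (f(x + eps) - f(x - eps)) / (2*eps);
    const double output_derivative    = df(x);
    const double relative_error =
        (reference_derivative - output_derivative) / (reference_derivative + 1e-100);
    // For a correct derivative the relative error is tiny (here on the order of 1e-8).
    std::printf("relative error: %g\n", std::fabs(relative_error));
}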
@@ -389,6 +389,7 @@ namespace dlib
ensures
- Back propagates the error gradient, get_gradient_input(), through this
network and uses the provided solvers to update the network parameters.
- All elements of #get_gradient_input() are set to 0.
!*/
void clean(
......
@@ -36,7 +36,7 @@ namespace dlib
}
template <typename SUBNET>
void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// TODO
}
...@@ -89,7 +89,7 @@ namespace dlib ...@@ -89,7 +89,7 @@ namespace dlib
} }
template <typename SUBNET> template <typename SUBNET>
void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad) void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
{
// compute the gradient of the parameters.
params_grad = trans(mat(sub.get_output()))*mat(gradient_input);
@@ -145,20 +145,22 @@ namespace dlib
{
}
void forward_inplace(const tensor& input, tensor& output)
{
output = lowerbound(mat(input), 0);
}
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor& params_grad
)
{
const float* grad = gradient_input.host();
const float* in = computed_output.host();
float* out = data_grad.host();
for (unsigned long i = 0; i < computed_output.size(); ++i)
{
if (in[i] > 0)
out[i] = grad[i];
......
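The relu conversion above is a good illustration of why backward_inplace() receives the computed output rather than the input: for relu the gradient mask only depends on where the output is positive, and max(0,x) preserves exactly that information. A standalone sketch of the same math (not dlib code, plain std::vector in place of tensor):
#include <algorithm>
#include <cstdio>
#include <vector>
// Overwrite the buffer with max(0, x); in-place means input and output may alias.
void relu_forward_inplace(std::vector<float>& data)
{
    for (float& v : data)
        v = std::max(v, 0.0f);
}
// The gradient only depends on whether the output was positive, so the computed
// output is enough; the original input is not needed.
void relu_backward_inplace(const std::vector<float>& computed_output,
                           const std::vector<float>& gradient_input,
                           std::vector<float>& data_grad)
{
    for (std::size_t i = 0; i < computed_output.size(); ++i)
        data_grad[i] = computed_output[i] > 0 ? gradient_input[i] : 0;
}
int main()
{
    std::vector<float> x = {-1.0f, 2.0f, -3.0f, 4.0f};
    relu_forward_inplace(x);                       // x becomes {0, 2, 0, 4}
    std::vector<float> grad_in = {1, 1, 1, 1};
    std::vector<float> grad_out(x.size());
    relu_backward_inplace(x, grad_in, grad_out);   // grad_out becomes {0, 1, 0, 1}
    for (float g : grad_out)
        std::printf("%g ", g);
    std::printf("\n");
}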
@@ -91,12 +91,28 @@ namespace dlib
produces an output tensor. You create an entire deep network by composing
these functions. Importantly, you are able to use a wide range of
different functions to accommodate the task you are trying to accomplish.
Therefore, dlib includes a number of common layer types but if you want to
define your own then you simply implement a class with the same interface
as EXAMPLE_LAYER_.
Note that there is no dlib::EXAMPLE_LAYER_ type. It is shown here purely
to document the interface that a layer object must implement.
The central work of defining a layer is implementing the forward and backward
methods. When you do this you have three options:
- Implement the forward() and backward() methods according to the
specification shown below. Do not implement forward_inplace() and
backward_inplace().
- Implement the forward() and backward() methods according to the
specification shown below, except exclude the computed_output
parameter from backward(). Doing this will allow dlib to make some
layers execute in-place and therefore run a little faster and use
less memory. Do not implement forward_inplace() and
backward_inplace().
- Implement the forward_inplace() and backward_inplace() methods
according to the specification shown below. Do not implement
forward() and backward(). These in-place methods allow some types of
layers to be implemented more efficiently.
!*/
public:
@@ -152,7 +168,7 @@ namespace dlib
template <typename SUBNET>
void forward(
const SUBNET& sub,
resizable_tensor& data_output
);
/*!
requires
@@ -160,14 +176,14 @@ namespace dlib
- setup() has been called.
ensures
- Runs the output of the subnetwork through this layer and stores the
results into #data_output. In particular, forward() can use any of the
outputs in sub (e.g. sub.get_output(), sub.subnet().get_output(), etc.)
to compute whatever it wants.
!*/
template <typename SUBNET>
void backward(
const tensor& computed_output, // this parameter is optional
const tensor& gradient_input,
SUBNET& sub,
tensor& params_grad
@@ -189,7 +205,7 @@ namespace dlib
These gradients are stored into #sub and #params_grad, respectively. To be
precise, the gradients are taken of a function f(sub,get_layer_params())
which is defined thusly:
- Recalling that computed_output is a function of both sub and get_layer_params(),
since it is the result of calling forward(sub,computed_output):
let f(sub,get_layer_params()) == dot(computed_output, gradient_input)
Then we define the following gradient vectors:
@@ -207,6 +223,59 @@ namespace dlib
- layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
!*/
void forward_inplace(
const tensor& data_input,
tensor& data_output
);
/*!
requires
- have_same_dimensions(data_input,data_output) == true
- setup() has been called.
ensures
- Runs the data_input tensor though this layer and stores the output into
#data_output.
- This function supports in-place operation, i.e. having
is_same_object(data_input, data_output)==true
!*/
void backward_inplace(
const tensor& computed_output,
const tensor& gradient_input,
tensor& data_grad,
tensor& params_grad
);
/*!
requires
- setup() has been called.
- computed_output is the tensor resulting from the most recent call to
forward_inplace(). This means that backward_inplace() is allowed to
cache intermediate results computed during forward_inplace() and use them
for the backward computation.
- have_same_dimensions(gradient_input, data_grad) == true
- have_same_dimensions(gradient_input, computed_output) == true
- have_same_dimensions(params_grad, get_layer_params()) == true
ensures
- This function supports in-place operation, i.e. having
is_same_object(gradient_input, data_grad)==true
- This function outputs the gradients of this layer with respect to the
input data from a sublayer and also with respect to this layer's parameters.
These gradients are stored into #data_grad and #params_grad, respectively. To be
precise, the gradients are taken of a function f(data_input,get_layer_params())
which is defined thusly:
- Recalling that computed_output is a function of both the input to
forward_inplace() and get_layer_params(), since it is the result of
calling forward_inplace(data_input,computed_output):
let f(data_input,get_layer_params()) == dot(computed_output, gradient_input)
Then we define the following gradient vectors:
- PARAMETER_GRADIENT == gradient of f(data_input,get_layer_params()) with
respect to get_layer_params().
- DATA_GRADIENT == gradient of f(data_input,get_layer_params()) with respect
to data_input.
Finally, backward_inplace() outputs these gradients by performing:
- params_grad = PARAMETER_GRADIENT
- data_grad = DATA_GRADIENT
!*/
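To make the forward_inplace()/backward_inplace() contract above concrete, here is a hedged standalone toy that follows it (not the dlib API; toy_tensor and toy_multiply_layer are invented stand-ins): a layer that multiplies its input by a single learned scale, written in the in-place style.
#include <cstdio>
#include <vector>
using toy_tensor = std::vector<float>;   // stand-in for dlib::tensor
class toy_multiply_layer
{
public:
    explicit toy_multiply_layer(float scale) : params(1, scale) {}
    // Allowed to alias: &data_input may equal &data_output.
    void forward_inplace(const toy_tensor& data_input, toy_tensor& data_output)
    {
        for (std::size_t i = 0; i < data_input.size(); ++i)
            data_output[i] = params[0]*data_input[i];
    }
    // computed_output is the result of the most recent forward_inplace() call.
    void backward_inplace(const toy_tensor& computed_output,
                          const toy_tensor& gradient_input,
                          toy_tensor& data_grad,
                          toy_tensor& params_grad)
    {
        // Parameter gradient first, since data_grad may alias gradient_input.
        // d f/d scale = dot(data_input, gradient_input); data_input can be
        // recovered from computed_output because this op is invertible.
        float g = 0;
        for (std::size_t i = 0; i < gradient_input.size(); ++i)
            g += (computed_output[i]/params[0])*gradient_input[i];
        params_grad[0] = g;
        // d f/d data_input = scale * gradient_input
        for (std::size_t i = 0; i < gradient_input.size(); ++i)
            data_grad[i] = params[0]*gradient_input[i];
    }
    const toy_tensor& get_layer_params() const { return params; }
private:
    toy_tensor params;   // a single scale value
};
int main()
{
    toy_multiply_layer l(2.0f);
    toy_tensor x = {1, 2, 3};
    l.forward_inplace(x, x);                                 // in place: x becomes {2, 4, 6}
    toy_tensor grad_in = {1, 1, 1};
    toy_tensor params_grad(1);
    l.backward_inplace(x, grad_in, grad_in, params_grad);    // data_grad aliases gradient_input
    std::printf("params_grad = %g\n", params_grad[0]);       // 1 + 2 + 3 = 6
    std::printf("data_grad   = %g %g %g\n", grad_in[0], grad_in[1], grad_in[2]);  // 2 2 2
}
Note the ordering inside backward_inplace(): the parameter gradient is computed before data_grad is written, since the contract above allows data_grad to alias gradient_input.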
const tensor& get_layer_params(
) const;
/*!
@@ -277,7 +346,7 @@ namespace dlib
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
@@ -313,8 +382,8 @@ namespace dlib
);
template <typename SUBNET> void setup (const SUBNET& sub);
void forward_inplace(const tensor& input, tensor& output);
void backward_inplace(const tensor& computed_output, const tensor& gradient_input, tensor& data_grad, tensor& params_grad);
const tensor& get_layer_params() const;
tensor& get_layer_params();
/*!
......