Commit d85de930 authored by Davis King

Split the update() methods into two parts: one that computes gradients
with respect to the parameters and one that updates the parameters with
those gradients.
parent 8c64a656
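In practice the split turns one call into a two-phase training step. A minimal sketch of the new call sequence at the loss-network level, mirroring the trainer change at the bottom of this diff (net, samples, labels, solvers, and step_size are hypothetical placeholders, not part of this commit):

    // Phase 1: forward pass plus backpropagation.  Returns the loss and fills each
    // layer's get_parameter_gradient() as well as get_final_data_gradient().
    double loss = net.compute_parameter_gradients(samples.begin(), samples.end(), labels.begin());
    // Phase 2: hand those gradients to the solvers, which produce parameter deltas
    // that are scaled by step_size and added to the parameters.
    net.update_parameters(make_sstack(solvers), step_size);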
......@@ -804,31 +804,42 @@ namespace dlib
const tensor& get_final_data_gradient(
) const { return subnetwork->get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x)
{
update(x,private_get_gradient_input(),solvers,step_size);
back_propagate_error(x, private_get_gradient_input());
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x, const tensor& gradient_input)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
subnetwork->back_propagate_error(x);
// zero out get_gradient_input()
gradient_input_is_stale = true;
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
tt::add(1,details.get_layer_params(), step_size, step);
}
subnetwork->update(x, solvers.pop(), step_size);
gradient_input_is_stale = true;
subnetwork->update_parameters(solvers.pop(), step_size);
}
const tensor& get_parameter_gradient(
) const { return params_grad; }
tensor& get_parameter_gradient (
) { return params_grad; }
const subnet_type& subnet() const { return *subnetwork; }
subnet_type& subnet() { return *subnetwork; }
......@@ -847,7 +858,7 @@ namespace dlib
friend void serialize(const add_layer& item, std::ostream& out)
{
int version = 1;
int version = 2;
serialize(version, out);
serialize(*item.subnetwork, out);
serialize(item.details, out);
......@@ -856,13 +867,14 @@ namespace dlib
serialize(item.get_output_and_gradient_input_disabled, out);
serialize(item.x_grad, out);
serialize(item.cached_output, out);
serialize(item.params_grad, out);
}
friend void deserialize(add_layer& item, std::istream& in)
{
int version = 0;
deserialize(version, in);
if (version != 1)
if (!(1 <= version && version <= 2))
throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
deserialize(*item.subnetwork, in);
deserialize(item.details, in);
......@@ -871,6 +883,8 @@ namespace dlib
deserialize(item.get_output_and_gradient_input_disabled, in);
deserialize(item.x_grad, in);
deserialize(item.cached_output, in);
if (version == 2)
deserialize(item.params_grad, in);
}
friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
......@@ -910,6 +924,7 @@ namespace dlib
std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
std::swap(x_grad, item.x_grad);
std::swap(cached_output, item.cached_output);
std::swap(params_grad, item.params_grad);
}
......@@ -924,10 +939,10 @@ namespace dlib
resizable_tensor x_grad;
resizable_tensor cached_output;
// The following 2 objects don't logically contribute to the state of this class.
// They are only here to prevent them from being reallocated over and over in
// member functions.
resizable_tensor params_grad;
// temp_tensor doesn't logically contribute to the state of this object.
// It is here only to prevent it from being reallocated over and over.
resizable_tensor temp_tensor;
};
......@@ -1118,16 +1133,12 @@ namespace dlib
const tensor& get_final_data_gradient(
) const { return grad_final; }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x)
{
return update(x,private_get_gradient_input(),solvers, step_size);
back_propagate_error(x, private_get_gradient_input());
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x, const tensor& gradient_input)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// make sure grad_final is initialized to 0
if (!have_same_dimensions(x, grad_final))
grad_final.copy_size(x);
......@@ -1138,15 +1149,27 @@ namespace dlib
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
// zero out get_gradient_input()
gradient_input_is_stale = true;
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
{
if (params_grad.size() != 0)
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
tt::add(1,details.get_layer_params(), step_size, step);
}
gradient_input_is_stale = true;
}
const tensor& get_parameter_gradient(
) const { return params_grad; }
tensor& get_parameter_gradient (
) { return params_grad; }
const subnet_type& subnet() const { return input_layer; }
subnet_type& subnet() { return input_layer; }
......@@ -1347,18 +1370,27 @@ namespace dlib
const tensor& get_final_data_gradient(
) const { return subnetwork.get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x)
{
subnetwork.back_propagate_error(x);
}
void back_propagate_error(const tensor& x, const tensor& gradient_input)
{
subnetwork.update(x,solvers, step_size);
subnetwork.back_propagate_error(x,gradient_input);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,gradient_input,solvers, step_size);
subnetwork.update_parameters(solvers, step_size);
}
const tensor& get_parameter_gradient(
) const { return params_grad; }
tensor& get_parameter_gradient (
) { return params_grad; }
const subnet_type& subnet() const { return subnetwork; }
subnet_type& subnet() { return subnetwork; }
......@@ -1430,6 +1462,11 @@ namespace dlib
{ return subnetwork.private_get_gradient_input(); }
subnet_type subnetwork;
// This member doesn't logically contribute to the state of the object since it is
// always empty. It's just here so we can have the get_parameter_gradient() methods
// which have to return something. So they return this empty tensor.
resizable_tensor params_grad;
};
// ----------------------------------------------------------------------------------------
......@@ -1598,32 +1635,42 @@ namespace dlib
return details[0].get_gradient_input();
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
const tensor& get_parameter_gradient(
) const { return details[0].get_parameter_gradient(); }
tensor& get_parameter_gradient (
) { return details[0].get_parameter_gradient(); }
void back_propagate_error(const tensor& x)
{
update(x,private_get_gradient_input(),solvers,step_size);
back_propagate_error(x, private_get_gradient_input());
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
void back_propagate_error(const tensor& x, const tensor& gradient_input)
{
const auto cnt = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers);
if (details.size() > 1)
{
details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
details[0].back_propagate_error(details[1].get_output(), gradient_input);
for (size_t i = 1; i < details.size(); ++i)
{
if (i+1 < details.size())
details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient());
else
details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient());
}
}
else
{
details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
details[0].back_propagate_error(subnetwork.get_output(), gradient_input);
}
subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
subnetwork.back_propagate_error(x, details.back().get_final_data_gradient());
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
{
for (size_t i = 0; i < details.size(); ++i)
details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size);
subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size);
}
const subnet_type& subnet() const { return subnetwork; }
......@@ -1827,25 +1874,19 @@ namespace dlib
return grad_final;
}
template <typename solver_type>
void update(
const tensor& /*x*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
void back_propagate_error(const tensor& /*x*/)
{
// nothing to update
// nothing to do
}
void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/)
{
// nothing to do
}
template <typename solver_type>
void update(
const tensor& /*x*/,
const tensor& /*gradient_input*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
{
// nothing to update
// nothing to do
}
const subnet_type& subnet() const { return input_layer; }
......@@ -2141,58 +2182,55 @@ namespace dlib
return compute_loss(temp_tensor);
}
template <typename label_iterator, typename solver_type>
double update (
template <typename label_iterator>
double compute_parameter_gradients (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers,
double step_size
label_iterator lbegin
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, lbegin, wsub);
subnetwork.update(x, solvers, step_size);
subnetwork.back_propagate_error(x);
return l;
}
template <typename input_iterator, typename label_iterator, typename solver_type>
double update (
template <typename input_iterator, typename label_iterator>
double compute_parameter_gradients (
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers,
double step_size
label_iterator lbegin
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, lbegin, solvers, step_size);
return compute_parameter_gradients(temp_tensor, lbegin);
}
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers,
double step_size
double compute_parameter_gradients (
const tensor& x
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, wsub);
subnetwork.update(x, solvers, step_size);
subnetwork.back_propagate_error(x);
return l;
}
template <typename input_iterator, typename solver_type>
double update (
template <typename input_iterator>
double compute_parameter_gradients (
input_iterator ibegin,
input_iterator iend,
input_iterator iend
)
{
to_tensor(ibegin,iend,temp_tensor);
return compute_parameter_gradients(temp_tensor);
}
template <typename solver_type>
void update_parameters (
sstack<solver_type> solvers,
double step_size
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, solvers, step_size);
subnetwork.update_parameters(solvers, step_size);
}
const subnet_type& subnet() const { return subnetwork; }
......@@ -2477,18 +2515,24 @@ namespace dlib
return subnetwork.get_final_data_gradient();
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void back_propagate_error(const tensor& x)
{
subnetwork.update(x,solvers);
subnetwork.back_propagate_error(x);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update_parameters(sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,gradient_input,solvers);
subnetwork.update_parameters(solvers, step_size);
}
const tensor& get_parameter_gradient(
) const { return params_grad; }
tensor& get_parameter_gradient (
) { return params_grad; }
const subnet_type& subnet() const
{
return subnetwork;
......@@ -2558,6 +2602,11 @@ namespace dlib
{ return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); }
subnet_type subnetwork;
// This member doesn't logically contribute to the state of the object since it is
// always empty. It's just here so we can have the get_parameter_gradient() methods
// which have to return something. So they return this empty tensor.
resizable_tensor params_grad;
};
template <template<typename> class T, typename U>
struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};
......
......@@ -410,33 +410,53 @@ namespace dlib
/*!
ensures
- returns the error gradient for this network. That is, this is the error
gradient that this network will use to update itself when update() is
called. Therefore, when performing back propagation, layers that sit on
top of this network layer write their back propagated error gradients
into get_gradient_input(). Or to put it another way, during back
propagation, layers take the contents of their get_gradient_input() and
back propagate it through themselves and store the results into their
subnetwork's get_gradient_input().
gradient that this network will use to compute parameter gradients when
back_propagate_error() is called. Therefore, when performing back
propagation, layers that sit on top of this network layer write their
back-propagated error gradients into get_gradient_input(). Or to put it
another way, during back-propagation, layers take the contents of their
get_gradient_input() and back-propagate it through themselves and store
the result into their subnetwork's get_gradient_input().
This means you should consider get_gradient_input() as an input to the
update() method.
back_propagate_error() method.
!*/
const tensor& get_final_data_gradient(
) const;
/*!
ensures
- if update() has been called to back-propagate a gradient through this
network then you can call get_final_data_gradient() to obtain the last
gradient computed. That is, this function returns the gradient of the
network with respect to its inputs.
- if back_propagate_error() has been called to back-propagate a gradient
through this network then you can call get_final_data_gradient() to
obtain the last data gradient computed. That is, this function returns
the gradient of the network with respect to its inputs.
- Note that there is only one "final data gradient" for an entire network,
not one per layer, since there is only one input to the entire network.
!*/
template <typename solver_type>
void update(
const tensor& x,
sstack<solver_type> solvers,
double step_size
const tensor& get_parameter_gradient(
) const;
/*!
ensures
- if back_propagate_error() has been called then you can call
get_parameter_gradient() to find the gradient of this layer's parameters.
When we update the parameters by calling update_parameters(), it will use
the gradient in get_parameter_gradient() to perform the update.
Therefore, you should consider get_parameter_gradient() as an input to
update_parameters().
!*/
tensor& get_parameter_gradient (
);
/*!
ensures
- returns a non-const reference to the tensor returned by the above
get_parameter_gradient() method. You could use this method to modify the
parameter gradient in some way before invoking update_parameters().
!*/
void back_propagate_error(
const tensor& x
);
/*!
requires
......@@ -445,28 +465,21 @@ namespace dlib
subsequently modified in any way.
- get_gradient_input() has been set equal to the gradient of this network's
output with respect to some loss function.
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- Back propagates the error gradient, get_gradient_input(), through this
network and uses the provided solvers to update the network parameters.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
network and computes parameter and data gradients, via backpropagation.
Specifically, this function populates get_final_data_gradient() and also,
for each layer, the tensor returned by get_parameter_gradient().
- All elements of #get_gradient_input() are set to 0.
- have_same_dimensions(#get_final_data_gradient(), x) == true
- have_same_dimensions(#get_final_data_gradient(), x) == true.
- have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
- #get_final_data_gradient() contains the gradient of the network with
respect to x.
!*/
template <typename solver_type>
void update(
void back_propagate_error(
const tensor& x,
const tensor& gradient_input,
sstack<solver_type> solvers,
double step_size
const tensor& gradient_input
);
/*!
requires
......@@ -474,27 +487,45 @@ namespace dlib
Moreover, this was the most recent call to forward() and x has not been
subsequently modified in any way.
- have_same_dimensions(gradient_input, get_output()) == true
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- This function is identical to the version of update() defined immediately
above except that it back-propagates gradient_input through the network
instead of get_gradient_input(). Therefore, this version of update is
equivalent to performing:
- This function is identical to the version of back_propagate_error()
defined immediately above except that it back-propagates gradient_input
through the network instead of get_gradient_input(). Therefore, this
version of back_propagate_error() is equivalent to performing:
get_gradient_input() = gradient_input;
update(x,solvers);
Except that calling update(x,gradient_input,solvers) avoids the copy
and is therefore slightly more efficient.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
back_propagate_error(x);
Except that calling back_propagate_error(x,gradient_input) avoids the
copy and is therefore slightly more efficient.
- All elements of #get_gradient_input() are set to 0.
- have_same_dimensions(#get_final_data_gradient(), x) == true.
- have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
- #get_final_data_gradient() contains the gradient of the network with
respect to x.
!*/
template <typename solver_type>
void update_parameters(
sstack<solver_type> solvers,
double step_size
);
/*!
requires
- solver_type is an implementation of the EXAMPLE_SOLVER interface defined
in solvers_abstract.h
- back_propagate_error() has been called.
- The given solvers have only ever been used with this network. That is,
if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
!*/
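A minimal sketch of how these members fit together on a plain (non-loss) network object (net, x, solvers, and step_size are hypothetical placeholders):

    net.forward(x);                   // forward pass; makes get_output() available
    // ... write the gradient of the loss with respect to net.get_output()
    //     into net.get_gradient_input() ...
    net.back_propagate_error(x);      // fills get_final_data_gradient() and, for each
                                      // layer, get_parameter_gradient()
    net.update_parameters(make_sstack(solvers), step_size);   // apply the solvers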
void clean(
);
/*!
......@@ -831,12 +862,10 @@ namespace dlib
// -------------
template <typename label_iterator, typename solver_type>
double update (
template <typename label_iterator>
double compute_parameter_gradients (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers,
double step_size
label_iterator lbegin
);
/*!
requires
......@@ -844,31 +873,22 @@ namespace dlib
- x.num_samples() > 0
- lbegin == iterator pointing to the start of a range of
x.num_samples()/sample_expansion_factor label_type elements.
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- runs x through the network, compares the output to the expected output
pointed to by lbegin, and updates the network parameters via
backpropagation.
pointed to by lbegin, and computes parameter and data gradients with
respect to the loss, via backpropagation. Specifically, this function
updates get_final_data_gradient() and also, for each layer, the tensor
returned by get_parameter_gradient().
- for all valid k:
- the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(x,lbegin)
!*/
template <typename input_iterator, typename label_iterator, typename solver_type>
double update (
template <typename input_iterator, typename label_iterator>
double compute_parameter_gradients (
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers,
double step_size
label_iterator lbegin
);
/*!
requires
......@@ -876,77 +896,72 @@ namespace dlib
- std::distance(ibegin,iend) > 0
- lbegin == iterator pointing to the start of a range of
std::distance(ibegin,iend) label_type elements.
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network, compares the output to the
expected output pointed to by lbegin, and updates the network parameters
via backpropagation.
expected output pointed to by lbegin, and computes parameter and data
gradients with respect to the loss, via backpropagation. Specifically,
this function updates get_final_data_gradient() and also, for each layer,
the tensor returned by get_parameter_gradient().
- for all valid k:
- the expected label of *(ibegin+k) is *(lbegin+k).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend,lbegin)
!*/
// -------------
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers,
double step_size
double compute_parameter_gradients (
const tensor& x
);
/*!
requires
- LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
- x.num_samples()%sample_expansion_factor == 0
- x.num_samples() > 0
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- runs x through the network and updates the network parameters by
back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- runs x through the network and computes parameter and data gradients with
respect to the loss, via backpropagation. Specifically, this function
updates get_final_data_gradient() and also, for each layer, the tensor
returned by get_parameter_gradient().
- returns compute_loss(x)
!*/
template <typename input_iterator, typename solver_type>
double update (
template <typename input_iterator>
double compute_parameter_gradients (
input_iterator ibegin,
input_iterator iend,
sstack<solver_type> solvers,
double step_size
input_iterator iend
);
/*!
requires
- LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
- [ibegin, iend) is an iterator range over input_type objects.
- std::distance(ibegin,iend) > 0
ensures
- runs [ibegin,iend) through the network and computes parameter and data
gradients with respect to the loss, via backpropagation. Specifically,
this function updates get_final_data_gradient() and also, for each layer,
the tensor returned by get_parameter_gradient().
- returns compute_loss(ibegin,iend)
!*/
template <typename solver_type>
void update_parameters (
sstack<solver_type> solvers,
double step_size
);
/*!
requires
- solver_type is an implementation of the EXAMPLE_SOLVER interface defined
in solvers_abstract.h
- compute_parameter_gradients() has been called.
- The given solvers have only ever been used with this network. That
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
is, if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network and updates the network parameters
by back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend)
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
!*/
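For an unsupervised loss (i.e. label_type == no_label_type) the same two-phase pattern applies, just without labels. A sketch with hypothetical placeholders net, x, solvers, and step_size, where x is a tensor of input samples:

    double loss = net.compute_parameter_gradients(x);         // backprop; fills the gradients
    net.update_parameters(make_sstack(solvers), step_size);   // apply the solvers to them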
// -------------
......
......@@ -418,14 +418,16 @@ namespace dlib
template <typename T>
void run_update(job_t& next_job, const T&)
{
double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
double loss = net.compute_parameter_gradients(next_job.t, next_job.labels.begin());
net.update_parameters(make_sstack(solvers),step_size);
record_loss(loss);
}
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_which_run_update;
double loss = net.update(next_job.t, make_sstack(solvers), step_size);
double loss = net.compute_parameter_gradients(next_job.t);
net.update_parameters(make_sstack(solvers), step_size);
record_loss(loss);
}
......@@ -438,8 +440,9 @@ namespace dlib
job_t next_job;
while(job_pipe.dequeue(next_job))
{
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
// call net.compute_parameter_gradients() and net.update_parameters() but
// pick the right version for unsupervised or supervised training based on
// the type of label_type.
run_update(next_job, pick_which_run_update);
// If we have been running for a while then check if the loss is still
......