Commit d85de930 authored by Davis King

Split the update() methods into two parts: one that computes gradients with
respect to parameters and one that updates the parameters with those gradients.
parent 8c64a656
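In practical terms, training code that previously made a single update() call now makes a gradient-computation call followed by a parameter-update call. A minimal sketch of a supervised training step under the new API; the helper name, network type, and solver container below are illustrative placeholders, not part of this commit:

    #include <dlib/dnn.h>

    // Hypothetical helper: "net_type" is any dlib loss network, "solvers" holds one
    // solver per computational layer, and "labels" iterates over the batch's labels.
    template <typename net_type, typename label_iterator>
    double train_one_batch(
        net_type& net,
        std::vector<dlib::sgd>& solvers,
        const dlib::tensor& x,
        label_iterator labels,
        double step_size
    )
    {
        // Before this commit, one call did everything:
        //   double loss = net.update(x, labels, dlib::make_sstack(solvers), step_size);

        // After this commit the two phases are explicit.  First run the forward pass
        // and backpropagation; each layer stores its gradient in the tensor returned
        // by its get_parameter_gradient().
        double loss = net.compute_parameter_gradients(x, labels);

        // Then let the solvers turn the stored gradients into parameter updates,
        // scaled by step_size.
        net.update_parameters(dlib::make_sstack(solvers), step_size);
        return loss;
    }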
@@ -804,31 +804,42 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return subnetwork->get_final_data_gradient(); }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            update(x,private_get_gradient_input(),solvers,step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
             params_grad.copy_size(details.get_layer_params());
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));
+
+            subnetwork->back_propagate_error(x);
+
+            // zero out get_gradient_input()
+            gradient_input_is_stale = true;
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
             {
                 const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                 tt::add(1,details.get_layer_params(), step_size, step);
             }
-            subnetwork->update(x, solvers.pop(), step_size);
-            gradient_input_is_stale = true;
+            subnetwork->update_parameters(solvers.pop(), step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return *subnetwork; }
         subnet_type& subnet() { return *subnetwork; }
@@ -847,7 +858,7 @@ namespace dlib
         friend void serialize(const add_layer& item, std::ostream& out)
         {
-            int version = 1;
+            int version = 2;
             serialize(version, out);
             serialize(*item.subnetwork, out);
             serialize(item.details, out);
@@ -856,13 +867,14 @@ namespace dlib
             serialize(item.get_output_and_gradient_input_disabled, out);
             serialize(item.x_grad, out);
             serialize(item.cached_output, out);
+            serialize(item.params_grad, out);
         }

         friend void deserialize(add_layer& item, std::istream& in)
         {
             int version = 0;
             deserialize(version, in);
-            if (version != 1)
+            if (!(1 <= version && version <= 2))
                 throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
             deserialize(*item.subnetwork, in);
             deserialize(item.details, in);
@@ -871,6 +883,8 @@ namespace dlib
             deserialize(item.get_output_and_gradient_input_disabled, in);
             deserialize(item.x_grad, in);
             deserialize(item.cached_output, in);
+            if (version == 2)
+                deserialize(item.params_grad, in);
         }

         friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
@@ -910,6 +924,7 @@ namespace dlib
             std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
             std::swap(x_grad, item.x_grad);
             std::swap(cached_output, item.cached_output);
+            std::swap(params_grad, item.params_grad);
         }
@@ -924,10 +939,10 @@ namespace dlib
         resizable_tensor x_grad;
         resizable_tensor cached_output;

-        // The following 2 objects don't logically contribute to the state of this class.
-        // They are only here to prevent them from being reallocated over and over in
-        // member functions.
         resizable_tensor params_grad;
+
+        // temp_tensor doesn't logically contribute to the state of this object.
+        // It is here only to prevent it from being reallocated over and over.
         resizable_tensor temp_tensor;

     };
@@ -1118,16 +1133,12 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return grad_final; }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            return update(x,private_get_gradient_input(),solvers, step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // make sure grad_final is initialized to 0
             if (!have_same_dimensions(x, grad_final))
                 grad_final.copy_size(x);
@@ -1138,15 +1149,27 @@ namespace dlib
             impl::call_layer_backward(details, private_get_output(),
                 gradient_input, wsub, static_cast<tensor&>(params_grad));

+            // zero out get_gradient_input()
+            gradient_input_is_stale = true;
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
             {
                 const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                 tt::add(1,details.get_layer_params(), step_size, step);
             }
-            gradient_input_is_stale = true;
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return input_layer; }
         subnet_type& subnet() { return input_layer; }
@@ -1347,18 +1370,27 @@ namespace dlib
         const tensor& get_final_data_gradient(
         ) const { return subnetwork.get_final_data_gradient(); }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x)
         {
-            subnetwork.update(x,solvers, step_size);
+            subnetwork.back_propagate_error(x);
+        }
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
+        {
+            subnetwork.back_propagate_error(x,gradient_input);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void update_parameters(sstack<solver_type> solvers, double step_size)
         {
-            subnetwork.update(x,gradient_input,solvers, step_size);
+            subnetwork.update_parameters(solvers, step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const { return subnetwork; }
         subnet_type& subnet() { return subnetwork; }
@@ -1430,6 +1462,11 @@ namespace dlib
         { return subnetwork.private_get_gradient_input(); }

         subnet_type subnetwork;
+
+        // This member doesn't logically contribute to the state of the object since it is
+        // always empty. It's just here so we can have the get_parameter_gradient() methods
+        // which have to return something. So they return this empty tensor.
+        resizable_tensor params_grad;
     };

 // ----------------------------------------------------------------------------------------
@@ -1598,32 +1635,42 @@ namespace dlib
             return details[0].get_gradient_input();
         }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers, double step_size)
+        const tensor& get_parameter_gradient(
+        ) const { return details[0].get_parameter_gradient(); }
+
+        tensor& get_parameter_gradient (
+        ) { return details[0].get_parameter_gradient(); }
+
+        void back_propagate_error(const tensor& x)
         {
-            update(x,private_get_gradient_input(),solvers,step_size);
+            back_propagate_error(x, private_get_gradient_input());
         }
-
-        template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
+        void back_propagate_error(const tensor& x, const tensor& gradient_input)
         {
-            const auto cnt = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers);
             if (details.size() > 1)
             {
-                details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
+                details[0].back_propagate_error(details[1].get_output(), gradient_input);
                 for (size_t i = 1; i < details.size(); ++i)
                 {
                     if (i+1 < details.size())
-                        details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
+                        details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient());
                     else
-                        details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
+                        details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient());
                 }
             }
             else
             {
-                details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
+                details[0].back_propagate_error(subnetwork.get_output(), gradient_input);
            }
-            subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
+            subnetwork.back_propagate_error(x, details.back().get_final_data_gradient());
+        }
+
+        template <typename solver_type>
+        void update_parameters(sstack<solver_type> solvers, double step_size)
+        {
+            for (size_t i = 0; i < details.size(); ++i)
+                details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size);
+            subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -1827,25 +1874,19 @@ namespace dlib
             return grad_final;
         }

-        template <typename solver_type>
-        void update(
-            const tensor& /*x*/,
-            sstack<solver_type> /*solvers*/,
-            double /*step_size*/
-        )
+        void back_propagate_error(const tensor& /*x*/)
+        {
+            // nothing to do
+        }
+        void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/)
         {
-            // nothing to update
+            // nothing to do
         }

         template <typename solver_type>
-        void update(
-            const tensor& /*x*/,
-            const tensor& /*gradient_input*/,
-            sstack<solver_type> /*solvers*/,
-            double /*step_size*/
-        )
+        void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
         {
-            // nothing to update
+            // nothing to do
         }

         const subnet_type& subnet() const { return input_layer; }
@@ -2141,58 +2182,55 @@ namespace dlib
             return compute_loss(temp_tensor);
         }

-        template <typename label_iterator, typename solver_type>
-        double update (
+        template <typename label_iterator>
+        double compute_parameter_gradients (
             const tensor& x,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, lbegin, wsub);
-            subnetwork.update(x, solvers, step_size);
+            subnetwork.back_propagate_error(x);
             return l;
         }

-        template <typename input_iterator, typename label_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator, typename label_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
             input_iterator iend,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         )
         {
             to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, lbegin, solvers, step_size);
+            return compute_parameter_gradients(temp_tensor, lbegin);
         }

-        template <typename solver_type>
-        double update (
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        double compute_parameter_gradients (
+            const tensor& x
         )
         {
             subnetwork.forward(x);
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             double l = loss.compute_loss(x, wsub);
-            subnetwork.update(x, solvers, step_size);
+            subnetwork.back_propagate_error(x);
             return l;
         }

-        template <typename input_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
-            input_iterator iend,
+            input_iterator iend
+        )
+        {
+            to_tensor(ibegin,iend,temp_tensor);
+            return compute_parameter_gradients(temp_tensor);
+        }
+
+        template <typename solver_type>
+        void update_parameters (
             sstack<solver_type> solvers,
             double step_size
         )
         {
-            to_tensor(ibegin,iend,temp_tensor);
-            return update(temp_tensor, solvers, step_size);
+            subnetwork.update_parameters(solvers, step_size);
         }

         const subnet_type& subnet() const { return subnetwork; }
@@ -2477,18 +2515,24 @@ namespace dlib
             return subnetwork.get_final_data_gradient();
         }

-        template <typename solver_type>
-        void update(const tensor& x, sstack<solver_type> solvers)
+        void back_propagate_error(const tensor& x)
         {
-            subnetwork.update(x,solvers);
+            subnetwork.back_propagate_error(x);
         }

         template <typename solver_type>
-        void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
+        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
-            subnetwork.update(x,gradient_input,solvers);
+            subnetwork.update_parameters(solvers, step_size);
         }

+        const tensor& get_parameter_gradient(
+        ) const { return params_grad; }
+
+        tensor& get_parameter_gradient (
+        ) { return params_grad; }
+
         const subnet_type& subnet() const
         {
             return subnetwork;
@@ -2558,6 +2602,11 @@ namespace dlib
         { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); }

         subnet_type subnetwork;
+
+        // This member doesn't logically contribute to the state of the object since it is
+        // always empty. It's just here so we can have the get_parameter_gradient() methods
+        // which have to return something. So they return this empty tensor.
+        resizable_tensor params_grad;
     };

     template <template<typename> class T, typename U>
     struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};

...
@@ -410,33 +410,53 @@ namespace dlib
         /*!
             ensures
                 - returns the error gradient for this network.  That is, this is the error
-                  gradient that this network will use to update itself when update() is
-                  called.  Therefore, when performing back propagation, layers that sit on
-                  top of this network layer write their back propagated error gradients
-                  into get_gradient_input().  Or to put it another way, during back
-                  propagation, layers take the contents of their get_gradient_input() and
-                  back propagate it through themselves and store the results into their
-                  subnetwork's get_gradient_input().
+                  gradient that this network will use to compute parameter gradients when
+                  back_propagate_error() is called.  Therefore, when performing back
+                  propagation, layers that sit on top of this network layer write their
+                  back-propagated error gradients into get_gradient_input().  Or to put it
+                  another way, during back-propagation, layers take the contents of their
+                  get_gradient_input() and back-propagate it through themselves and store
+                  the result into their subnetwork's get_gradient_input().

                   This means you should consider get_gradient_input() as an input to the
-                  update() method.
+                  back_propagate_error() method.
         !*/

         const tensor& get_final_data_gradient(
         ) const;
         /*!
             ensures
-                - if update() has been called to back-propagate a gradient through this
-                  network then you can call get_final_data_gradient() to obtain the last
-                  gradient computed.  That is, this function returns the gradient of the
-                  network with respect to its inputs.
+                - if back_propagate_error() has been called to back-propagate a gradient
+                  through this network then you can call get_final_data_gradient() to
+                  obtain the last data gradient computed.  That is, this function returns
+                  the gradient of the network with respect to its inputs.
+                - Note that there is only one "final data gradient" for an entire network,
+                  not one per layer, since there is only one input to the entire network.
         !*/

-        template <typename solver_type>
-        void update(
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        const tensor& get_parameter_gradient(
+        ) const;
+        /*!
+            ensures
+                - if back_propagate_error() has been called then you can call
+                  get_parameter_gradient() to find the gradient of this layer's parameters.
+                  When we update the parameters by calling update_parameters(), it will use
+                  the gradient in get_parameter_gradient() to perform the update.
+                  Therefore, you should consider get_parameter_gradient() as an input to
+                  update_parameters().
+        !*/
+
+        tensor& get_parameter_gradient (
+        );
+        /*!
+            ensures
+                - returns a non-const reference to the tensor returned by the above
+                  get_parameter_gradient() method.  You could use this method to modify the
+                  parameter gradient in some way before invoking update_parameters().
+        !*/
+
+        void back_propagate_error(
+            const tensor& x
         );
         /*!
             requires
@@ -445,28 +465,21 @@ namespace dlib
                   subsequently modified in any way.
                 - get_gradient_input() has been set equal to the gradient of this network's
                   output with respect to some loss function.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - Back propagates the error gradient, get_gradient_input(), through this
-                  network and uses the provided solvers to update the network parameters.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                  network and computes parameter and data gradients, via backpropagation.
+                  Specifically, this function populates get_final_data_gradient() and also,
+                  for each layer, the tensor returned by get_parameter_gradient().
                 - All elements of #get_gradient_input() are set to 0.
-                - have_same_dimensions(#get_final_data_gradient(), x) == true
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
                 - #get_final_data_gradient() contains the gradient of the network with
                   respect to x.
         !*/

-        template <typename solver_type>
-        void update(
+        void back_propagate_error(
             const tensor& x,
-            const tensor& gradient_input,
-            sstack<solver_type> solvers,
-            double step_size
+            const tensor& gradient_input
         );
         /*!
             requires
@@ -474,27 +487,45 @@ namespace dlib
                   Moreover, this was the most recent call to forward() and x has not been
                   subsequently modified in any way.
                 - have_same_dimensions(gradient_input, get_output()) == true
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
-                - This function is identical to the version of update() defined immediately
-                  above except that it back-propagates gradient_input through the network
-                  instead of get_gradient_input().  Therefore, this version of update is
-                  equivalent to performing:
+                - This function is identical to the version of back_propagate_error()
+                  defined immediately above except that it back-propagates gradient_input
+                  through the network instead of get_gradient_input().  Therefore, this
+                  version of back_propagate_error() is equivalent to performing:
                     get_gradient_input() = gradient_input;
-                    update(x,solvers);
-                  Except that calling update(x,gradient_input,solvers) avoids the copy
-                  and is therefore slightly more efficient.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                    back_propagate_error(x);
+                  Except that calling back_propagate_error(x,gradient_input) avoids the
+                  copy and is therefore slightly more efficient.
                 - All elements of #get_gradient_input() are set to 0.
+                - have_same_dimensions(#get_final_data_gradient(), x) == true.
+                - have_same_dimensions(#get_parameter_gradient(), layer_details().get_layer_params()) == true.
                 - #get_final_data_gradient() contains the gradient of the network with
                   respect to x.
         !*/

+        template <typename solver_type>
+        void update_parameters(
+            sstack<solver_type> solvers,
+            double step_size
+        );
+        /*!
+            requires
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - back_propagate_error() has been called.
+                - The given solvers have only ever been used with this network.  That is,
+                  if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
+                - solvers.size() >= num_computational_layers
+                - 0 < step_size <= 1
+            ensures
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector and we add
+                  step_size times that vector to the layer's parameters.
+        !*/
+
         void clean(
         );
         /*!
@@ -831,12 +862,10 @@ namespace dlib
         // -------------

-        template <typename label_iterator, typename solver_type>
-        double update (
+        template <typename label_iterator>
+        double compute_parameter_gradients (
             const tensor& x,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         );
         /*!
             requires
@@ -844,31 +873,22 @@ namespace dlib
                 - x.num_samples() > 0
                 - lbegin == iterator pointing to the start of a range of
                   x.num_samples()/sample_expansion_factor label_type elements.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - runs x through the network, compares the output to the expected output
-                  pointed to by lbegin, and updates the network parameters via
-                  backpropagation.
+                  pointed to by lbegin, and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
                 - for all valid k:
                     - the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
                 - returns compute_loss(x,lbegin)
         !*/

-        template <typename input_iterator, typename label_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator, typename label_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
             input_iterator iend,
-            label_iterator lbegin,
-            sstack<solver_type> solvers,
-            double step_size
+            label_iterator lbegin
         );
         /*!
             requires
@@ -876,77 +896,72 @@ namespace dlib
                 - std::distance(ibegin,iend) > 0
                 - lbegin == iterator pointing to the start of a range of
                   std::distance(ibegin,iend) label_type elements.
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
                 - runs [ibegin,iend) through the network, compares the output to the
-                  expected output pointed to by lbegin, and updates the network parameters
-                  via backpropagation.
+                  expected output pointed to by lbegin, and computes parameter and data
+                  gradients with respect to the loss, via backpropagation.  Specifically,
+                  this function updates get_final_data_gradient() and also, for each layer,
+                  the tensor returned by get_parameter_gradient().
                 - for all valid k:
                     - the expected label of *(ibegin+k) is *(lbegin+k).
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
                 - returns compute_loss(ibegin,iend,lbegin)
         !*/

-        // -------------
-
-        template <typename solver_type>
-        double update (
-            const tensor& x,
-            sstack<solver_type> solvers,
-            double step_size
+        double compute_parameter_gradients (
+            const tensor& x
         );
         /*!
             requires
                 - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
                 - x.num_samples()%sample_expansion_factor == 0
                 - x.num_samples() > 0
-                - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
-                - solvers.size() >= num_computational_layers
-                - 0 < step_size <= 1
             ensures
-                - runs x through the network and updates the network parameters by
-                  back-propagating the loss gradient through the network.
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
+                - runs x through the network and computes parameter and data gradients with
+                  respect to the loss, via backpropagation.  Specifically, this function
+                  updates get_final_data_gradient() and also, for each layer, the tensor
+                  returned by get_parameter_gradient().
                 - returns compute_loss(x)
         !*/

-        template <typename input_iterator, typename solver_type>
-        double update (
+        template <typename input_iterator>
+        double compute_parameter_gradients (
             input_iterator ibegin,
-            input_iterator iend,
+            input_iterator iend
+        );
+        /*!
+            requires
+                - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+            ensures
+                - runs [ibegin,iend) through the network and computes parameter and data
+                  gradients with respect to the loss, via backpropagation.  Specifically,
+                  this function updates get_final_data_gradient() and also, for each layer,
+                  the tensor returned by get_parameter_gradient().
+                - returns compute_loss(ibegin,iend)
+        !*/
+
+        template <typename solver_type>
+        void update_parameters (
             sstack<solver_type> solvers,
             double step_size
         );
         /*!
             requires
-                - LOSS_DETAILS is an unsupervised loss.  i.e. label_type==no_label_type.
-                - [ibegin, iend) is an iterator range over input_type objects.
-                - std::distance(ibegin,iend) > 0
+                - solver_type is an implementation of the EXAMPLE_SOLVER interface defined
+                  in solvers_abstract.h
+                - compute_parameter_gradients() has been called.
                 - The given solvers have only ever been used with this network.  That
-                  is, if you want to call update() on some other neural network object then
-                  you must NOT reuse the same solvers object.
+                  is, if you want to call update_parameters() on some other neural network
+                  object then you must NOT reuse the same solvers object.
                 - solvers.size() >= num_computational_layers
                 - 0 < step_size <= 1
             ensures
-                - runs [ibegin,iend) through the network and updates the network parameters
-                  by back-propagating the loss gradient through the network.
-                - The provided solvers are used to update the parameters in each layer of
-                  the network.
-                - The parameter delta vector output by the solvers is multiplied by
-                  step_size before being added to the parameters.
-                - returns compute_loss(ibegin,iend)
+                - Updates all the parameters in the network.  In particular, we pass each
+                  layer's parameter gradient (i.e. the tensor returned by the layer's
+                  get_parameter_gradient() member) through that layer's corresponding
+                  solver object.  This produces a parameter delta vector and we add
+                  step_size times that vector to the layer's parameters.
         !*/

         // -------------

...
@@ -418,14 +418,16 @@ namespace dlib
         template <typename T>
         void run_update(job_t& next_job, const T&)
         {
-            double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
+            double loss = net.compute_parameter_gradients(next_job.t, next_job.labels.begin());
+            net.update_parameters(make_sstack(solvers),step_size);
             record_loss(loss);
         }

         void run_update(job_t& next_job, const no_label_type&)
         {
             no_label_type pick_which_run_update;
-            double loss = net.update(next_job.t, make_sstack(solvers), step_size);
+            double loss = net.compute_parameter_gradients(next_job.t);
+            net.update_parameters(make_sstack(solvers), step_size);
             record_loss(loss);
         }
@@ -438,8 +440,9 @@ namespace dlib
             job_t next_job;
             while(job_pipe.dequeue(next_job))
             {
-                // call net.update() but pick the right version for unsupervised or
-                // supervised training based on the type of label_type.
+                // call net.compute_parameter_gradients() and net.update_parameters() but
+                // pick the right version for unsupervised or supervised training based on
+                // the type of label_type.
                 run_update(next_job, pick_which_run_update);

                 // If we have been running for a while then check if the loss is still

...
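Because gradient computation and the parameter update are now separate calls, calling code can examine (or, per the new get_parameter_gradient() documentation, modify) a layer's gradient before the solvers are applied. A rough sketch of that pattern, assuming dlib's layer<index>() accessor and that the layer at index 1 is a computational layer; none of the code below is part of the commit itself:

    #include <dlib/dnn.h>
    #include <iostream>

    template <typename net_type, typename label_iterator>
    void step_and_inspect(
        net_type& net,
        std::vector<dlib::sgd>& solvers,
        const dlib::tensor& x,
        label_iterator labels,
        double step_size
    )
    {
        // Phase 1: forward pass + backpropagation; each layer stores its gradient.
        net.compute_parameter_gradients(x, labels);

        // Between the phases the per-layer gradients are available for inspection.
        dlib::tensor& g = dlib::layer<1>(net).get_parameter_gradient();
        std::cout << "layer 1 parameter gradient has " << g.size() << " elements\n";

        // Phase 2: the solvers consume the stored gradients and update the parameters.
        net.update_parameters(dlib::make_sstack(solvers), step_size);
    }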