Commit 9f92b082 authored by Davis King

Training now automatically reduces the learning rate when it is clear that
the loss is no longer decreasing.  There is also a new stopping condition
based on the size of the current learning rate: training stops once the
learning rate gets small enough and it is clear that no progress is being made.
parent 6f63bc62
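Taken together, the changes below add step-size control to dnn_trainer. As a
minimal sketch of how a training run might be configured against the new API
(the net_type, data, and labels are assumed to already exist, and the values
shown simply repeat the new defaults):

// Sketch only: assumes a supervised net_type and matching data/labels vectors.
dnn_trainer<net_type, sgd> trainer(net);
trainer.set_max_num_epochs(10000);                      // hard cap on passes over the data
trainer.set_step_size(1);                               // initial multiplier on solver steps
trainer.set_min_step_size(1e-4);                        // stop once the step size falls below this
trainer.set_step_size_shrink_amount(0.1);               // shrink factor applied when progress stalls
trainer.set_iterations_between_step_size_adjust(2000);  // mini-batches per progress test
trainer.train(data, labels);                            // returns when the step size bottoms out
                                                        // or the epoch limit is reached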
@@ -15,6 +15,8 @@
#include <utility>
#include <tuple>
#include <cmath>
#include "tensor_tools.h"
namespace dlib
@@ -719,23 +721,27 @@ namespace dlib
) const { return subnetwork->get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
update(x,private_get_gradient_input(),solvers,step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_layers,"");
dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
subnetwork->update(x, solvers.pop());
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
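// i.e. params = 1*params + step_size*step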
tt::add(1,details.get_layer_params(), step_size, step);
}
subnetwork->update(x, solvers.pop(), step_size);
gradient_input_is_stale = true;
}
@@ -1016,13 +1022,13 @@ namespace dlib
) const { return grad_final; }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
return update(x,private_get_gradient_input(),solvers, step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
DLIB_CASSERT(solvers.size()>=num_layers,"");
// make sure grad_final is initialized to 0
@@ -1034,9 +1040,13 @@ namespace dlib
params_grad.copy_size(details.get_layer_params());
impl::call_layer_backward(details, private_get_output(),
gradient_input, wsub, static_cast<tensor&>(params_grad));
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
solvers.top()(details, static_cast<const tensor&>(params_grad));
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
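// As above: params = 1*params + step_size*step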
tt::add(1,details.get_layer_params(), step_size, step);
}
gradient_input_is_stale = true;
}
@@ -1225,15 +1235,15 @@ namespace dlib
) const { return subnetwork.get_final_data_gradient(); }
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,solvers);
subnetwork.update(x,solvers, step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
subnetwork.update(x,gradient_input,solvers);
subnetwork.update(x,gradient_input,solvers, step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -1462,31 +1472,31 @@ namespace dlib
}
template <typename solver_type>
void update(const tensor& x, sstack<solver_type> solvers)
void update(const tensor& x, sstack<solver_type> solvers, double step_size)
{
update(x,private_get_gradient_input(),solvers);
update(x,private_get_gradient_input(),solvers,step_size);
}
template <typename solver_type>
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers)
void update(const tensor& x, const tensor& gradient_input, sstack<solver_type> solvers, double step_size)
{
const auto cnt = (LAYER<SUBNET>::num_layers-SUBNET::num_layers);
if (details.size() > 1)
{
details[0].update(details[1].get_output(), gradient_input, solvers);
details[0].update(details[1].get_output(), gradient_input, solvers,step_size);
for (size_t i = 1; i < details.size(); ++i)
{
if (i+1 < details.size())
details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
details[i].update(details[i+1].get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
else
details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i));
details[i].update(subnetwork.get_output(), details[i-1].get_final_data_gradient(), solvers.pop(cnt*i),step_size);
}
}
else
{
details[0].update(subnetwork.get_output(), gradient_input, solvers);
details[0].update(subnetwork.get_output(), gradient_input, solvers,step_size);
}
subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()));
subnetwork.update(x, details.back().get_final_data_gradient(), solvers.pop(cnt*details.size()),step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -1672,13 +1682,22 @@ namespace dlib
}
template <typename solver_type>
void update(const tensor& /*x*/, sstack<solver_type> /*solvers*/)
void update(
const tensor& /*x*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
{
// nothing to update
}
template <typename solver_type>
void update(const tensor& /*x*/, const tensor& gradient_input, sstack<solver_type> /*solvers*/)
void update(
const tensor& /*x*/,
const tensor& /*gradient_input*/,
sstack<solver_type> /*solvers*/,
double /*step_size*/
)
{
// nothing to update
}
@@ -1948,13 +1967,14 @@ namespace dlib
double update (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, lbegin, wsub);
subnetwork.update(x, solvers);
subnetwork.update(x, solvers, step_size);
return l;
}
@@ -1963,23 +1983,25 @@ namespace dlib
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, lbegin, solvers);
return update(temp_tensor, lbegin, solvers, step_size);
}
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
)
{
subnetwork.forward(x);
dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
double l = loss.compute_loss(x, wsub);
subnetwork.update(x, solvers);
subnetwork.update(x, solvers, step_size);
return l;
}
@@ -1987,11 +2009,12 @@ namespace dlib
double update (
input_iterator ibegin,
input_iterator iend,
std::vector<solver_type>& solvers
sstack<solver_type> solvers,
double step_size
)
{
to_tensor(ibegin,iend,temp_tensor);
return update(temp_tensor, solvers);
return update(temp_tensor, solvers, step_size);
}
const subnet_type& subnet() const { return subnetwork; }
@@ -392,7 +392,8 @@ namespace dlib
template <typename solver_type>
void update(
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -405,9 +406,12 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- Back propagates the error gradient, get_gradient_input(), through this
network and uses the provided solvers to update the network parameters.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- All elements of #get_gradient_input() are set to 0.
- have_same_dimensions(#get_final_data_gradient(), x) == true
- #get_final_data_gradient() contains the gradient of the network with
@@ -418,7 +422,8 @@
void update(
const tensor& x,
const tensor& gradient_input,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -430,6 +435,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- This function is identical to the version of update() defined immediately
above except that it back-propagates gradient_input through the network
@@ -439,6 +445,8 @@
update(x,solvers);
Except that calling update(x,gradient_input,solvers) avoids the copy
and is therefore slightly more efficient.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- All elements of #get_gradient_input() are set to 0.
- #get_final_data_gradient() contains the gradient of the network with
respect to x.
@@ -755,7 +763,8 @@ namespace dlib
double update (
const tensor& x,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -767,6 +776,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs x through the network, compares the output to the expected output
pointed to by lbegin, and updates the network parameters via
@@ -775,6 +785,8 @@
- the expected label of the kth sample in x is *(lbegin+k/sample_expansion_factor).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(x,lbegin)
!*/
@@ -783,7 +795,8 @@
input_iterator ibegin,
input_iterator iend,
label_iterator lbegin,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -795,6 +808,7 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network, compares the output to the
expected output pointed to by lbegin, and updates the network parameters
@@ -803,6 +817,8 @@
- the expected label of *(ibegin+k) is *(lbegin+k).
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend,lbegin)
!*/
@@ -811,7 +827,8 @@
template <typename solver_type>
double update (
const tensor& x,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -822,11 +839,14 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs x through the network and updates the network parameters by
back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(x)
!*/
@@ -834,7 +854,8 @@
double update (
input_iterator ibegin,
input_iterator iend,
sstack<solver_type> solvers
sstack<solver_type> solvers,
double step_size
);
/*!
requires
@@ -845,11 +866,14 @@
is, if you want to call update() on some other neural network object then
you must NOT reuse the same solvers object.
- solvers.size() >= num_layers
- 0 < step_size <= 1
ensures
- runs [ibegin,iend) through the network and updates the network parameters
by back-propagating the loss gradient through the network.
- The provided solvers are used to update the parameters in each layer of
the network.
- The parameter delta vector output by the solvers is multiplied by
step_size before being added to the parameters.
- returns compute_loss(ibegin,iend)
!*/
@@ -33,24 +33,23 @@ namespace dlib
float get_learning_rate (
) const { return learning_rate; }
template <typename LAYER_DETAILS>
void operator() (
LAYER_DETAILS& l,
const tensor& operator() (
const tensor& params,
const tensor& params_grad
)
{
DLIB_CASSERT(l.get_layer_params().size() != 0,"");
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
v.copy_size(params_grad);
v = 0;
}
tt::affine_transform(v, v, l.get_layer_params(), params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
// perform l.get_layer_params() += v;
tt::affine_transform(l.get_layer_params(), l.get_layer_params(), v, 1, 1, 0);
//perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
tt::affine_transform(v, v, params, params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
return v;
}
friend void serialize(const sgd& item, std::ostream& out)
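Written out, the step vector v returned by sgd and the update later applied by
the network (the tt::add calls in core.h above) amount to the following, with
momentum mu, weight decay lambda, learning rate alpha, step size s, parameters
p, and gradient g:

\[
v \leftarrow \mu v - \lambda \alpha\, p - \alpha\, g,
\qquad
p \leftarrow p + s\, v
\]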
@@ -18,11 +18,11 @@ namespace dlib
/*!
WHAT THIS OBJECT REPRESENTS
A solver defines the parameter update rule for a single layer in a deep
neural network. It takes a parameter gradient vector and a layer and
updates the layer's parameters. Importantly, each solver instance is used
with only one layer in a network. This allows us to define solvers that
have per layer state, for example, a solver may keep a momentum term and
apply it to its update rule.
neural network. It takes a parameter gradient vector and the layer's
parameters and tells you how the parameters should be updated.
Importantly, each solver instance is used with only one layer in a network.
This allows us to define solvers that have per layer state, for example, a
solver may keep a momentum term and apply it to its update rule.
Note that there is no dlib::EXAMPLE_SOLVER type. It is shown here purely
to document the interface a solver object must implement.
@@ -33,22 +33,22 @@ namespace dlib
EXAMPLE_SOLVER(
);
template <typename LAYER_DETAILS>
void operator() (
LAYER_DETAILS& l,
const tensor& operator() (
const tensor& params,
const tensor& params_grad
);
);
/*!
requires
- LAYER_DETAILS implements the EXAMPLE_LAYER_ interface defined in
layers_abstract.h.
- l.get_layer_params().size() != 0
- have_same_dimensions(l.get_layer_params(), params_grad) == true.
- params.size() != 0
- have_same_dimensions(params, params_grad) == true.
- When this function is invoked on a particular solver instance, it is
always supplied with the same LAYER_DETAILS object.
always supplied with parameters from the same layer instance. That is,
the solver is allowed to remember things from one invocation to another
and to assume that it is being serially applied to optimize the same
parameters.
ensures
- Updates the parameters in l. That is, l.get_layer_params() is modified
based on the parameter gradient vector stored in params_grad.
- Returns a step vector V that is intended to be used to update the
parameters by adding V to params.
!*/
};
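For illustration, a minimal stateless solver against this new interface could
look like the sketch below. It is not part of dlib: the class name plain_gd
and its fixed learning rate are made up, and it reuses the affine_transform
overload shown in solvers.h above to compute step = -learning_rate*params_grad.

// Hypothetical solver sketch, not part of dlib.
class plain_gd
{
public:
    explicit plain_gd(float learning_rate_ = 0.01) : learning_rate(learning_rate_) {}

    const tensor& operator() (
        const tensor& params,
        const tensor& params_grad
    )
    {
        DLIB_CASSERT(params.size() != 0,"");
        v.copy_size(params_grad);
        // v = 0*v + 0*params - learning_rate*params_grad + 0
        tt::affine_transform(v, v, params, params_grad, 0, 0, -learning_rate, 0);
        return v;
    }

private:
    resizable_tensor v;
    float learning_rate;
};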
@@ -7,13 +7,14 @@
#include "core.h"
#include "solvers.h"
#include "../statistics.h"
#include "../console_progress_indicator.h"
#include <chrono>
#include "../serialize.h"
#include "../pipe.h"
#include "../threads.h"
#include "cuda_dlib.h"
#include "../statistics/running_gradient.h"
#include <atomic>
namespace dlib
{
@@ -95,15 +96,15 @@ namespace dlib
mini_batch_size = batch_size;
}
unsigned long get_num_epochs (
) const { return num_epochs; }
unsigned long get_max_num_epochs (
) const { return max_num_epochs; }
void set_num_epochs (
void set_max_num_epochs (
unsigned long num
)
{
DLIB_CASSERT(num > 0,"");
num_epochs = num;
max_num_epochs = num;
}
void be_verbose (
@@ -159,14 +160,14 @@ namespace dlib
{
DLIB_CASSERT(data.size() == labels.size() && data.size() > 0, "");
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
for (unsigned long epoch_iteration = 0;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -183,10 +184,10 @@ namespace dlib
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
}
@@ -195,10 +196,10 @@ namespace dlib
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
return get_net();
@@ -214,14 +215,14 @@ namespace dlib
static_assert(has_unsupervised_loss,
"You can only call this version of train() when using an unsupervised loss.");
console_progress_indicator pbar(num_epochs);
pbar.print_status(0);
for (unsigned long epoch_iteration = 0; epoch_iteration < num_epochs; ++epoch_iteration)
for (unsigned long epoch_iteration = 0;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
++epoch_iteration)
{
using namespace std::chrono;
auto last_time = system_clock::now();
clear_average_loss();
for (size_t i = 0; i < data.size(); i += mini_batch_size)
for (size_t i = 0; i < data.size() && step_size >= min_step_size; i += mini_batch_size)
{
net.to_tensor(data.begin()+i,
data.begin()+std::min(i+mini_batch_size,data.size()),
@@ -236,10 +237,10 @@ namespace dlib
{
last_time = now_time;
auto iter = epoch_iteration + i/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(iter, true);
std::cout << std::endl;
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
}
@@ -248,10 +249,10 @@ namespace dlib
{
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " ";
pbar.print_status(epoch_iteration+1, true);
std::cout << std::endl;
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad)
<< std::endl;
}
}
return get_net();
@@ -260,14 +261,20 @@ namespace dlib
friend void serialize(const dnn_trainer& item, std::ostream& out)
{
item.wait_for_thread_to_pause();
int version = 2;
int version = 3;
serialize(version, out);
serialize(item.rs, out);
serialize(item.num_epochs, out);
serialize(item.rg, out);
serialize(item.max_num_epochs, out);
serialize(item.mini_batch_size, out);
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.iter_between_step_size_adjust.load(), out);
serialize(item.step_size_shrink.load(), out);
}
friend void deserialize(dnn_trainer& item, std::istream& in)
@@ -275,14 +282,21 @@ namespace dlib
item.wait_for_thread_to_pause();
int version = 0;
deserialize(version, in);
if (version != 2)
if (version != 3)
throw serialization_error("Unexpected version found while deserializing dlib::dnn_trainer.");
double temp;
deserialize(item.rs, in);
deserialize(item.num_epochs, in);
deserialize(item.rg, in);
deserialize(item.max_num_epochs, in);
deserialize(item.mini_batch_size, in);
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.solvers, in);
deserialize(temp, in); item.step_size = temp;
deserialize(item.min_step_size, in);
deserialize(temp, in); item.iter_between_step_size_adjust = temp;
deserialize(temp, in); item.step_size_shrink = temp;
}
double get_average_loss (
@@ -299,6 +313,62 @@ namespace dlib
rs.clear();
}
void set_step_size (
double ss
)
{
DLIB_CASSERT(ss > 0,"");
wait_for_thread_to_pause();
step_size = ss;
}
double get_step_size(
) const
{
return step_size;
}
void set_min_step_size (
double ss
)
{
DLIB_CASSERT(ss > 0,"");
min_step_size = ss;
}
double get_min_step_size (
) const
{
return min_step_size;
}
void set_iterations_between_step_size_adjust (
unsigned long min_iter
)
{
iter_between_step_size_adjust = min_iter;
}
unsigned long get_iterations_between_step_size_adjust (
) const
{
return iter_between_step_size_adjust;
}
void set_step_size_shrink_amount (
double shrink
)
{
DLIB_CASSERT(0 < shrink && shrink <= 1,"");
step_size_shrink = shrink;
}
double get_step_size_shrink (
) const
{
return step_size_shrink;
}
private:
struct job_t
{
@@ -309,16 +379,20 @@ namespace dlib
template <typename T>
void run_update(job_t& next_job, const T&)
{
rs.add(net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers)));
double loss = net.update(next_job.t, next_job.labels.begin(), make_sstack(solvers),step_size);
rs.add(loss);
rg.add(loss);
}
void run_update(job_t& next_job, const no_label_type&)
{
no_label_type pick_wich_run_update;
rs.add(net.update(next_job.t, make_sstack(solvers)));
double loss = net.update(next_job.t, make_sstack(solvers), step_size);
rs.add(loss);
rg.add(loss);
}
void thread()
void thread() try
{
// Make sure this thread uses the same cuda device as the thread that created
// the dnn_trainer object.
@@ -330,7 +404,23 @@ namespace dlib
// call net.update() but pick the right version for unsupervised or
// supervised training based on the type of label_type.
run_update(next_job, pick_wich_run_update);
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the step size.
if (rg.current_n() > iter_between_step_size_adjust)
{
if (rg.probability_gradient_greater_than(0) > 0.45)
{
step_size = step_size_shrink*step_size;
}
rg.clear();
}
}
}
catch(std::exception& e)
{
std::cerr << e.what() << std::endl;
throw;
}
void wait_for_thread_to_pause() const
@@ -339,29 +429,40 @@ namespace dlib
}
const static long string_pad = 10;
const static long epoch_string_pad = 4;
const static long ss_string_pad = 4;
void init()
{
num_epochs = 300;
mini_batch_size = 32;
max_num_epochs = 10000;
mini_batch_size = 128;
verbose = false;
cuda_device_id = dlib::cuda::get_device();
step_size = 1;
min_step_size = 1e-4;
iter_between_step_size_adjust = 2000;
step_size_shrink = 0.1;
start();
}
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
dlib::pipe<job_t> job_pipe;
running_stats<double> rs;
unsigned long num_epochs;
running_gradient rg;
unsigned long max_num_epochs;
size_t mini_batch_size;
bool verbose;
int cuda_device_id;
net_type net;
std::vector<solver_type> solvers;
std::atomic<double> step_size;
double min_step_size;
std::atomic<long> iter_between_step_size_adjust;
std::atomic<double> step_size_shrink;
// The job object is not logically part of the state of this object. It is here
// only to avoid reallocating it over and over.
job_t job;
};
// ----------------------------------------------------------------------------------------
@@ -46,6 +46,12 @@ namespace dlib
ensures
- #get_net() == a default initialized net_type object.
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
explicit dnn_trainer(
@@ -55,6 +61,12 @@ namespace dlib
ensures
- #get_net() == net
- #get_solvers() == a set of default initialized solvers.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
dnn_trainer(
@@ -66,6 +78,12 @@ namespace dlib
- #get_net() == net
- #get_solvers() == a set of solvers that are all initialized with the
provided solver instance.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-4
- #get_iterations_between_step_size_adjust() == 2000
- #get_step_size_shrink() == 0.1
!*/
const net_type& get_net (
@@ -139,22 +157,107 @@ namespace dlib
- #get_mini_batch_size() == batch_size
!*/
unsigned long get_num_epochs (
unsigned long get_max_num_epochs (
) const;
/*!
ensures
- Returns the number of passes over the training data we will execute when
train() is called.
- train() will execute at most get_max_num_epochs() iterations over the
training data before returning.
!*/
void set_num_epochs (
void set_max_num_epochs (
unsigned long num
);
/*!
requires
- num > 0
ensures
- @get_num_epochs() == num
- #get_max_num_epochs() == num
!*/
void set_step_size (
double ss
);
/*!
requires
- ss > 0
ensures
- #get_step_size() == ss
!*/
double get_step_size(
) const;
/*!
ensures
- During each training step, a solver tells us how to modify the parameters
of each layer in the network. It does this by outputting a step vector
that, when added to the parameters, will hopefully result in improved
network performance. In our case, during each step we multiply the
step vector from the solver by get_step_size() before adding it to the
parameters. Therefore, get_step_size() controls the "learning rate" used
during training.
!*/
void set_min_step_size (
double ss
);
/*!
requires
- ss > 0
ensures
- #get_min_step_size() == ss
!*/
double get_min_step_size (
) const;
/*!
ensures
- During training, this object will test if progress is still being made
and if it isn't then it will reduce get_step_size() by setting it to
get_step_size()*get_step_size_shrink(). However, it will not reduce it
below get_min_step_size(). Once this minimum step size is crossed the
training will terminate.
!*/
void set_iterations_between_step_size_adjust (
unsigned long min_iter
);
/*!
ensures
- #get_iterations_between_step_size_adjust() == min_iter
!*/
unsigned long get_iterations_between_step_size_adjust (
) const;
/*!
ensures
- This object monitors the progress of training and estimates if the
training error is being reduced. It does this by looking at the last
get_iterations_between_step_size_adjust() mini-batch results and applying
the statistical test defined by the running_gradient object to see if the
training error is getting smaller.
Therefore, get_iterations_between_step_size_adjust() should always be set
to something sensibly large so that this test can be done with reasonably
high confidence.
!*/
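The statistical test can be sketched in isolation. The toy program below (an
illustration only, not trainer code; the header path is assumed from the
include added in trainer.h) feeds a noisy but decaying loss into a
running_gradient using the trainer's default window of 2000 samples, then
prints the probability that the loss is still trending upward. The trainer
shrinks the step size whenever that probability exceeds 0.45.

#include <dlib/statistics/running_gradient.h>
#include <cstdlib>
#include <iostream>

int main()
{
    dlib::running_gradient rg;
    double loss = 1;
    for (int i = 0; i < 10000; ++i)
    {
        loss *= 0.999;  // the underlying loss keeps decreasing
        rg.add(loss + 0.01*std::rand()/RAND_MAX);  // observed loss is noisy
        if (rg.current_n() > 2000)  // same window as iter_between_step_size_adjust
        {
            // Small values mean the loss is clearly still dropping.
            std::cout << rg.probability_gradient_greater_than(0) << std::endl;
            rg.clear();
        }
    }
}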
void set_step_size_shrink_amount (
double shrink
);
/*!
requires
- 0 < shrink && shrink <= 1
ensures
- #get_step_size_shrink() == shrink
!*/
double get_step_size_shrink (
) const;
/*!
ensures
- Whenever the training routine thinks it isn't making progress anymore it
will reduce get_step_size() by multiplying it by get_step_size_shrink().
!*/
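With the defaults documented above (step size 1, shrink amount 0.1, minimum
step size 1e-4), the step size can therefore take on the values 1, 0.1, 0.01,
1e-3, and 1e-4; the shrink that would take it to 1e-5 pushes it below
get_min_step_size() and ends training.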
void be_verbose (
@@ -185,8 +288,10 @@ namespace dlib
- Trains a supervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end(), labels.begin()).
- The optimizer will run for get_num_epochs() epochs and each layer in the
network will be optimized by its corresponding solver in get_solvers().
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)
@@ -213,8 +318,10 @@ namespace dlib
- Trains an unsupervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end()).
- The optimizer will run for get_num_epochs() epochs and each layer in the
network will be optimized by its corresponding solver in get_solvers().
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- returns #get_net()
(i.e. the trained network can also be accessed by calling get_net() after
train() finishes executing)