Changed the solver interface to take the learning rate and the layer details object as inputs. This allows solvers to exhibit more complex behavior that depends on the specific layer. It also removes the learning rate from the solver's parameter set and pushes it entirely into the core training code, which in turn removes the need for the separate "step size" that was previously multiplied with the solver's output. Most of the code is unchanged: in the core and trainer code the step_size variables have simply been renamed to learning_rate, and the dnn_trainer's relevant member functions have been renamed to match. The examples have been updated to reflect these API changes. I also cleaned up the resnet definition and added better downsampling.
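
For illustration, here is a minimal sketch of what a solver looks like under the new interface. `plain_sgd` is a hypothetical example (plain gradient descent, no momentum or weight decay), not something added by this change; it assumes dlib's tensor utilities and shows how the learning rate now arrives as a call argument, together with the layer details object, instead of being stored in the solver.

```cpp
#include <dlib/dnn.h>
using namespace dlib;

// Hypothetical solver sketch: plain gradient descent.  Under the new
// interface the learning rate is passed in on every call and the layer
// details object is available, so the update rule could inspect the layer.
class plain_sgd
{
public:
    template <typename layer_type>
    const tensor& operator() (
        const float learning_rate,
        const layer_type& /*l*/,   // layer details, unused by this simple rule
        const tensor& params_grad
    )
    {
        // Lazily size the update buffer to match the layer's parameters.
        if (step.size() == 0)
            step.copy_size(params_grad);
        // Return the update to add to the parameters.  The trainer no
        // longer multiplies this by a separate step size.
        step = -learning_rate * mat(params_grad);
        return step;
    }

    friend void serialize(const plain_sgd&, std::ostream& out)
    { serialize("plain_sgd", out); }
    friend void deserialize(plain_sgd&, std::istream& in)
    {
        std::string version;
        deserialize(version, in);
        if (version != "plain_sgd")
            throw serialization_error("Unexpected version in plain_sgd.");
    }

private:
    resizable_tensor step;
};
```

On the trainer side, the renamed member functions would be used like this (again a sketch; `net_type` stands for some network type defined elsewhere):

```cpp
net_type net;
dnn_trainer<net_type, plain_sgd> trainer(net, plain_sgd());
trainer.set_learning_rate(0.1);       // formerly a step size setter
trainer.set_min_learning_rate(1e-5);  // formerly the minimum step size setter
```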