Commit b92b226c authored by Davis King's avatar Davis King

Added learning rate and weight decay multipliers to the con_, fc_, and bn_

layers.  Updated the solvers to support this.
parent 40f04beb
......@@ -488,6 +488,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -504,6 +506,7 @@ namespace dlib
s.size() == v.size() &&
s.size() == params.size() &&
s.size() == params_grad.size(),"");
DLIB_CASSERT(begin <= end && end <= params.size(),"");
const float eps = 1e-8;
const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
......@@ -516,7 +519,7 @@ namespace dlib
auto ps = s.host_write_only();
auto pparams = params.host();
auto ppgrad = params_grad.host();
for (size_t i = 0; i < params.size(); ++i)
for (size_t i = begin; i < end; ++i)
{
float g = weight_decay*pparams[i] + ppgrad[i];
pm[i] = momentum1*pm[i] + (1-momentum1)*g;
......
......@@ -114,6 +114,8 @@ namespace dlib
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......
......@@ -583,7 +583,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
__global__ void _cuda_compute_adam_update(
size_t n,
size_t begin,
size_t end,
float* s,
float* m,
float* v,
......@@ -600,7 +601,7 @@ namespace dlib
// m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
// v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
// s = -alpha*m/(sqrt(v) + eps);
for (auto i : grid_stride_range(0, n))
for (auto i : grid_stride_range(begin, end))
{
float g = (weight_decay*params[i] + params_grad[i]);
m[i] = momentum1*m[i] + (1-momentum1)*g;
......@@ -610,6 +611,8 @@ namespace dlib
}
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -626,10 +629,11 @@ namespace dlib
s.size() == v.size() &&
s.size() == params.size() &&
s.size() == params_grad.size(),"");
DLIB_CASSERT(begin <= end && end <= params.size(),"");
const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
momentum1, momentum2, params.device(), params_grad.device());
}
......
......@@ -205,6 +205,8 @@ namespace dlib
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......
This diff is collapsed.
......@@ -123,6 +123,16 @@ namespace dlib
allow dlib to make some layers execute in-place and therefore run a
little faster and use less memory. Do not implement forward() and
backward().
It should also be noted that layers may define additional layer specific
fields and the solvers can use these fields as they see fit. For example,
some layers define get_learning_rate_multiplier() and
get_weight_decay_multiplier() methods. The solvers that come with dlib
look at these methods, if they exist, and adjust the learning rate or
weight decay for that layer according to the multiplier. Therefore, you
can add these methods to your layer types if you want, or even define new
fields and new solvers that use those fields in some way.
!*/
public:
......@@ -367,6 +377,10 @@ namespace dlib
ensures
- #get_num_outputs() == num_outputs
- #get_bias_mode() == bias_mode
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
unsigned long get_num_outputs (
......@@ -389,6 +403,82 @@ namespace dlib
is added to each of the outputs of this layer.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......@@ -458,6 +548,10 @@ namespace dlib
- #stride_x() == _stride_x
- #padding_y() == _padding_y
- #padding_x() == _padding_x
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 1
- #get_bias_learning_rate_multiplier() == 1
- #get_bias_weight_decay_multiplier() == 0
!*/
long num_filters(
......@@ -517,6 +611,82 @@ namespace dlib
sides of the image.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
double get_bias_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its bias parameters be
multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
!*/
double get_bias_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its bias parameters be
multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
!*/
void set_bias_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_learning_rate_multiplier() == val
!*/
void set_bias_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_bias_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......@@ -684,7 +854,9 @@ namespace dlib
/*!
ensures
- #get_mode() == mode
- get_running_stats_window_size() == 1000
- #get_running_stats_window_size() == 1000
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
!*/
explicit bn_(
......@@ -693,7 +865,9 @@ namespace dlib
/*!
ensures
- #get_mode() == mode
- get_running_stats_window_size() == window_size
- #get_running_stats_window_size() == window_size
- #get_learning_rate_multiplier() == 1
- #get_weight_decay_multiplier() == 0
!*/
layer_mode get_mode(
......@@ -725,6 +899,44 @@ namespace dlib
the running average.
!*/
double get_learning_rate_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the learning rate used to optimize its parameters be
multiplied by get_learning_rate_multiplier().
!*/
double get_weight_decay_multiplier(
) const;
/*!
ensures
- returns a multiplier number. The interpretation is that this object is
requesting that the weight decay used to optimize its parameters be
multiplied by get_weight_decay_multiplier().
!*/
void set_learning_rate_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_learning_rate_multiplier() == val
!*/
void set_weight_decay_multiplier(
double val
);
/*!
requires
- val >= 0
ensures
- #get_weight_decay_multiplier() == val
!*/
template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
......
......@@ -6,6 +6,7 @@
#include "solvers_abstract.h"
#include "tensor.h"
#include <iostream>
#include "layers.h"
namespace dlib
{
......@@ -48,11 +49,43 @@ namespace dlib
v.copy_size(params_grad);
v = 0;
}
const double lr = learning_rate*get_learning_rate_multiplier(l);
const double wd = weight_decay*get_weight_decay_multiplier(l);
//perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
tt::affine_transform(v, v, params, params_grad,
momentum, -weight_decay*learning_rate, -learning_rate, 0);
//perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
return v;
}
template <unsigned long N>
const tensor& operator() (
    const float learning_rate,
    const fc_<N,FC_HAS_BIAS>& l,
    const tensor& params_grad
)
/*!
    Applies the SGD update to a fully connected layer with a bias term,
    honoring the layer's bias-specific learning rate and weight decay
    multipliers.
!*/
{
    // An fc_ layer lays out its weight matrix first and its get_num_outputs()
    // bias values at the END of the parameter tensor, so the bias region
    // begins at size-num_outputs.  Passing l.get_num_outputs() directly would
    // mislabel most of the weights as biases.
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
    return v;
}
template <
    long _num_filters,
    long _nr,
    long _nc,
    int _stride_y,
    int _stride_x,
    int _padding_y,
    int _padding_x
    >
const tensor& operator() (
    const float learning_rate,
    const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
    const tensor& params_grad
)
/*!
    Applies the SGD update to a convolutional layer, honoring the layer's
    bias-specific learning rate and weight decay multipliers.
!*/
{
    // A con_ layer stores its filter weights first and one bias per filter at
    // the END of the parameter tensor, so the bias region begins at
    // size-num_filters.  Passing l.num_filters() directly would apply the
    // bias multipliers to the wrong range.
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
    return v;
}
......@@ -76,9 +109,49 @@ namespace dlib
}
private:
template <typename layer_type>
void update_considering_bias(
    const float learning_rate,
    const layer_type& l,
    const tensor& params_grad,
    unsigned long bias_offset
)
/*!
    requires
        - bias_offset marks the start of the bias region, i.e. params are laid
          out as [0,bias_offset) weights followed by [bias_offset,size) biases.
    ensures
        - Performs the momentum SGD update
              v = momentum*v - wd*lr*params - lr*params_grad
          where lr and wd are the nominal learning rate and weight decay scaled
          by the layer's multipliers, with the bias multipliers additionally
          applied to the bias region.
!*/
{
    const tensor& params = l.get_layer_params();
    DLIB_CASSERT(params.size() != 0,"");

    // Lazily allocate the momentum term the first time this layer is updated.
    if (v.size() == 0)
    {
        v.copy_size(params_grad);
        v = 0;
    }

    const double lr = learning_rate*get_learning_rate_multiplier(l);
    const double wd = weight_decay*get_weight_decay_multiplier(l);

    if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
    {
        // Bias multipliers are the identity, so one pass over the whole
        // tensor suffices.
        tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
    }
    else
    {
        // Update the weights with the base coefficients...
        tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
        // ...then the biases with the bias multipliers folded in.
        const double bias_lr = lr*l.get_bias_learning_rate_multiplier();
        const double bias_wd = wd*l.get_bias_weight_decay_multiplier();
        tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -bias_wd*bias_lr, -bias_lr);
    }
}
resizable_tensor v;
float weight_decay;
float momentum;
};
// ----------------------------------------------------------------------------------------
......@@ -131,12 +204,47 @@ namespace dlib
}
++t;
tt::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1, momentum2, params, params_grad);
tt::compute_adam_update(0, params.size(), s, m, v, t,
learning_rate*get_learning_rate_multiplier(l),
weight_decay*get_weight_decay_multiplier(l),
momentum1, momentum2, params, params_grad);
return s;
}
template <unsigned long N>
const tensor& operator() (
    const float learning_rate,
    const fc_<N,FC_HAS_BIAS>& l,
    const tensor& params_grad
)
/*!
    Applies the ADAM update to a fully connected layer with a bias term,
    honoring the layer's bias-specific learning rate and weight decay
    multipliers.
!*/
{
    // The get_num_outputs() bias values of an fc_ layer sit at the END of the
    // parameter tensor, so the bias region starts at size-num_outputs.
    // Passing l.get_num_outputs() directly would treat most of the weights as
    // biases.
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.get_num_outputs());
    return s;
}
template <
    long _num_filters,
    long _nr,
    long _nc,
    int _stride_y,
    int _stride_x,
    int _padding_y,
    int _padding_x
    >
const tensor& operator() (
    const float learning_rate,
    const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
    const tensor& params_grad
)
/*!
    Applies the ADAM update to a convolutional layer, honoring the layer's
    bias-specific learning rate and weight decay multipliers.
!*/
{
    // A con_ layer keeps one bias per filter at the END of the parameter
    // tensor, so the bias region starts at size-num_filters, not at
    // num_filters.
    update_considering_bias(learning_rate, l, params_grad, params_grad.size()-l.num_filters());
    return s;
}
friend void serialize(const adam& item, std::ostream& out)
{
serialize("adam2", out);
......@@ -165,6 +273,49 @@ namespace dlib
}
private:
template <typename layer_type>
void update_considering_bias(
    const float learning_rate,
    const layer_type& l,
    const tensor& params_grad,
    unsigned long bias_offset
)
/*!
    requires
        - bias_offset marks the start of the bias region, i.e. params are laid
          out as [0,bias_offset) weights followed by [bias_offset,size) biases.
    ensures
        - Performs one ADAM step over the whole parameter tensor, using the
          layer-scaled learning rate and weight decay, and additionally
          applying the layer's bias multipliers to the bias region.
!*/
{
    const tensor& params = l.get_layer_params();
    DLIB_CASSERT(params.size() != 0,"");

    // Lazily allocate the ADAM state tensors on first use.  s is write-only
    // in compute_adam_update so it is sized but not zeroed.
    if (v.size() == 0)
    {
        m.copy_size(params_grad);
        m = 0;
        v.copy_size(params_grad);
        v = 0;
        s.copy_size(params_grad);
    }

    ++t;

    const double lr = learning_rate*get_learning_rate_multiplier(l);
    const double wd = weight_decay*get_weight_decay_multiplier(l);

    if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
    {
        // Identity bias multipliers: one pass over everything.
        tt::compute_adam_update(0, params.size(), s, m, v, t, lr, wd,
            momentum1, momentum2, params, params_grad);
    }
    else
    {
        // Weights get the base coefficients...
        tt::compute_adam_update(0, bias_offset, s, m, v, t, lr, wd,
            momentum1, momentum2, params, params_grad);
        // ...while biases get the bias multipliers folded in.
        tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
            lr*l.get_bias_learning_rate_multiplier(),
            wd*l.get_bias_weight_decay_multiplier(),
            momentum1, momentum2, params, params_grad);
    }
}
resizable_tensor m;
resizable_tensor v;
resizable_tensor s;
......
......@@ -78,6 +78,15 @@ namespace dlib
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate multipliers to be applied
to fc_ and con_ bias parameters.
!*/
public:
......@@ -123,6 +132,15 @@ namespace dlib
paper:
Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
optimization." International Conference on Learning Representation. 2015.
Note that the actual learning rate and weight decay used by the solver are
multiplied by the per layer multipliers. That is, the solver will call
get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
multiply these values with the nominal learning rate and weight decay,
respectively, to determine the values it will use during each step. It is
also overloaded to allow additional learning rate multipliers to be applied
to fc_ and con_ bias parameters.
!*/
public:
......
......@@ -311,6 +311,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -324,10 +326,10 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#else
cpu::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
momentum2, params, params_grad);
#endif
}
......
......@@ -335,6 +335,8 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
......@@ -354,12 +356,16 @@ namespace dlib { namespace tt
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
- begin <= end <= params.size()
ensures
- This function implements the ADAM parameter update method described in the paper:
Kingma, Diederik P., and Jimmy Ba Adam. "A method for stochastic
optimization." International Conference on Learning Representation. 2015.
Specifically, it implements the method shown as Algorithm 1.
- #s is the update vector that should be added to the parameters.
- The function only operates in the half open range [begin,end) of the memory
blocks of each tensor. E.g. to make this function run on the entire tensor
set begin to 0 and end to params.size().
!*/
// ----------------------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment