Merge branch 'master' of https://github.com/davisking/dlib into dnn_group_layer

93e786db · Fm · 59892409 · 91163863 · 93e786db · 93e786db
Commit 93e786db authored May 26, 2016 by Fm
23 changed files
--- a/dlib/algs.h
+++ b/dlib/algs.h
@@ -488,6 +488,13 @@ namespace dlib

 // ----------------------------------------------------------------------------------------

+    struct general_ {};  // tag type taken by the catch-all (lowest priority) overload in tag dispatch
+    struct special_ : general_ {};  // tag type taken by the preferred overload; derives from general_ so a special_ argument can still bind to the catch-all
+    template<typename> struct int_ { typedef int type; };  // int_<X>::type is int for any X; used so an expression inside decltype(...) can enable/disable a template
+
+// ----------------------------------------------------------------------------------------
+
+
    /*!A is_same_object 

        This is a templated function which checks if both of its arguments are actually

--- a/dlib/dnn/core.h
+++ b/dlib/dnn/core.h
@@ -24,6 +24,38 @@
 namespace dlib
 {

+// ----------------------------------------------------------------------------------------
+
+    namespace impl  // overload-selection helpers for the public get_learning_rate_multiplier() defined after this namespace
+    {
+        template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>
+        double get_learning_rate_multiplier (  // only viable when &T::get_learning_rate_multiplier is a well-formed expression
+            const T& obj,
+            special_  // exact match for the special_() tag passed by the wrapper, so this overload wins when it is viable
+        ) { return obj.get_learning_rate_multiplier(); }
+        // Fallback overload: selected when the overload above is not viable for T.
+        template <typename T>
+        double get_learning_rate_multiplier ( const T& obj, general_) { return 1; }  // obj is unused; the multiplier is always 1 here
+    }
+    template <typename T>  // public entry point: obj.get_learning_rate_multiplier() when T provides it, otherwise 1
+    double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl  // overload-selection helpers for the public get_weight_decay_multiplier() defined after this namespace
+    {
+        template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0>
+        double get_weight_decay_multiplier (  // only viable when &T::get_weight_decay_multiplier is a well-formed expression
+            const T& obj,
+            special_  // exact match for the special_() tag passed by the wrapper, so this overload wins when it is viable
+        ) { return obj.get_weight_decay_multiplier(); }
+        // Fallback overload: selected when the overload above is not viable for T.
+        template <typename T>
+        double get_weight_decay_multiplier ( const T& obj, general_) { return 1; }  // obj is unused; the multiplier is always 1 here
+    }
+    template <typename T>  // public entry point: obj.get_weight_decay_multiplier() when T provides it, otherwise 1
+    double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); }
+
 // ----------------------------------------------------------------------------------------

    namespace impl
@@ -458,7 +490,7 @@ namespace dlib

        sstack pop(size_t num=1) 
        { 
-            DLIB_CASSERT(num < size(), "You can't pop more things from the stack than it has in it.");
+            DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it.");  // num == size() is allowed: popping every element is not an error
            return sstack(data+num, mysize-num);
        }

@@ -849,8 +881,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
-            if (params_grad.size() != 0)
+            // Don't try to adjust the parameters if this layer doesn't have any or the
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1200,8 +1233,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
-            if (params_grad.size() != 0) 
+            // Don't try to adjust the parameters if this layer doesn't have any or the
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) 
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1817,9 +1851,7 @@ namespace dlib
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
-        // This layer counts as a computational layer because it copies and stores the
-        // inputs.
-        const static size_t num_computational_layers = 1;
+        const static size_t num_computational_layers = 0;
        const static size_t num_layers = 2;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,

--- a/dlib/dnn/core_abstract.h
+++ b/dlib/dnn/core_abstract.h
@@ -67,6 +67,32 @@ namespace dlib
              (except computes it using a numerically accurate method)
    !*/

+// ----------------------------------------------------------------------------------------
+
+    template <typename T>
+    double get_learning_rate_multiplier(
+        const T& obj
+    ); 
+    /*!
+        ensures
+            - if (obj has a get_learning_rate_multiplier() member function) then
+                - returns obj.get_learning_rate_multiplier()
+            - else
+                - returns 1
+    !*/
+
+    template <typename T>
+    double get_weight_decay_multiplier(
+        const T& obj
+    ); 
+    /*!
+        ensures
+            - if (obj has a get_weight_decay_multiplier() member function) then
+                - returns obj.get_weight_decay_multiplier()
+            - else
+                - returns 1
+    !*/
+
 // ----------------------------------------------------------------------------------------

    bool dnn_prefer_fastest_algorithms(
@@ -152,7 +178,7 @@ namespace dlib
        ); 
        /*!
            requires
-                - num < size()
+                - num <= size()
            ensures
                - returns a reference to the sub-stack S such that:
                    - S.size() == size()-num.

--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -385,6 +385,30 @@ namespace dlib
                d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
        }

+        void affine_transform_range(  // dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for each i in [begin, end); no constant term
+            size_t begin,  // first element index that is written
+            size_t end,  // one past the last element index that is written
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,  // coefficient applied to src1
+            const float B,  // coefficient applied to src2
+            const float C  // coefficient applied to src3
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");  // the three sources must each have as many elements as dest
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");  // the range must be ordered and lie inside dest; begin == end is accepted
+            const auto d = dest.host();
+            const auto s1 = src1.host();
+            const auto s2 = src2.host();
+            const auto s3 = src3.host();
+            for (size_t i = begin; i < end; ++i)  // this loop assigns nothing outside [begin, end)
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+        }
+
    // -----------------------------------------------------------------------------------

        void affine_transform(
@@ -464,6 +488,8 @@ namespace dlib
    // -----------------------------------------------------------------------------------

        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -480,6 +506,7 @@ namespace dlib
                         s.size() == v.size() &&
                         s.size() == params.size() &&
                         s.size() == params_grad.size(),"");
+            DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float eps = 1e-8;
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));

@@ -492,7 +519,7 @@ namespace dlib
            auto ps = s.host_write_only();
            auto pparams = params.host();
            auto ppgrad = params_grad.host();
-            for (size_t i = 0; i < params.size(); ++i)
+            for (size_t i = begin; i < end; ++i)
            {
                float g = weight_decay*pparams[i] + ppgrad[i];
                pm[i] = momentum1*pm[i] + (1-momentum1)*g;
@@ -504,6 +531,7 @@ namespace dlib
    // -----------------------------------------------------------------------------------

        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -519,7 +547,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) && 
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -538,7 +567,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);

@@ -554,7 +584,7 @@ namespace dlib
            {
                for (long k = 0; k < num; ++k)
                {
-                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
+                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
                    ++d;
                    ++s;
                }
@@ -562,6 +592,7 @@ namespace dlib
        }

        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -582,7 +613,8 @@ namespace dlib
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -593,7 +625,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );

            dest.copy_size(src);
@@ -635,7 +668,7 @@ namespace dlib
                else
                    rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;

-                p_invstds[i] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
            }

            p_src = src.host();
@@ -662,6 +695,7 @@ namespace dlib
        }

        void batch_normalize_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -682,6 +716,7 @@ namespace dlib
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");

            beta_grad = 0;
            gamma_grad = 0;
@@ -757,6 +792,7 @@ namespace dlib
    // ----------------------------------------------------------------------------------------

        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -772,7 +808,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -791,7 +828,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);

@@ -807,7 +845,7 @@ namespace dlib
            {
                for (long k = 0; k < src.k(); ++k)
                {
-                    const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
+                    const float invstd = 1.0f/std::sqrt(v[k] + eps);
                    for (long j = 0; j < num; ++j)
                    {
                        *d = g[k]*(*s - m[k])*invstd + b[k];
@@ -819,6 +857,7 @@ namespace dlib
        }

        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -841,7 +880,8 @@ namespace dlib
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -852,7 +892,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc()  <<
+                "\neps:  " << eps 
            );

            dest.copy_size(src);
@@ -900,7 +941,7 @@ namespace dlib
                else
                    rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;

-                p_invstds[k] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }

            p_src = src.host();
@@ -928,6 +969,7 @@ namespace dlib
        }

        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -948,6 +990,7 @@ namespace dlib
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");

            beta_grad = 0;
            gamma_grad = 0;

--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -81,6 +81,18 @@ namespace dlib
            const float D
        );

+        void affine_transform_range(  // NOTE(review): expected to set dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for i in [begin, end) — confirm against the definition in cpu_dlib.cpp
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
    // -----------------------------------------------------------------------------------

        void affine_transform(
@@ -102,6 +114,8 @@ namespace dlib
    // -----------------------------------------------------------------------------------

        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -117,6 +131,7 @@ namespace dlib
    // -----------------------------------------------------------------------------------

        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -126,6 +141,7 @@ namespace dlib
        );

        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -138,6 +154,7 @@ namespace dlib
        );

        void batch_normalize_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -149,6 +166,7 @@ namespace dlib
        );

        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -158,6 +176,7 @@ namespace dlib
        );

        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -170,6 +189,7 @@ namespace dlib
        );

        void batch_normalize_conv_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,

--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -504,6 +504,40 @@ namespace dlib
                src2.device(), src3.device(), dest.size(), A, B, C, D);
        }

+    // ----------------------------------------------------------------------------------------
+
+        __global__ void _cuda_affine_transform_range(  // kernel: d[i] = A*s1[i] + B*s2[i] + C*s3[i] for the indices produced by grid_stride_range(begin, end)
+            float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
+        )
+        {
+            for (auto i : grid_stride_range(begin, end))  // indices start at begin rather than 0, so i addresses the arrays directly with no offset added
+            {
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+            }
+        }
+
+
+        void affine_transform_range(  // validates the arguments, then launches _cuda_affine_transform_range on the tensors' device pointers
+            size_t begin,  // first element index handed to the kernel
+            size_t end,  // end bound handed to the kernel; must not exceed dest.size()
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");  // the three sources must each have as many elements as dest
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");  // the range must be ordered and lie inside dest
+            launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),  // job count comes from the range length, not dest.size(). NOTE(review): begin == end passes the assert above — confirm max_jobs(0) is handled
+                dest.device(), src1.device(),
+                src2.device(), src3.device(), begin, end, A, B, C);
+        }
+
    // -----------------------------------------------------------------------------------

        __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
@@ -549,7 +583,8 @@ namespace dlib
    // ----------------------------------------------------------------------------------------

        __global__ void _cuda_compute_adam_update(
-            size_t n,
+            size_t begin,
+            size_t end,
            float* s,
            float* m,
            float* v,
@@ -566,7 +601,7 @@ namespace dlib
            //   m = momentum1*m + (1-momentum1)    *   (weight_decay*params + params_grad);
            //   v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
            //   s = -alpha*m/(sqrt(v) + eps);
-            for (auto i : grid_stride_range(0, n))
+            for (auto i : grid_stride_range(begin, end))
            {
                float g = (weight_decay*params[i] + params_grad[i]);
                m[i] = momentum1*m[i] + (1-momentum1)*g;
@@ -576,6 +611,8 @@ namespace dlib
        }

        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -592,10 +629,11 @@ namespace dlib
                         s.size() == v.size() &&
                         s.size() == params.size() &&
                         s.size() == params_grad.size(),"");
+            DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));

-            launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
-                    s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
+            launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
+                    begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
                    momentum1, momentum2, params.device(), params_grad.device());
        }


--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -164,6 +164,18 @@ namespace dlib
            const float D
        );

+        void affine_transform_range(  // NOTE(review): expected to set dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for i in [begin, end) — confirm against the definition in cuda_dlib.cu
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
        // Note that this function isn't in the tt:: namespace because add_scaled() is
        // called by cuda::add() so we don't need a tt:: version of add_scaled().  
        void add_scaled(
@@ -193,6 +205,8 @@ namespace dlib
    // ----------------------------------------------------------------------------------------

        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,

--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -338,6 +338,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------

        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -353,7 +354,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) && 
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -372,7 +374,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -393,10 +396,11 @@ namespace dlib
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS));
+                                eps));
        }

        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -417,7 +421,8 @@ namespace dlib
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -428,7 +433,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );

            const float in_scale = 1;
@@ -455,12 +461,13 @@ namespace dlib
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }

        void batch_normalize_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -480,6 +487,7 @@ namespace dlib
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");

            const float in_scale = 1;
            const float out_scale = 1;
@@ -503,7 +511,7 @@ namespace dlib
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }
@@ -511,6 +519,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------

        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -526,7 +535,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -545,7 +555,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -566,10 +577,11 @@ namespace dlib
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS));
+                                eps));
        }

        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -592,7 +604,8 @@ namespace dlib
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -603,7 +616,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -629,12 +643,13 @@ namespace dlib
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }

        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -653,6 +668,7 @@ namespace dlib
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");

            const float in_scale = 1;
            const float out_scale = 1;
@@ -676,7 +692,7 @@ namespace dlib
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }

--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -135,6 +135,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------

        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -144,6 +145,7 @@ namespace dlib
        );

        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -156,6 +158,7 @@ namespace dlib
        );

        void batch_normalize_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -169,6 +172,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------

        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -178,6 +182,7 @@ namespace dlib
        );

        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -190,6 +195,7 @@ namespace dlib
        );

        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,

--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
--- a/dlib/dnn/solvers.h
+++ b/dlib/dnn/solvers.h
@@ -6,6 +6,7 @@
 #include "solvers_abstract.h"
 #include "tensor.h"
 #include <iostream>
+#include "layers.h"

 namespace dlib
 {
@@ -49,10 +50,53 @@ namespace dlib
                v = 0;
            }

-            //perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
-            tt::affine_transform(v, v, params, params_grad, 
-                               momentum, -weight_decay*learning_rate, -learning_rate, 0);
+            const double lr = learning_rate*get_learning_rate_multiplier(l);
+            const double wd = weight_decay*get_weight_decay_multiplier(l);
            
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+            tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+
+            return v;
+        }
+
+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return v;
+        }
+
+        template <
+            long _num_filters,
+            long _nr,
+            long _nc,
+            int _stride_y,
+            int _stride_x,
+            int _padding_y,
+            int _padding_x
+            >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return v;
+        }
+
+        template < layer_mode mode >
+        const tensor& operator() (
+            const float learning_rate,
+            const bn_<mode>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
            return v;
        }

@@ -76,9 +120,49 @@ namespace dlib
        }

    private:
+
+        template <typename layer_type> 
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                v.copy_size(params_grad);
+                v = 0;
+            }
+
+            double lr = learning_rate*get_learning_rate_multiplier(l);
+            double wd = weight_decay*get_weight_decay_multiplier(l);
+            
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+            else
+            {
+
+                tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
+
+                // now update the biases but apply their multipliers
+                lr *= l.get_bias_learning_rate_multiplier();
+                wd *= l.get_bias_weight_decay_multiplier();
+                tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+        }
+
        resizable_tensor v;
        float weight_decay;
        float momentum;
+
    };

 // ----------------------------------------------------------------------------------------
@@ -132,11 +216,57 @@ namespace dlib

            ++t;

-            tt::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1, momentum2, params, params_grad);
+            
+            tt::compute_adam_update(0, params.size(), s, m, v, t,
+                learning_rate*get_learning_rate_multiplier(l),
+                weight_decay*get_weight_decay_multiplier(l), 
+                momentum1, momentum2, params, params_grad);

            return s;
        }

+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return s;
+        }
+
+        template <
+            long _num_filters,
+            long _nr,
+            long _nc,
+            int _stride_y,
+            int _stride_x,
+            int _padding_y,
+            int _padding_x
+            >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return s;
+        }
+
+        template < layer_mode mode >
+        const tensor& operator() (
+            const float learning_rate,
+            const bn_<mode>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
+            return s;
+        }
+
+
        friend void serialize(const adam& item, std::ostream& out)
        {
            serialize("adam2", out);
@@ -165,6 +295,49 @@ namespace dlib
        }

    private:
+
+        template <typename layer_type> 
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                m.copy_size(params_grad);
+                m = 0;
+                v.copy_size(params_grad);
+                v = 0;
+                s.copy_size(params_grad);
+            }
+
+
+            ++t;
+
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::compute_adam_update(0, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l), 
+                    momentum1, momentum2, params, params_grad);
+            }
+            else
+            {
+                tt::compute_adam_update(0, bias_offset, s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l), 
+                    momentum1, momentum2, params, params_grad);
+
+                tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
+                    weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(), 
+                    momentum1, momentum2, params, params_grad);
+            }
+        }
        resizable_tensor m;
        resizable_tensor v;
        resizable_tensor s;

--- a/dlib/dnn/solvers_abstract.h
+++ b/dlib/dnn/solvers_abstract.h
@@ -78,6 +78,15 @@ namespace dlib
                    V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
                Here V is a momentum term that is remembered by the solver from one
                invocation of operator() to the next.  
+
+
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate and weight decay multipliers
+                to be applied to fc_, con_, and bn_ bias parameters.
        !*/
    public:

@@ -123,6 +132,15 @@ namespace dlib
                paper:
                    Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
                    optimization." International Conference on Learning Representations. 2015.
+
+
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate and weight decay multipliers
+                to be applied to fc_, con_, and bn_ bias parameters.
        !*/

    public:

--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@@ -240,6 +240,42 @@ namespace dlib { namespace tt
 #endif
    }

+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#endif
+    }
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------

    void affine_transform(
@@ -275,6 +311,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

    void compute_adam_update (
+        size_t begin,
+        size_t end,
        tensor& s,
        tensor& m,
        tensor& v,
@@ -288,10 +326,10 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
            momentum2, params, params_grad);
 #else
-        cpu::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
            momentum2, params, params_grad);
 #endif
    }
@@ -299,6 +337,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

    void batch_normalize_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -308,13 +347,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
+        cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #else
-        cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
+        cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #endif
    }

    void batch_normalize (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& vars,
@@ -327,13 +367,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #else
-        cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #endif
    }

    void batch_normalize_gradient (
+        const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -346,15 +387,16 @@ namespace dlib { namespace tt
    {
             
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #else
-        cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #endif
    }

 // ----------------------------------------------------------------------------------------

    void batch_normalize_conv_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -364,13 +406,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
+        cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #else
-        cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
+        cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #endif
    }

    void batch_normalize_conv (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& vars,
@@ -383,13 +426,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #else
-        cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #endif
    }

    void batch_normalize_conv_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -402,9 +446,9 @@ namespace dlib { namespace tt
    {
             
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #else
-        cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #endif
    }


--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -229,13 +229,58 @@ namespace dlib { namespace tt
        const float D
    );
    /*!
-        requires - dest.size()==src1.size()
+        requires 
+            - dest.size()==src1.size()
            - dest.size()==src2.size()
            - dest.size()==src3.size()
        ensures
            - #dest == A*src1 + B*src2 + C*src3 + D
    !*/

+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+        ensures
+            - #dest == A*src1 + B*src2 + C*src3
+    !*/
+
+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+            - begin <= end <= dest.size()
+        ensures
+            - This function operates much like
+              affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
+              the half open range [begin,end) rather than processing the entire tensor.
+              Specifically, it does this:
+                - for i in the range [begin, end):
+                    - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
+    !*/
+
 // ----------------------------------------------------------------------------------------

    void affine_transform(
@@ -290,6 +335,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

    void compute_adam_update (
+        size_t begin,
+        size_t end,
        tensor& s,
        tensor& m,
        tensor& v,
@@ -309,19 +356,22 @@ namespace dlib { namespace tt
            - weight_decay >= 0
            - 0 <= momentum1 < 1
            - 0 <= momentum2 < 1
+            - begin <= end <= params.size()
        ensures
            - This function implements the ADAM parameter update method described in the paper:
                Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
                optimization." International Conference on Learning Representations. 2015.
              Specifically, it implements the method shown as Algorithm 1.
            - #s is the update vector that should be added to the parameters.
+            - The function only operates in the half open range [begin,end) of the memory
+              blocks of each tensor.  E.g. to make this function run on the entire tensor
+              set begin to 0 and end to params.size().
    !*/

 // ----------------------------------------------------------------------------------------

-    const double BATCH_NORM_EPS = 0.00001;
-
    void batch_normalize_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -331,6 +381,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - gamma.num_samples() == 1 
            - gamma.nr() == src.nr() 
            - gamma.nc() == src.nc() 
@@ -342,11 +393,12 @@ namespace dlib { namespace tt
            - Linearly transforms src as a call to batch_normalize() would if src had means
              and variances as given by running_means and running_variances.  That is, this
              function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
              Note that it does it in a pointwise fashion over the samples in src.
    !*/

    void batch_normalize (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& invstds,
@@ -359,6 +411,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - src.num_samples() > 1
            - gamma.num_samples() == 1
            - beta.num_samples() == 1
@@ -384,6 +437,7 @@ namespace dlib { namespace tt
    !*/

    void batch_normalize_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -395,8 +449,9 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - invstds and means should be the output of a call to
-              batch_normalize(dest,means,invstds,src,gamma,beta)
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta)
            - have_same_dimensions(gradient_input, src) == true
            - have_same_dimensions(src, src_grad) == true
            - src.num_samples() > 1
@@ -410,7 +465,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(invstds, gamma) == true
        ensures
            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize(dest,means,invstds,src,gamma,beta))
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta))
            - Adds the gradient of f() with respect to src to #src_grad.
            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
            - Assigns the gradient of f() with respect to beta to #beta_grad.
@@ -419,6 +474,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------

    void batch_normalize_conv_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -428,6 +484,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - gamma.num_samples() == 1 
            - gamma.nr() == 1 
            - gamma.nc() == 1 
@@ -439,12 +496,13 @@ namespace dlib { namespace tt
            - Linearly transforms src as a call to batch_normalize_conv() would if src had
              means and variances as given by running_means and running_variances.  That
              is, this function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
              Note that it does this in a pointwise fashion over the samples, rows, and
              columns in src.
    !*/

    void batch_normalize_conv (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& invstds,
@@ -457,6 +515,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - src.num_samples() > 1
            - gamma.num_samples()==gamma.nr()==gamma.nc() == 1
            - beta.num_samples() ==beta.nr() ==beta.nc() == 1
@@ -478,6 +537,7 @@ namespace dlib { namespace tt
    !*/

    void batch_normalize_conv_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -489,8 +549,9 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - invstds and means should be the output of a call to
-              batch_normalize_conv(dest,means,invstds,src,gamma,beta)
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
            - have_same_dimensions(gradient_input, src) == true
            - have_same_dimensions(src, src_grad) == true
            - src.num_samples() > 1
@@ -502,7 +563,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(invstds, gamma) == true
        ensures
            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize_conv(dest,means,invstds,src,gamma,beta))
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
            - Adds the gradient of f() with respect to src to #src_grad.
            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
            - Assigns the gradient of f() with respect to beta to #beta_grad.

--- a/dlib/dnn/trainer.h
+++ b/dlib/dnn/trainer.h
@@ -526,8 +526,7 @@ namespace dlib
            label_type pick_which_run_update;
            job_t next_job;

-            std::vector<std::future<double>> losses(devices.size());
-            std::vector<std::future<void>> update_futs(devices.size());
+            std::vector<dlib::future<double>> losses(devices.size());

            std::vector<tt::multi_device_tensor_averager> averagers;
            // An array of all the parameter tensors in the first network.  We will
@@ -536,6 +535,16 @@ namespace dlib
            std::vector<tensor*> reference_params;
            visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

+            // We make separate thread pools with just one thread in them because we want
+            // to make sure each device is always executed on the same thread.  We care
+            // about this because there are thread_local context variables for some cuda
+            // components and they get regenerated when the current cuda device changes.
+            // Recreating them over and over is somewhat expensive so we want to avoid
+            // that.
+            std::vector<std::shared_ptr<thread_pool>> tp;
+            for (size_t i = 0; i < devices.size(); ++i)
+                tp.push_back(std::make_shared<thread_pool>(1));
+

            size_t iteration = 0;
            while(job_pipe.dequeue(next_job))
@@ -545,7 +554,7 @@ namespace dlib
                // right version for unsupervised or supervised training based on the type
                // of label_type.
                for (size_t i = 0; i < devices.size(); ++i)
-                    losses[i] = std::async(std::launch::async,[&,i](){ return compute_parameter_gradients(i, next_job, pick_which_run_update); });
+                    tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
                // aggregate loss values from all the network computations.
                double theloss = 0;
                for (auto&& loss : losses)
@@ -596,10 +605,10 @@ namespace dlib

                // Now apply all the updates to each device.
                for (size_t i = 0; i < devices.size(); ++i)
-                    update_futs[i] = std::async(std::launch::async, [&,i](){ if (next_job.have_data[i]) update_parameters(i); });
+                    tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
                // and wait for the updates to all happen.
-                for (auto&& f : update_futs)
-                    f.wait();
+                for (size_t i = 0; i < devices.size(); ++i)
+                    tp[i]->wait_for_all_tasks();


                // Every now and then force all the parameters to be the same just to make

--- a/dlib/optimization/optimization.h
+++ b/dlib/optimization/optimization.h
@@ -482,7 +482,7 @@ namespace dlib
            << "\n\t x_upper.size():         " << x_upper.size()
        );
        DLIB_ASSERT (
-            min(x_upper-x_lower) > 0,
+            min(x_upper-x_lower) >= 0,
            "\tdouble find_min_box_constrained()"
            << "\n\t You have to supply proper box constraints to this function."
            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
@@ -610,7 +610,7 @@ namespace dlib
            << "\n\t x_upper.size():         " << x_upper.size()
        );
        DLIB_ASSERT (
-            min(x_upper-x_lower) > 0,
+            min(x_upper-x_lower) >= 0,
            "\tdouble find_max_box_constrained()"
            << "\n\t You have to supply proper box constraints to this function."
            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)

--- a/dlib/optimization/optimization_abstract.h
+++ b/dlib/optimization/optimization_abstract.h
@@ -297,7 +297,7 @@ namespace dlib
            - is_col_vector(x_upper) == true
            - x.size() == x_lower.size() == x_upper.size()
              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
-            - min(x_upper-x_lower) > 0
+            - min(x_upper-x_lower) >= 0
              (i.e. x_upper must contain upper bounds relative to x_lower)
        ensures
            - Performs a box constrained minimization of the function f() using the given
@@ -391,7 +391,7 @@ namespace dlib
            - is_col_vector(x_upper) == true
            - x.size() == x_lower.size() == x_upper.size()
              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
-            - min(x_upper-x_lower) > 0
+            - min(x_upper-x_lower) >= 0
              (i.e. x_upper must contain upper bounds relative to x_lower)
        ensures
            - Performs a box constrained maximization of the function f() using the given

--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
--- a/examples/dnn_mnist_advanced_ex.cpp
+++ b/examples/dnn_mnist_advanced_ex.cpp
--- a/examples/webcam_face_pose_ex.cpp
+++ b/examples/webcam_face_pose_ex.cpp
@@ -42,6 +42,12 @@ int main()
    try
    {
        cv::VideoCapture cap(0);
+        if (!cap.isOpened())
+        {
+            cerr << "Unable to connect to camera" << endl;
+            return 1;
+        }
+
        image_window win;

        // Load face detection and pose estimation models.

--- a/tools/visual_studio_natvis/README.txt
+++ b/tools/visual_studio_natvis/README.txt
+Hi Davis,
+thanks for your work on dlib!
+ 
+I have created a natvis file to have nicer debugger visualization of dlib matrices in Visual Studio (2012 - …) and I just wanted to share it with you.
+ 
+To test it, copy the file into your folder %USERPROFILE%\My Documents\Visual Studio 2015\Visualizers or %VSINSTALLDIR%\Common7\Packages\Debugger\Visualizers, as described here: https://msdn.microsoft.com/en-us/library/jj620914.aspx
+ 
+It can certainly be extended, in particular to integrate it with Image Watch, but even in its current form it may help users debug much faster.
+ 
+Feel free to share it.
+Best,
+    Johannes Huber
--- a/tools/visual_studio_natvis/dlib.natvis
+++ b/tools/visual_studio_natvis/dlib.natvis
+<?xml version="1.0" encoding="utf-8"?>
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+  <!-- dlib matrix debugger visualization in Visual Studio-->
+  <!-- Johannes Huber, SAFEmine Part of Hexagon -->
+  <!-- no warranty -->
+
+  <!-- general dlib::matrix fixed size-->
+  <!-- Both dimensions are taken from the template arguments: the summary shows
+       $T2 x $T3, and the expansion lists $T2 * $T3 elements starting at
+       data.data, viewed as a pointer to $T1. -->
+   <Type Name="dlib::matrix&lt;*,*,*,*&gt;">
+   <DisplayString>{{ size= &lt;{$T2}&gt; x &lt;{$T3}&gt; }}</DisplayString>
+   <Expand>
+      <ArrayItems>
+         <Size>$T2 * $T3</Size>
+         <ValuePointer>($T1*)data.data</ValuePointer>
+      </ArrayItems>
+   </Expand>
+  </Type>
+
+  <!-- general dlib::matrix with second template argument 0: the row count is read
+       at debug time from data.nr_, the column count is the template argument
+       matched by $T2. Elements are only listed when data.data is non-null. -->
+  <Type Name="dlib::matrix&lt;*,0,*,*&gt;">
+   <DisplayString>{{ size={data.nr_} x &lt;{$T2}&gt; }}</DisplayString>
+   <Expand>
+      <ArrayItems Condition="data.data != 0">
+         <Size>data.nr_ * $T2</Size>
+         <ValuePointer>($T1*)data.data</ValuePointer>
+      </ArrayItems>
+   </Expand>
+  </Type>
+
+  <!-- general dlib::matrix with third template argument 0: the row count is the
+       template argument matched by $T2, the column count is read at debug time
+       from data.nc_. Elements are only listed when data.data is non-null. -->
+  <Type Name="dlib::matrix&lt;*,*,0,*&gt;">
+   <DisplayString>{{ size= &lt;{$T2}&gt; x {data.nc_} }}</DisplayString>
+   <Expand>
+      <ArrayItems Condition="data.data != 0">
+         <Size>$T2 * data.nc_</Size>
+         <ValuePointer>($T1*)data.data</ValuePointer>
+      </ArrayItems>
+   </Expand>
+  </Type>
+
+  <!-- general dlib::matrix dynamic size-->
+  <!-- Both dimensions are read at debug time: the summary shows
+       data.nr_ x data.nc_ (rows x columns, matching the other entries), and the
+       expansion lists data.nr_*data.nc_ elements starting at data.data.
+       Elements are only listed when data.data is non-null. -->
+  <Type Name="dlib::matrix&lt;*,0,0,*&gt;">
+   <DisplayString>{{ size= {data.nr_} x {data.nc_} }}</DisplayString>
+   <Expand>
+      <ArrayItems Condition="data.data != 0">
+         <Size>data.nr_*data.nc_</Size>
+         <ValuePointer>($T1*)data.data</ValuePointer>
+      </ArrayItems>
+   </Expand>
+  </Type>
+
+</AutoVisualizer>
\ No newline at end of file