merged

50012d2c · Davis King · 28da9a42 · 3e559e42 · 50012d2c · 50012d2c
Commit 50012d2c authored Sep 24, 2013 by Davis King
3 changed files
--- a/dlib/optimization/optimization.h
+++ b/dlib/optimization/optimization.h
@@ -456,6 +456,11 @@ namespace dlib
        const matrix_exp<EXP2>& x_upper
    )
    {
+        /*
+            The implementation of this function is more or less based on the discussion in
+            the paper Projected Newton-type Methods in Machine Learning by Mark Schmidt, et al.
+        */
        // make sure the requires clause is not violated
        COMPILE_TIME_ASSERT(is_matrix<T>::value);
        DLIB_ASSERT (
@@ -490,6 +495,7 @@ namespace dlib
        // active constraint.
        const double gap_eps = 1e-8;
+        double last_alpha = 1;
        while(stop_strategy.should_continue_search(x, f_value, g))
        {
            s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
@@ -499,10 +505,19 @@ namespace dlib
                        make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value),
                        f_value,
                        dot(g,s), // compute gradient for the line search
-                        1, 
+                        last_alpha, 
                        search_strategy.get_wolfe_rho(), 
                        search_strategy.get_max_line_search_iterations());
+            // Do a trust region style thing for alpha.  The idea is that if we take a
+            // small step then we are likely to take another small step.  So we reuse the
+            // alpha from the last iteration unless the line search didn't shrink alpha at
+            // all.  In that case, we start with a bigger alpha next time (10 times
+            // larger, but never more than 1).
+            if (alpha == last_alpha)
+                last_alpha = std::min(last_alpha*10,1.0);
+            else
+                last_alpha = alpha;
            // Take the search step indicated by the above line search
            x = clamp(x + alpha*s, x_lower, x_upper);
            g = der(x);
@@ -601,6 +616,7 @@ namespace dlib
        // active constraint.
        const double gap_eps = 1e-8;
+        double last_alpha = 1;
        while(stop_strategy.should_continue_search(x, f_value, g))
        {
            s = search_strategy.get_next_direction(x, f_value, zero_bounded_variables(gap_eps, g, x, g, x_lower, x_upper));
@@ -610,10 +626,19 @@ namespace dlib
                        negate_function(make_line_search_function(clamp_function(f,x_lower,x_upper), x, s, f_value)),
                        f_value,
                        dot(g,s), // compute gradient for the line search
-                        1, 
+                        last_alpha, 
                        search_strategy.get_wolfe_rho(), 
                        search_strategy.get_max_line_search_iterations());
+            // Do a trust region style thing for alpha.  The idea is that if we take a
+            // small step then we are likely to take another small step.  So we reuse the
+            // alpha from the last iteration unless the line search didn't shrink alpha at
+            // all.  In that case, we start with a bigger alpha next time (10 times
+            // larger, but never more than 1).
+            if (alpha == last_alpha)
+                last_alpha = std::min(last_alpha*10,1.0);
+            else
+                last_alpha = alpha;
            // Take the search step indicated by the above line search
            x = clamp(x + alpha*s, x_lower, x_upper);
            g = -der(x);

--- a/dlib/optimization/optimization_line_search.h
+++ b/dlib/optimization/optimization_line_search.h
@@ -183,6 +183,57 @@ namespace dlib
        return put_in_range(0,1,alpha);
    }
+// ----------------------------------------------------------------------------------------
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double x1,
+        double f_x1,
+        double x2,
+        double f_x2
+    )
+    {
+        // Fits the cubic c(x) = a*x^3 + b*x^2 + d0*x + f0 to the value and slope at x==0
+        // and to the values at x1 and x2, then returns a minimizer of c(x) clamped to
+        // the range [0,x2].
+        DLIB_ASSERT(0 < x1 && x1 < x2,"Invalid inputs were given to this function");
+        // The contents of this function follow the equations described on page 58 of the
+        // book Numerical Optimization by Nocedal and Wright, second edition.
+        matrix<double,2,2> m;
+        matrix<double,2,1> v;
+        const double aa2 = x2*x2;
+        const double aa1 = x1*x1;
+        m =  aa2,       -aa1,
+            -aa2*x2, aa1*x1;
+        v = f_x1 - f0 - d0*x1,
+            f_x2 - f0 - d0*x2;
+        const double denom = aa2*aa1*(x1-x2);
+        // Just take a guess if the denominator is unusable.  Besides an exact zero this
+        // also covers values so small that dividing by them would overflow or lose all
+        // precision (the threshold is roughly DBL_MIN/DBL_EPSILON), as well as NaN,
+        // which fails every ordered comparison.
+        const double min_denom = 1e-292;
+        if (!(std::abs(denom) >= min_denom))
+        {
+            return x1/2.0;
+        }
+        matrix<double,2,1> coeffs;
+        coeffs = m*v/denom;
+        const double a = coeffs(0);
+        const double b = coeffs(1);
+        if (a == 0 && b > 0)
+        {
+            // The cubic term vanished and c(x) is a convex quadratic, so its minimizer
+            // is the vertex -d0/(2*b), clamped to the allowed range.
+            return put_in_range(0, x2, -d0/(2*b));
+        }
+        // Discriminant of c'(x) = 3*a*x^2 + 2*b*x + d0.
+        const double disc = b*b - 3*a*d0;
+        if (!(disc >= 0) || a == 0)
+        {
+            // Either c(x) has no stationary point (so it is monotonic), it is a line or
+            // a concave quadratic, or the discriminant is NaN.  In all these cases just
+            // pick the endpoint with the lowest value.
+            if (f0 < f_x2)
+                return 0;
+            else
+                return x2;
+        }
+        // This root of c'(x) is the local minimizer since c''(x) == 2*sqrt(disc) >= 0 there.
+        return put_in_range(0, x2, (-b + std::sqrt(disc))/(3*a));
+    }
 // ----------------------------------------------------------------------------------------
    inline double lagrange_poly_min_extrap (
@@ -447,11 +498,17 @@ namespace dlib
            << "\n\t max_iter: " << max_iter 
        );
-        // If the gradient is telling us we need to search backwards then that is what we
+        // make sure alpha is going in the right direction.  That is, it should be opposite
-        // will do.
+        // the direction of the gradient.
-        if (d0 > 0 && alpha > 0)
+        if ((d0 > 0 && alpha > 0) ||
+            (d0 < 0 && alpha < 0))
+        {
            alpha *= -1;
+        }
+        bool have_prev_alpha = false;
+        double prev_alpha = 0;
+        double prev_val = 0;
        unsigned long iter = 0;
        while (true)
        {
@@ -466,12 +523,26 @@ namespace dlib
                // Interpolate a new alpha.  We also make sure the step by which we
                // reduce alpha is not super small.
                double step;
+                if (!have_prev_alpha)
+                {
                    if (d0 < 0)
-                    step = put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val));
+                        step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, d0, val));
                    else
-                    step = put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val));
+                        step = alpha*put_in_range(0.1,0.9, poly_min_extrap(f0, -d0, val));
+                    have_prev_alpha = true;
+                }
+                else
+                {
+                    if (d0 < 0)
+                        step = put_in_range(0.1*alpha,0.9*alpha, poly_min_extrap(f0, d0, alpha, val, prev_alpha, prev_val));
+                    else
+                        step = put_in_range(0.1*alpha,0.9*alpha, -poly_min_extrap(f0, -d0, -alpha, val, -prev_alpha, prev_val));
+                }
+                prev_alpha = alpha;
+                prev_val = val;
-                alpha *= step;
+                alpha = step;
            }
        }
    }

--- a/dlib/optimization/optimization_line_search_abstract.h
+++ b/dlib/optimization/optimization_line_search_abstract.h
@@ -119,6 +119,28 @@ namespace dlib
            - returns the point in the range [0,1] that minimizes the polynomial c(x) 
    !*/
+// ----------------------------------------------------------------------------------------
+    inline double poly_min_extrap (
+        double f0,
+        double d0,
+        double x1,
+        double f_x1,
+        double x2,
+        double f_x2
+    )
+    /*!
+        requires
+            - 0 < x1 < x2
+        ensures
+            - let f(x) be a 3rd degree polynomial such that:
+                - f(0) == f0
+                - derivative of f(x) at x==0 is d0
+                - f(x1) == f_x1
+                - f(x2) == f_x2
+            - returns the point in the range [0,x2] that minimizes the polynomial f(x) 
+            - f0 and d0 describe f(x) at x==0, while (x1,f_x1) and (x2,f_x2) are two
+              additional sample points the polynomial is required to pass through.
+            - if f(x) has no stationary point then the returned value is whichever of
+              the endpoints 0 or x2 has the smaller function value.
+            - if the interpolation problem is numerically singular, so the polynomial
+              can't be computed, then x1/2 is returned as a fallback guess.
+    !*/
 // ----------------------------------------------------------------------------------------
    inline double lagrange_poly_min_extrap (