Switched the QP solver from using KKT violation as a stopping

condition to using the duality gap. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403501

Switched the QP solver from using KKT violation as a stopping
condition to using the duality gap. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403501
3d118c63 · Davis King · d3bb4087 · 3d118c63 · 3d118c63
Commit 3d118c63 authored Feb 27, 2010 by Davis King
Showing with 46 additions and 14 deletions

optimization_solve_qp_using_smo.h dlib/optimization/optimization_solve_qp_using_smo.h +43 -11

optimization_solve_qp_using_smo_abstract.h dlib/optimization/optimization_solve_qp_using_smo_abstract.h +3 -3

No files found.
--- a/dlib/optimization/optimization_solve_qp_using_smo.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -63,8 +63,8 @@ namespace dlib
        The important thing to take away is the final rule.  It tells us that at the
        optimal solution all elements of the gradient of f have the same value if 
        their corresponding alpha is non-zero.  It also tells us that all the other
-        gradient values are bigger than y.  So using this rule we can easily check
+        gradient values are bigger than y.  We can use this information to help us
-        to see if the algorithm has found the optimal point. 
+        pick which alpha variables to optimize at each iteration. 
    */
 // ----------------------------------------------------------------------------------------
@@ -106,11 +106,17 @@ namespace dlib
                     << "\n\t max_iter:             " << max_iter 
        );
+        const T C = sum(alpha);
        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.  
        matrix<T,NR,NC,MM,L> df = Q*alpha - b;
+        // This variable should always contain the current value of trans(alpha)*df 
+        T alpha_df = trans(alpha)*df;
        const T tau = 1000*std::numeric_limits<T>::epsilon();
+        T big, little;
        unsigned long iter = 0;
        for (; iter < max_iter; ++iter)
        {
@@ -119,9 +125,9 @@ namespace dlib
            //    - big_idx   == the index of the largest element in df such that alpha(big_idx) > 0
            // These two indices will tell us which two alpha values are most in violation of the KKT 
            // optimality conditions.  
-            T big = -std::numeric_limits<T>::max();
+            big = -std::numeric_limits<T>::max();
            long big_idx = 0;
-            T little = std::numeric_limits<T>::max();
+            little = std::numeric_limits<T>::max();
            long little_idx = 0;
            for (long i = 0; i < df.nr(); ++i)
            {
@@ -137,10 +143,17 @@ namespace dlib
                }
            }
-            // Check if the KKT conditions are still violated.  Note that this rule isn't
+            // Check if the KKT conditions are still violated and stop if so.  
-            // one of the KKT conditions itself but is derived from them.  See the top of
+            //if (alpha(little_idx) > 0 && (big - little) < eps)
-            // this file for a derivation of this stopping rule.
+            //    break;
-            if (alpha(little_idx) > 0 && (big - little) < eps)
+            // Check how big the duality gap is and stop when it goes below eps.  
+            // The duality gap is the gap between the objective value of the function
+            // we are optimizing and the value of it's primal form.  This value is always 
+            // greater than or equal to the distance to the optimum solution so it is a 
+            // good way to decide if we should stop.   See the book referenced above for 
+            // more information.  In particular, see the part about the Wolfe Dual.
+            if (alpha_df - C*little < eps)
                break;
@@ -167,13 +180,15 @@ namespace dlib
                alpha(little_idx) = old_alpha_big + old_alpha_little;
            }
-            // Every 100 iterations
+            // Every 300 iterations
-            if ((iter%100) == 99)
+            if ((iter%300) == 299)
            {
                // Perform this form of the update every so often because doing so can help
                // avoid the buildup of numerical errors you get with the alternate update
                // below.
                df = Q*alpha - b;
+                alpha_df = trans(alpha)*df;
            }
            else
            {
@@ -181,11 +196,28 @@ namespace dlib
                const T delta_alpha_big   = alpha(big_idx) - old_alpha_big;
                const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
+                alpha_df += delta_alpha_big*df(big_idx) + delta_alpha_little*df(little_idx);
                for(long k = 0; k < df.nr(); ++k)
-                    df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;
+                {
+                    T temp = Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;
+                    df(k) += temp;
+                    alpha_df += alpha(k)*temp;
+                }
            }
        }
+        /*
+        using namespace std;
+        cout << "SMO: " << endl;
+        cout << "   duality gap: "<< trans(alpha)*df - C*min(df) << endl;
+        cout << "   KKT gap:     "<< big-little << endl;
+        cout << "   iter:        "<< iter+1 << endl;
+        cout << "   eps:         "<< eps << endl;
+        */
        return iter+1;
    }

--- a/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
@@ -43,9 +43,9 @@ namespace dlib
            - The solution to the above QP will be stored in #alpha.
            - This function uses a simple implementation of the sequential minimal
              optimization algorithm.  It starts the algorithm with the given alpha
-              and it works on the problem until the KKT violation is less than eps.  
+              and it works on the problem until the duality gap (i.e. how far away
-              So eps controls how accurate the solution is and smaller values result 
+              we are from the optimum solution) is less than eps.  So eps controls 
-              in better solutions.
+              how accurate the solution is and smaller values result in better solutions.
            - At most max_iter iterations of optimization will be performed.  
            - returns the number of iterations performed.  If this method fails to
              converge to eps accuracy then the number returned will be max_iter+1.