Commit 6eb05731 authored by Davis King

Added a quadratic solver.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403454
parent ec217d6f
dlib/optimization.h

@@ -5,6 +5,7 @@
#include "optimization/optimization.h"
#include "optimization/optimization_bobyqa.h"
#include "optimization/optimization_solve_qp_using_smo.h"
#endif // DLIB_OPTIMIZATIOn_HEADER

dlib/optimization/optimization_solve_qp_using_smo.h

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#include "optimization_solve_qp_using_smo_abstract.h"
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
    /*
        The algorithm defined in the solve_qp_using_smo() function below can be
        derived by using an important theorem from the theory of constrained
        optimization.  This theorem tells us that any optimal point of a
        constrained function must satisfy the KKT conditions (also sometimes
        called the KT conditions, especially in older literature).  A very good
        book to consult regarding this topic is Practical Methods of
        Optimization (second edition) by R. Fletcher.  Below I will try to
        explain the general idea of how this is applied.

        Let e == ones_matrix(alpha.size(),1)

        First, note that the function below solves the following quadratic program.
            Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
            subject to the following constraints:
                - trans(e)*alpha == C (i.e. the sum of alpha values doesn't change)
                - min(alpha) >= 0 (i.e. all alpha values are nonnegative)

        To get from this problem formulation to the algorithm below we have to
        consider the KKT conditions.  They tell us that any solution to the above
        problem must satisfy the following 5 conditions:
            1. trans(e)*alpha == C
            2. min(alpha) >= 0
            3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C)
               Where x is a vector of length alpha.size() and y is a single scalar.
               Then the derivative of L with respect to alpha must == 0.
               So we get the following as our 3rd condition:
                    f'(alpha) - x - y*e == 0
            4. min(x) >= 0 (i.e. all x values are nonnegative)
            5. pointwise_multiply(x, alpha) == 0
               (i.e. at most one member of each x(i) and alpha(i) pair can be non-zero)

        From 3 we can easily obtain this rule:
            for all i: f'(alpha)(i) - x(i) == y
        If we then consider 4 and 5 we see that the following must also be the case:
            - if (alpha(i) > 0) then
                - x(i) == 0
                - f'(alpha)(i) == y
            - else
                - x(i) == some nonnegative number
                - f'(alpha)(i) >= y

        The important thing to take away is the final rule.  It tells us that at
        the optimal solution all elements of the gradient of f have the same
        value, y, if their corresponding alpha is non-zero.  It also tells us
        that all the other gradient values are greater than or equal to y.  So
        using this rule we can easily check to see if the algorithm has found
        the optimal point.
    */
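
    /*
        As a concrete illustration of the stopping rule above (an editorial
        example, not part of the original derivation): take
            Q = identity_matrix<double>(2),  b = [3, 1],  C = 2.
        Then f(alpha) = 0.5*(alpha(0)^2 + alpha(1)^2) - 3*alpha(0) - alpha(1)
        and f'(alpha) = alpha - b.  The optimum is alpha = [2, 0]:
            - f'(alpha)(0) == 2 - 3 == -1 and alpha(0) > 0, so y == -1.
            - f'(alpha)(1) == 0 - 1 == -1 >= y, and alpha(1) == 0, as required.
        Both KKT rules hold, so no pair of variables can be improved and the
        main loop below would terminate immediately for this alpha.
    */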
// ----------------------------------------------------------------------------------------
    template <
        typename EXP1,
        typename EXP2,
        typename T, long NR, long NC, typename MM, typename L
        >
    void solve_qp_using_smo (
        const matrix_exp<EXP1>& Q,
        const matrix_exp<EXP2>& b,
        matrix<T,NR,NC,MM,L>& alpha,
        T eps
    )
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(Q.nr() == Q.nc() &&
                    is_col_vector(b) &&
                    is_col_vector(alpha) &&
                    b.size() == alpha.size() &&
                    b.size() == Q.nr() &&
                    min(alpha) >= 0 &&
                    eps > 0,
            "\t void solve_qp_using_smo()"
            << "\n\t Invalid arguments were given to this function"
            << "\n\t Q.nr():               " << Q.nr()
            << "\n\t Q.nc():               " << Q.nc()
            << "\n\t is_col_vector(b):     " << is_col_vector(b)
            << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
            << "\n\t b.size():             " << b.size()
            << "\n\t alpha.size():         " << alpha.size()
            << "\n\t min(alpha):           " << min(alpha)
            << "\n\t eps:                  " << eps
            );

        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
        matrix<T,NR,NC,MM,L> df = Q*alpha - b;

        const T tau = 1000*std::numeric_limits<T>::epsilon();

        while (true)
        {
            // Find the two elements of df that satisfy the following:
            //     - small_idx == index_of_min(df)
            //     - big_idx == the index of the largest element in df such that alpha(big_idx) > 0
            // These two indices will tell us which two alpha values are most in violation of the KKT
            // optimality conditions.
            T big = -std::numeric_limits<T>::max();
            long big_idx = 0;
            T small = std::numeric_limits<T>::max();
            long small_idx = 0;
            for (long i = 0; i < df.nr(); ++i)
            {
                if (df(i) > big && alpha(i) > 0)
                {
                    big = df(i);
                    big_idx = i;
                }

                if (df(i) < small)
                {
                    small = df(i);
                    small_idx = i;
                }
            }

            // Check if the KKT conditions are still violated.  Note that this rule isn't
            // one of the KKT conditions itself but is derived from them.  See the top of
            // this file for a derivation of this stopping rule.
            if (alpha(small_idx) > 0 && (big - small) < eps)
                break;

            // Save these values, we will need them later.
            const T old_alpha_big = alpha(big_idx);
            const T old_alpha_small = alpha(small_idx);

            // Now optimize the two variables we just picked.
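            // (Editorial note on this step: moving weight between just these two
            // variables keeps sum(alpha) fixed.  Along the direction that adds t to
            // alpha(small_idx) and subtracts t from alpha(big_idx), f is a 1-D
            // quadratic with slope (small - big) and curvature
            // Q(big,big) + Q(small,small) - 2*Q(big,small) at t == 0, so the exact
            // minimizer along this direction is t == (big - small)/quad_coef.
            // Clamping quad_coef at tau keeps the division well-defined when Q is
            // singular or nearly so.)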
            T quad_coef = Q(big_idx,big_idx) + Q(small_idx,small_idx) - 2*Q(big_idx, small_idx);
            if (quad_coef <= tau)
                quad_coef = tau;
            const T delta = (big - small)/quad_coef;
            alpha(big_idx) -= delta;
            alpha(small_idx) += delta;

            // Make sure alpha stays feasible.  That is, make sure the updated alpha doesn't
            // violate the non-negativity constraint.
            if (alpha(big_idx) < 0)
            {
                // Since an alpha can't be negative we will just set it to 0 and shift all the
                // weight to the other alpha.
                alpha(big_idx) = 0;
                alpha(small_idx) = old_alpha_big + old_alpha_small;
            }

            // Now update the gradient.  We will perform the equivalent of: df = Q*alpha - b;
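            // (Editorial note: only two elements of alpha changed, so instead of the
            // full O(n^2) product Q*alpha we can apply a rank-2 correction to df in
            // O(n) time.  Indexing Q(big_idx,k) rather than Q(k,big_idx) is fine here
            // because the Q of a well-posed QP is symmetric.)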
            const T delta_alpha_big   = alpha(big_idx) - old_alpha_big;
            const T delta_alpha_small = alpha(small_idx) - old_alpha_small;

            for (long k = 0; k < df.nr(); ++k)
                df(k) += Q(big_idx,k)*delta_alpha_big + Q(small_idx,k)*delta_alpha_small;
        }
    }
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__

dlib/optimization/optimization_solve_qp_using_smo_abstract.h

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__
#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
    template <
        typename EXP1,
        typename EXP2,
        typename T, long NR, long NC, typename MM, typename L
        >
    void solve_qp_using_smo (
        const matrix_exp<EXP1>& Q,
        const matrix_exp<EXP2>& b,
        matrix<T,NR,NC,MM,L>& alpha,
        T eps
    );
    /*!
        requires
            - Q.nr() == Q.nc()
            - is_col_vector(b) == true
            - is_col_vector(alpha) == true
            - b.size() == alpha.size() == Q.nr()
            - min(alpha) >= 0
            - eps > 0
        ensures
            - Let C == sum(alpha) (i.e. C is the sum of the alpha values you
              supply to this function)
            - This function solves the following quadratic program:
                Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
                subject to the following constraints:
                    - sum(alpha) == C (i.e. the sum of alpha values doesn't change)
                    - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
            - The solution to the above QP will be stored in #alpha.
            - This function uses a simple implementation of the sequential minimal
              optimization algorithm.  It starts the algorithm with the given alpha
              and it works on the problem until the KKT violation is less than eps.
              So eps controls how accurate the solution is and smaller values result
              in better solutions.
    !*/
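
    /*
        Example (an editorial sketch, not part of the original header): solve
        the two-variable QP used in the derivation comments of
        optimization_solve_qp_using_smo.h.

            matrix<double> Q = identity_matrix<double>(2);
            matrix<double,0,1> b(2), alpha(2);
            b = 3, 1;
            alpha = 1, 1;   // feasible start; this fixes C == sum(alpha) == 2
            solve_qp_using_smo(Q, b, alpha, 1e-10);
            // alpha is now approximately [2, 0]

        Note that the starting alpha matters: it must be nonnegative and its
        sum defines the constraint constant C.
    */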
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__

dlib/test/CMakeLists.txt

@@ -57,6 +57,7 @@ set (tests
   metaprogramming.cpp
   multithreaded_object.cpp
   optimization.cpp
   opt_qp_solver.cpp
   pipe.cpp
   pixel.cpp
   queue.cpp

dlib/test/makefile

@@ -67,6 +67,7 @@ SRC += member_function_pointer.cpp
SRC += metaprogramming.cpp
SRC += multithreaded_object.cpp
SRC += optimization.cpp
SRC += opt_qp_solver.cpp
SRC += pipe.cpp
SRC += pixel.cpp
SRC += queue.cpp

dlib/test/opt_qp_solver.cpp

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#include <dlib/optimization.h>
#include <sstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <dlib/statistics.h>
#include "tester.h"
namespace
{
    using namespace test;
    using namespace dlib;
    using namespace std;

    logger dlog("test.opt_qp_solver");
// ----------------------------------------------------------------------------------------
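
    /*
        (Editorial note: this functor exists so the same QP can be handed to the
        general-purpose BOBYQA solver, which only supports simple box constraints.
        The sum(alpha) == C equality constraint is therefore folded into the
        objective as a quadratic penalty term, and negative alphas are penalized
        as well, although the box constraints given to BOBYQA already rule
        them out.)
    */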
    class test_smo
    {
    public:
        double penalty;
        double C;

        double operator() (
            const matrix<double,0,1>& alpha
        ) const
        {
            double obj = 0.5*trans(alpha)*Q*alpha - trans(alpha)*b;
            double c1 = pow(sum(alpha)-C,2);
            double c2 = sum(pow(pointwise_multiply(alpha, alpha<0), 2));

            obj += penalty*(c1 + c2);

            return obj;
        }

        matrix<double> Q, b;
    };
// ----------------------------------------------------------------------------------------
    class opt_qp_solver_tester : public tester
    {
        /*
            The idea here is just to solve the same problem with two different
            methods and check that they basically agree.  The SMO solver should be
            very accurate but for this problem the BOBYQA solver is relatively
            inaccurate.  So this test is really just a sanity check on the SMO
            solver.
        */
    public:
        opt_qp_solver_tester (
        ) :
            tester ("test_opt_qp_solver",
                    "Runs tests on the solve_qp_using_smo component.")
        {
            thetime = time(0);
        }

        time_t thetime;
        dlib::rand::float_1a rnd;
        void perform_test(
        )
        {
            ++thetime;
            typedef matrix<double,0,1> sample_type;

            dlog << LINFO << "time seed: " << thetime;
            rnd.set_seed(cast_to_string(thetime));

            running_stats<double> rs;

            for (int i = 0; i < 40; ++i)
            {
                for (long dims = 2; dims < 6; ++dims)
                {
                    rs.add(do_the_test(dims, 1.0));
                }
            }

            for (int i = 0; i < 40; ++i)
            {
                for (long dims = 2; dims < 6; ++dims)
                {
                    rs.add(do_the_test(dims, 5.0));
                }
            }

            dlog << LINFO << "disagreement mean: " << rs.mean();
            dlog << LINFO << "disagreement stddev: " << rs.stddev();
            DLIB_TEST_MSG(rs.mean() < 0.001, rs.mean());
            DLIB_TEST_MSG(rs.stddev() < 0.01, rs.stddev());
        }
        double do_the_test (
            const long dims,
            double C
        )
        {
            print_spinner();
            dlog << LINFO << "dims: " << dims;
            dlog << LINFO << "testing with C == " << C;

            test_smo test;
            test.Q = randm(dims, dims, rnd);
            test.Q = trans(test.Q)*test.Q;
            test.b = randm(dims,1, rnd);
            test.C = C;

            matrix<double,0,1> x(dims), lower(dims), upper(dims), alpha(dims);

            lower = 0;
            upper = C;
            test.penalty = 1000;
            alpha = C/alpha.size();
            x = alpha;

            solve_qp_using_smo(test.Q, test.b, alpha, 0.00001);
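
            // (Editorial note: each SMO update moves weight between two alphas by
            // equal and opposite amounts, so sum(alpha) is preserved throughout,
            // up to floating point rounding.  That invariant is what the tight
            // 1e-13 tolerance below is checking.)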
            DLIB_TEST_MSG(abs(sum(alpha) - C) < 1e-13, abs(sum(alpha) - C));
            dlog << LTRACE << "alpha: " << alpha;
            dlog << LINFO << "SMO: true objective: " << 0.5*trans(alpha)*test.Q*alpha - trans(alpha)*test.b;

            double obj = find_min_bobyqa(test, x, 2*x.size()+1,
                                         lower,
                                         upper,
                                         C/3.0,
                                         1e-6,
                                         10000);

            dlog << LINFO << "BOBYQA: objective: " << obj;
            dlog << LINFO << "BOBYQA: true objective: " << 0.5*trans(x)*test.Q*x - trans(x)*test.b;
            dlog << LINFO << "sum(x): " << sum(x);
            dlog << LINFO << x;

            double disagreement = max(abs(x - alpha));
            dlog << LINFO << "Disagreement: " << disagreement;

            return disagreement;
        }

    } a;

}