Commit 6eb05731 authored by Davis King

Added a quadratic solver.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403454
parent ec217d6f
dlib/optimization.h

@@ -5,6 +5,7 @@
#include "optimization/optimization.h"
#include "optimization/optimization_bobyqa.h"
#include "optimization/optimization_solve_qp_using_smo.h"
#endif // DLIB_OPTIMIZATIOn_HEADER

dlib/optimization/optimization_solve_qp_using_smo.h

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#include "optimization_solve_qp_using_smo_abstract.h"
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
    /*
        The algorithm defined in the solve_qp_using_smo() function below can be
        derived by using an important theorem from the theory of constrained
        optimization.  This theorem tells us that any optimal point of a
        constrained function must satisfy the KKT conditions (also sometimes
        called the KT conditions, especially in older literature).  A very good
        book to consult regarding this topic is Practical Methods of
        Optimization (second edition) by R. Fletcher.  Below I will try to
        explain the general idea of how this is applied.

        Let e == ones_matrix(alpha.size(),1)

        First, note that the function below solves the following quadratic program.
            Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
            subject to the following constraints:
                - trans(e)*alpha == C (i.e. the sum of alpha values doesn't change)
                - min(alpha) >= 0 (i.e. all alpha values are nonnegative)

        To get from this problem formulation to the algorithm below we have to
        consider the KKT conditions.  They tell us that any solution to the above
        problem must satisfy the following 5 conditions:
            1. trans(e)*alpha == C
            2. min(alpha) >= 0
            3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C)
               Where x is a vector of length alpha.size() and y is a single scalar.
               Then the derivative of L with respect to alpha must == 0.
               So we get the following as our 3rd condition:
                    f'(alpha) - x - y*e == 0
            4. min(x) >= 0 (i.e. all x values are nonnegative)
            5. pointwise_multiply(x, alpha) == 0
               (i.e. at most one member of each x(i) and alpha(i) pair can be non-zero)

        From 3 we can easily obtain this rule:
            for all i: f'(alpha)(i) - x(i) == y
        If we then consider 4 and 5 we see that the following must also be the case:
            - if (alpha(i) > 0) then
                - x(i) == 0
                - f'(alpha)(i) == y
            - else
                - x(i) == some nonnegative number
                - f'(alpha)(i) >= y

        The important thing to take away is the final rule.  It tells us that at
        the optimal solution all elements of the gradient of f have the same
        value, y, if their corresponding alpha is non-zero.  It also tells us
        that all the other gradient values are greater than or equal to y.  So
        using this rule we can easily check to see if the algorithm has found
        the optimal point.
    */
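
    /*
        As a concrete illustration of the stopping rule above (an editorial
        example, not part of the original derivation): take
            Q = identity_matrix<double>(2),  b = [3, 1],  C = 2.
        Then f(alpha) = 0.5*(alpha(0)^2 + alpha(1)^2) - 3*alpha(0) - alpha(1)
        and f'(alpha) = alpha - b.  The optimum is alpha = [2, 0]:
            - f'(alpha)(0) == 2 - 3 == -1 and alpha(0) > 0, so y == -1.
            - f'(alpha)(1) == 0 - 1 == -1 >= y, and alpha(1) == 0, as required.
        Both KKT rules hold, so no pair of variables can be improved and the
        main loop below would terminate immediately for this alpha.
    */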
// ----------------------------------------------------------------------------------------
    template <
        typename EXP1,
        typename EXP2,
        typename T, long NR, long NC, typename MM, typename L
        >
    void solve_qp_using_smo (
        const matrix_exp<EXP1>& Q,
        const matrix_exp<EXP2>& b,
        matrix<T,NR,NC,MM,L>& alpha,
        T eps
    )
    {
        // make sure requires clause is not broken
        DLIB_ASSERT(Q.nr() == Q.nc() &&
                    is_col_vector(b) &&
                    is_col_vector(alpha) &&
                    b.size() == alpha.size() &&
                    b.size() == Q.nr() &&
                    min(alpha) >= 0 &&
                    eps > 0,
            "\t void solve_qp_using_smo()"
            << "\n\t Invalid arguments were given to this function"
            << "\n\t Q.nr():               " << Q.nr()
            << "\n\t Q.nc():               " << Q.nc()
            << "\n\t is_col_vector(b):     " << is_col_vector(b)
            << "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
            << "\n\t b.size():             " << b.size()
            << "\n\t alpha.size():         " << alpha.size()
            << "\n\t min(alpha):           " << min(alpha)
            << "\n\t eps:                  " << eps
            );

        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
        matrix<T,NR,NC,MM,L> df = Q*alpha - b;

        const T tau = 1000*std::numeric_limits<T>::epsilon();

        while (true)
        {
            // Find the two elements of df that satisfy the following:
            //     - small_idx == index_of_min(df)
            //     - big_idx == the index of the largest element in df such that alpha(big_idx) > 0
            // These two indices will tell us which two alpha values are most in violation of the KKT
            // optimality conditions.
            T big = -std::numeric_limits<T>::max();
            long big_idx = 0;
            T small = std::numeric_limits<T>::max();
            long small_idx = 0;
            for (long i = 0; i < df.nr(); ++i)
            {
                if (df(i) > big && alpha(i) > 0)
                {
                    big = df(i);
                    big_idx = i;
                }

                if (df(i) < small)
                {
                    small = df(i);
                    small_idx = i;
                }
            }

            // Check if the KKT conditions are still violated.  Note that this rule isn't
            // one of the KKT conditions itself but is derived from them.  See the top of
            // this file for a derivation of this stopping rule.
            if (alpha(small_idx) > 0 && (big - small) < eps)
                break;

            // Save these values, we will need them later.
            const T old_alpha_big = alpha(big_idx);
            const T old_alpha_small = alpha(small_idx);

            // Now optimize the two variables we just picked.
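            // (Editorial note on this step: moving weight between just these two
            // variables keeps sum(alpha) fixed.  Along the direction that adds t to
            // alpha(small_idx) and subtracts t from alpha(big_idx), f is a 1-D
            // quadratic with slope (small - big) and curvature
            // Q(big,big) + Q(small,small) - 2*Q(big,small) at t == 0, so the exact
            // minimizer along this direction is t == (big - small)/quad_coef.
            // Clamping quad_coef at tau keeps the division well-defined when Q is
            // singular or nearly so.)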
            T quad_coef = Q(big_idx,big_idx) + Q(small_idx,small_idx) - 2*Q(big_idx, small_idx);
            if (quad_coef <= tau)
                quad_coef = tau;
            const T delta = (big - small)/quad_coef;
            alpha(big_idx) -= delta;
            alpha(small_idx) += delta;

            // Make sure alpha stays feasible.  That is, make sure the updated alpha doesn't
            // violate the non-negativity constraint.
            if (alpha(big_idx) < 0)
            {
                // Since an alpha can't be negative we will just set it to 0 and shift all the
                // weight to the other alpha.
                alpha(big_idx) = 0;
                alpha(small_idx) = old_alpha_big + old_alpha_small;
            }

            // Now update the gradient.  We will perform the equivalent of: df = Q*alpha - b;
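            // (Editorial note: only two elements of alpha changed, so instead of the
            // full O(n^2) product Q*alpha we can apply a rank-2 correction to df in
            // O(n) time.  Indexing Q(big_idx,k) rather than Q(k,big_idx) is fine here
            // because the Q of a well-posed QP is symmetric.)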
            const T delta_alpha_big   = alpha(big_idx) - old_alpha_big;
            const T delta_alpha_small = alpha(small_idx) - old_alpha_small;

            for (long k = 0; k < df.nr(); ++k)
                df(k) += Q(big_idx,k)*delta_alpha_big + Q(small_idx,k)*delta_alpha_small;
        }
    }
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__

dlib/optimization/optimization_solve_qp_using_smo_abstract.h

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__
#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
    template <
        typename EXP1,
        typename EXP2,
        typename T, long NR, long NC, typename MM, typename L
        >
    void solve_qp_using_smo (
        const matrix_exp<EXP1>& Q,
        const matrix_exp<EXP2>& b,
        matrix<T,NR,NC,MM,L>& alpha,
        T eps
    );
    /*!
        requires
            - Q.nr() == Q.nc()
            - is_col_vector(b) == true
            - is_col_vector(alpha) == true
            - b.size() == alpha.size() == Q.nr()
            - min(alpha) >= 0
            - eps > 0
        ensures
            - Let C == sum(alpha) (i.e. C is the sum of the alpha values you
              supply to this function)
            - This function solves the following quadratic program:
                Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
                subject to the following constraints:
                    - sum(alpha) == C (i.e. the sum of alpha values doesn't change)
                    - min(alpha) >= 0 (i.e. all alpha values are nonnegative)
            - The solution to the above QP will be stored in #alpha.
            - This function uses a simple implementation of the sequential minimal
              optimization algorithm.  It starts the algorithm with the given alpha
              and it works on the problem until the KKT violation is less than eps.
              So eps controls how accurate the solution is and smaller values result
              in better solutions.
    !*/
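
    /*
        Example (an editorial sketch, not part of the original header): solve
        the two-variable QP used in the derivation comments of
        optimization_solve_qp_using_smo.h.

            matrix<double> Q = identity_matrix<double>(2);
            matrix<double,0,1> b(2), alpha(2);
            b = 3, 1;
            alpha = 1, 1;   // feasible start; this fixes C == sum(alpha) == 2
            solve_qp_using_smo(Q, b, alpha, 1e-10);
            // alpha is now approximately [2, 0]

        Note that the starting alpha matters: it must be nonnegative and its
        sum defines the constraint constant C.
    */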
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_H__

dlib/test/CMakeLists.txt

@@ -57,6 +57,7 @@ set (tests
   metaprogramming.cpp
   multithreaded_object.cpp
   optimization.cpp
   opt_qp_solver.cpp
   pipe.cpp
   pixel.cpp
   queue.cpp

dlib/test/makefile

@@ -67,6 +67,7 @@ SRC += member_function_pointer.cpp
SRC += metaprogramming.cpp
SRC += multithreaded_object.cpp
SRC += optimization.cpp
SRC += opt_qp_solver.cpp
SRC += pipe.cpp
SRC += pixel.cpp
SRC += queue.cpp

dlib/test/opt_qp_solver.cpp

// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#include <dlib/optimization.h>
#include <sstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <vector>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <dlib/statistics.h>
#include "tester.h"
namespace
{
    using namespace test;
    using namespace dlib;
    using namespace std;

    logger dlog("test.opt_qp_solver");
// ----------------------------------------------------------------------------------------
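
    /*
        (Editorial note: this functor exists so the same QP can be handed to the
        general-purpose BOBYQA solver, which only supports simple box constraints.
        The sum(alpha) == C equality constraint is therefore folded into the
        objective as a quadratic penalty term, and negative alphas are penalized
        as well, although the box constraints given to BOBYQA already rule
        them out.)
    */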
    class test_smo
    {
    public:
        double penalty;
        double C;

        double operator() (
            const matrix<double,0,1>& alpha
        ) const
        {
            double obj = 0.5*trans(alpha)*Q*alpha - trans(alpha)*b;
            double c1 = pow(sum(alpha)-C,2);
            double c2 = sum(pow(pointwise_multiply(alpha, alpha<0), 2));

            obj += penalty*(c1 + c2);

            return obj;
        }

        matrix<double> Q, b;
    };
// ----------------------------------------------------------------------------------------
    class opt_qp_solver_tester : public tester
    {
        /*
            The idea here is just to solve the same problem with two different
            methods and check that they basically agree.  The SMO solver should be
            very accurate but for this problem the BOBYQA solver is relatively
            inaccurate.  So this test is really just a sanity check on the SMO
            solver.
        */
    public:
        opt_qp_solver_tester (
        ) :
            tester ("test_opt_qp_solver",
                    "Runs tests on the solve_qp_using_smo component.")
        {
            thetime = time(0);
        }

        time_t thetime;
        dlib::rand::float_1a rnd;
        void perform_test(
        )
        {
            ++thetime;
            typedef matrix<double,0,1> sample_type;

            dlog << LINFO << "time seed: " << thetime;
            rnd.set_seed(cast_to_string(thetime));

            running_stats<double> rs;

            for (int i = 0; i < 40; ++i)
            {
                for (long dims = 2; dims < 6; ++dims)
                {
                    rs.add(do_the_test(dims, 1.0));
                }
            }

            for (int i = 0; i < 40; ++i)
            {
                for (long dims = 2; dims < 6; ++dims)
                {
                    rs.add(do_the_test(dims, 5.0));
                }
            }

            dlog << LINFO << "disagreement mean: " << rs.mean();
            dlog << LINFO << "disagreement stddev: " << rs.stddev();
            DLIB_TEST_MSG(rs.mean() < 0.001, rs.mean());
            DLIB_TEST_MSG(rs.stddev() < 0.01, rs.stddev());
        }
        double do_the_test (
            const long dims,
            double C
        )
        {
            print_spinner();
            dlog << LINFO << "dims: " << dims;
            dlog << LINFO << "testing with C == " << C;

            test_smo test;
            test.Q = randm(dims, dims, rnd);
            test.Q = trans(test.Q)*test.Q;
            test.b = randm(dims,1, rnd);
            test.C = C;

            matrix<double,0,1> x(dims), lower(dims), upper(dims), alpha(dims);

            lower = 0;
            upper = C;
            test.penalty = 1000;
            alpha = C/alpha.size();
            x = alpha;

            solve_qp_using_smo(test.Q, test.b, alpha, 0.00001);
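
            // (Editorial note: each SMO update moves weight between two alphas by
            // equal and opposite amounts, so sum(alpha) is preserved throughout,
            // up to floating point rounding.  That invariant is what the tight
            // 1e-13 tolerance below is checking.)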
            DLIB_TEST_MSG(abs(sum(alpha) - C) < 1e-13, abs(sum(alpha) - C));
            dlog << LTRACE << "alpha: " << alpha;
            dlog << LINFO << "SMO: true objective: " << 0.5*trans(alpha)*test.Q*alpha - trans(alpha)*test.b;

            double obj = find_min_bobyqa(test, x, 2*x.size()+1,
                                         lower,
                                         upper,
                                         C/3.0,
                                         1e-6,
                                         10000);

            dlog << LINFO << "BOBYQA: objective: " << obj;
            dlog << LINFO << "BOBYQA: true objective: " << 0.5*trans(x)*test.Q*x - trans(x)*test.b;
            dlog << LINFO << "sum(x): " << sum(x);
            dlog << LINFO << x;

            double disagreement = max(abs(x - alpha));
            dlog << LINFO << "Disagreement: " << disagreement;

            return disagreement;
        }

    } a;

}