Commit 682f6b3d authored by Davis King's avatar Davis King

Added a trust region optimizer.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403919
parent 571aff20
......@@ -7,6 +7,7 @@
#include "optimization/optimization_bobyqa.h"
#include "optimization/optimization_solve_qp_using_smo.h"
#include "optimization/optimization_oca.h"
#include "optimization/optimization_trust_region.h"
#endif // DLIB_OPTIMIZATIOn_HEADER
......
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATION_TRUST_REGIoN_H__
#define DLIB_OPTIMIZATION_TRUST_REGIoN_H__
#include "../matrix.h"
#include "optimization_trust_region_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename EXP1,
typename EXP2,
typename T, long NR, long NC, typename MM, typename L
>
unsigned long solve_trust_region_subproblem (
const matrix_exp<EXP1>& B,
const matrix_exp<EXP2>& g,
const typename EXP1::type radius,
matrix<T,NR,NC,MM,L>& p,
double eps,
unsigned long max_iter
)
{
/*
This is an implementation of algorithm 4.3(Trust Region Subproblem)
from the book Numerical Optimization by Nocedal and Wright. Some of
the details are also from Practical Methods of Optimization by Fletcher.
*/
// make sure requires clause is not broken
DLIB_ASSERT(B.nr() == B.nc() && is_col_vector(g) && g.size() == B.nr() && equal(B,trans(B)),
"\t unsigned long solve_trust_region_subproblem()"
<< "\n\t invalid arguments were given to this function"
<< "\n\t B.nr(): " << B.nr()
<< "\n\t B.nc(): " << B.nc()
<< "\n\t is_col_vector(g): " << is_col_vector(g)
<< "\n\t g.size(): " << g.size()
);
DLIB_ASSERT(radius > 0 && eps > 0 && max_iter > 0,
"\t unsigned long solve_trust_region_subproblem()"
<< "\n\t invalid arguments were given to this function"
<< "\n\t radius: " << radius
<< "\n\t eps: " << eps
<< "\n\t max_iter: " << max_iter
);
const_temp_matrix<EXP1> BB(B);
const_temp_matrix<EXP2> gg(g);
p.set_size(g.nr(),g.nc());
p = 0;
const T numeric_eps = max(abs(BB))*std::numeric_limits<T>::epsilon();
matrix<T,EXP1::NR,EXP2::NR,MM,L> R;
T lambda = 0;
// We need to put a bracket around lambda. It can't go below 0. We
// can get an upper bound using Gershgorin disks.
// This number is a lower bound on the eigenvalues in BB
const T BB_min_eigenvalue = min(diag(BB) - (sum_cols(abs(BB)) - abs(diag(BB))));
const T g_norm = length(gg);
T lambda_min = 0;
T lambda_max = g_norm/radius - BB_min_eigenvalue;
// If we can tell that the minimum is at 0 then don't do anything. Just return the answer.
if (g_norm < numeric_eps && BB_min_eigenvalue > numeric_eps)
{
return 0;
}
// how much lambda has changed recently
T lambda_delta = 0;
for (unsigned long i = 0; i < max_iter; ++i)
{
R = chol(BB + lambda*identity_matrix<T>(BB.nr()));
// if the cholesky decomposition doesn't exist.
if (R(R.nr()-1, R.nc()-1) <= numeric_eps)
{
// If B is indefinite and g is equal to 0 then we should
// quit this loop and go right to the eigenvalue decomposition method.
if (g_norm < numeric_eps)
break;
// narrow the bracket on lambda. Obviously the current lambda is
// too small.
lambda_min = lambda;
// jump towards the max value. Eventually there will
// be a lambda that results in a cholesky decomposition.
const T alpha = 0.10;
lambda = (1-alpha)*lambda + alpha*lambda_max;
continue;
}
p = -gg;
// Solve RR'*p = -g for p.
using namespace blas_bindings;
// Solve R*q = -g for q where q = R'*p.
triangular_solver(CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, R, p);
const T q_norm = length(p);
// Solve R'*p = q for p.
triangular_solver(CblasLeft, CblasLower, CblasTrans, CblasNonUnit, R, p);
const T p_norm = length(p);
// check if we are done.
if (lambda == 0)
{
if (p_norm < radius)
{
// i will always be 0 in this case. So we return 1.
return i+1;
}
}
else
{
// if we are close enough to the solution then terminate
if (std::abs(p_norm - radius)/radius < eps)
return i+1;
}
// shrink our bracket on lambda
if (p_norm < radius)
lambda_max = lambda;
else
lambda_min = lambda;
// This is only going to happen when g is basically 0 and B is indefinite.
if (p_norm < numeric_eps)
{
break;
}
const T old_lambda = lambda;
// figure out which lambda to try next
lambda = lambda + std::pow(q_norm/p_norm,2)*(p_norm - radius)/radius;
// make sure the chosen lambda is within our bracket (but not exactly at either end).
const T gap = (lambda_max-lambda_min)*0.01;
lambda = put_in_range(lambda_min+gap, lambda_max-gap, lambda);
// Keep track of how much lambda is thrashing around inside the search bracket. If it
// keeps moving around a whole lot then cut the search bracket in half.
lambda_delta += std::abs(lambda - old_lambda);
if (lambda_delta > 3*(lambda_max-lambda_min))
{
lambda = (lambda_min+lambda_max)/2;
lambda_delta = 0;
}
} // end for loop
// We are probably in the "hard case". Use an eigenvalue decomposition to sort things out.
// Either that or the eps was just set too tight and really we are already done.
eigenvalue_decomposition<EXP1> ed(BB);
matrix<T,NR,NC,MM,L> ev = ed.get_real_eigenvalues();
const long min_eig_idx = index_of_min(ev);
ev -= min(ev);
// zero out any values which are basically zero
ev = pointwise_multiply(ev, ev > max(abs(ev))*std::numeric_limits<T>::epsilon());
ev = reciprocal(ev);
// figure out part of what p should be assuming we are in the hard case.
matrix<T,NR,NC,MM,L> p_hard;
p_hard = trans(ed.get_pseudo_v())*gg;
p_hard = diagm(ev)*p_hard;
p_hard = ed.get_pseudo_v()*p_hard;
// If we really are in the hard case then this if will be true. Otherwise, the p
// we found in the "easy case" loop up top is the best answer.
if (length(p_hard) < radius && length(p_hard) >= length(p))
{
// adjust the length of p_hard by adding a component along the eigenvector associated with
// the smallest eigenvalue. We want to make it the case that length(p) == radius.
const T tau = std::sqrt(radius*radius - length_squared(p_hard));
p = p_hard + tau*colm(ed.get_pseudo_v(),min_eig_idx);
// if we have to do an eigenvalue decomposition then say we did all the iterations
return max_iter;
}
// if we get this far it means we didn't converge to eps accuracy.
return max_iter+1;
}
// ----------------------------------------------------------------------------------------
template <
typename stop_strategy_type,
typename funct_model
>
double find_min_trust_region (
stop_strategy_type stop_strategy,
const funct_model& model,
typename funct_model::column_vector& x,
double radius = 1
)
{
/*
This is an implementation of algorithm 4.1(Trust Region)
from the book Numerical Optimization by Nocedal and Wright.
*/
// make sure requires clause is not broken
DLIB_ASSERT(is_col_vector(x) && radius > 0,
"\t double find_min_trust_region()"
<< "\n\t invalid arguments were given to this function"
<< "\n\t is_col_vector(x): " << is_col_vector(x)
<< "\n\t radius: " << radius
);
typedef typename funct_model::column_vector T;
typedef typename T::type type;
typedef typename T::mem_manager_type mem_manager_type;
typedef typename T::layout_type layout_type;
typename funct_model::general_matrix h;
typename funct_model::column_vector g, p, d;
type f_value = model(x);
model.get_derivative_and_hessian(x,g,h);
// Sometimes the loop below won't modify x because the trust region step failed.
// This bool tells us when we are in that case.
bool stale_x = false;
while(stale_x || stop_strategy.should_continue_search(x, f_value, g))
{
// Try to scale the trust region to some reasonable elliptic shape
// appropriate for the current problem.
//d = 1,1;
//d = clamp(abs(g),1e-1, 1e1);
d = clamp(diag(abs(h)),1e-1, 1e1);
const unsigned long iter = solve_trust_region_subproblem(inv(diagm(d))*h*inv(diagm(d)),
inv(diagm(d))*g,
radius,
p,
0.1,
20);
// convert back from the spherical trust region to our elliptical one.
p = inv(diagm(d))*p;
const type new_f_value = model(x+p);
const type predicted_improvement = -0.5*trans(p)*h*p - trans(g)*p;
const type measured_improvement = (f_value - new_f_value);
// If the sub-problem can't find a way to improve then stop. This only happens when p is essentially 0.
if (std::abs(predicted_improvement) <= std::abs(measured_improvement)*std::numeric_limits<type>::epsilon())
break;
const type rho = measured_improvement/predicted_improvement;
if (rho < 0.25)
{
radius *= 0.25;
// something has gone horribly wrong if the radius has shrunk to zero. So just
// give up if that happens.
if (radius == 0)
break;
}
else
{
// if rho > 0.75 and we are being checked by the radius
if (rho > 0.75 && iter > 1)
{
radius = std::min<type>(1000, 2*radius);
}
}
if (rho > 0)
{
x = x + p;
f_value = new_f_value;
model.get_derivative_and_hessian(x,g,h);
stale_x = false;
}
else
{
stale_x = true;
}
}
return f_value;
}
// ----------------------------------------------------------------------------------------
template <typename funct_model>
struct negate_tr_model
{
negate_tr_model( const funct_model& m) : model(m) {}
const funct_model& model;
typedef typename funct_model::column_vector column_vector;
typedef typename funct_model::general_matrix general_matrix;
template <typename T>
typename T::type operator() (const T& x) const
{
return -model(x);
}
template <typename T, typename U>
void get_derivative_and_hessian (
const T& x,
T& d,
U& h
) const
{
model.get_derivative_and_hessian(x,d,h);
d = -d;
h = -h;
}
};
// ----------------------------------------------------------------------------------------
template <
typename stop_strategy_type,
typename funct_model
>
double find_max_trust_region (
stop_strategy_type stop_strategy,
const funct_model& model,
typename funct_model::column_vector& x,
double radius = 1
)
{
// make sure requires clause is not broken
DLIB_ASSERT(is_col_vector(x) && radius > 0,
"\t double find_max_trust_region()"
<< "\n\t invalid arguments were given to this function"
<< "\n\t is_col_vector(x): " << is_col_vector(x)
<< "\n\t radius: " << radius
);
return -find_min_trust_region(stop_strategy,
negate_tr_model<funct_model>(model),
x,
radius);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_H__
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATION_TRUST_REGIoN_H__
#define DLIB_OPTIMIZATION_TRUST_REGIoN_H__
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename EXP1,
typename EXP2,
typename T, long NR, long NC, typename MM, typename L
>
unsigned long solve_trust_region_subproblem (
const matrix_exp<EXP1>& B,
const matrix_exp<EXP2>& g,
const typename EXP1::type radius,
matrix<T,NR,NC,MM,L>& p,
double eps,
unsigned long max_iter
);
/*!
requires
- B == trans(B)
(i.e. B should be a symmetric matrix)
- B.nr() == B.nc()
- is_col_vector(g) == true
- g.size() == B.nr()
- p is capable of containing a column vector the size of g
(i.e. p = g; should be a legal expression)
- radius > 0
- eps > 0
- max_iter > 0
ensures
- This function solves the following optimization problem:
Minimize: f(p) == 0.5*trans(p)*B*p + trans(g)*p
subject to the following constraint:
- length(p) <= radius
- returns the number of iterations performed. If this method fails to
converge to eps accuracy then the number returned will be max_iter+1.
- if this function returns 0 or 1 then we are not hitting the radius bound.
Otherwise, the radius constraint is active and std::abs(length(#p)-radius) <= eps.
!*/
// ----------------------------------------------------------------------------------------
class function_model
{
/*!
WHAT THIS OBJECT REPRESENTS
This object defines the interface for a function model
used by the trust-region optimizers defined below.
In particular, this object represents a function f() and
its associated derivative and hessian.
!*/
// Define the type used to represent column vectors
typedef matrix<double,0,1> column_vector;
// Define the type used to represent the hessian matrix
typedef matrix<double> general_matrix;
double operator() (
const column_vector& x
) const;
/*!
ensures
- returns f(x)
(i.e. evaluates this model at the given point and returns the value)
!*/
void get_derivative_and_hessian (
const column_vector& x,
column_vector& d,
general_matrix& h
) const;
/*!
ensures
- #d == the derivative of f() at x
- #h == the hessian matrix of f() at x
- is_col_vector(#d) == true
- #d.size() == x.size()
- #h.nr() == #h.nc() == x.size()
- #h == trans(#h)
!*/
};
// ----------------------------------------------------------------------------------------
template <
typename stop_strategy_type,
typename funct_model
>
double find_min_trust_region (
stop_strategy_type stop_strategy,
const funct_model& model,
typename funct_model::column_vector& x,
double radius = 1
);
/*!
requires
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- is_col_vector(x) == true
- radius > 0
- model must be an object with an interface as defined by the function_model
example object shown above.
ensures
- Performs an unconstrained minimization of the function defined by model
starting from the initial point x. This function uses a trust region
algorithm to perform the minimization. The radius parameter defines
the initial size of the trust region.
- The function is optimized until stop_strategy decides that an acceptable
point has been found or the trust region subproblem fails to make progress.
- #x == the value of x that was found to minimize model()
- returns model(#x).
- When this function makes calls to model.get_derivative_and_hessian() it always
does so by first calling model() and then calling model.get_derivative_and_hessian().
That is, any call to model.get_derivative_and_hessian(val) will always be
preceded by a call to model(val) with the same value. This way you can reuse
any redundant computations performed by model() and model.get_derivative_and_hessian()
as appropriate.
!*/
// ----------------------------------------------------------------------------------------
template <
typename stop_strategy_type,
typename funct_model
>
double find_max_trust_region (
stop_strategy_type stop_strategy,
const funct_model& model,
typename funct_model::column_vector& x,
double radius = 1
);
/*!
requires
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- is_col_vector(x) == true
- radius > 0
- model must be an object with an interface as defined by the function_model
example object shown above.
ensures
- Performs an unconstrained maximization of the function defined by model
starting from the initial point x. This function uses a trust region
algorithm to perform the maximization. The radius parameter defines
the initial size of the trust region.
- The function is optimized until stop_strategy decides that an acceptable
point has been found or the trust region subproblem fails to make progress.
- #x == the value of x that was found to maximize model()
- returns model(#x).
- When this function makes calls to model.get_derivative_and_hessian() it always
does so by first calling model() and then calling model.get_derivative_and_hessian().
That is, any call to model.get_derivative_and_hessian(val) will always be
preceded by a call to model(val) with the same value. This way you can reuse
any redundant computations performed by model() and model.get_derivative_and_hessian()
as appropriate.
- Note that this function solves the maximization problem by converting it
into a minimization problem. Therefore, the values of model() and its derivative
reported to the stopping strategy will be negated. That is, stop_strategy
will see -model() and -derivative. All this really means is that the status
messages from a stopping strategy in verbose mode will display a negated objective
value.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_TRUST_REGIoN_H__
......@@ -83,12 +83,13 @@ set (tests
statistics.cpp
std_vector_c.cpp
string.cpp
svm.cpp
svm_c_linear.cpp
svm.cpp
thread_pool.cpp
threads.cpp
timer.cpp
tokenizer.cpp
trust_region.cpp
tuple.cpp
type_safe_union.cpp
)
......
......@@ -99,6 +99,7 @@ SRC += thread_pool.cpp
SRC += threads.cpp
SRC += timer.cpp
SRC += tokenizer.cpp
SRC += trust_region.cpp
SRC += tuple.cpp
SRC += type_safe_union.cpp
......
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#include <dlib/optimization.h>
#include <sstream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <vector>
#include "../rand.h"
#include "tester.h"
namespace
{
using namespace test;
using namespace dlib;
using namespace std;
logger dlog("test.trust_region");
// ----------------------------------------------------------------------------------------
template <typename T>
T rosen ( const matrix<T,2,1>& m)
{
const T x = m(0);
const T y = m(1);
// compute Rosenbrock's function and return the result
return 100.0*pow(y - x*x,2) + pow(1 - x,2);
}
template <typename T>
const matrix<T,2,1> rosen_derivative ( const matrix<T,2,1>& m)
{
const T x = m(0);
const T y = m(1);
// make us a column vector of length 2
matrix<T,2,1> res(2);
// now compute the gradient vector
res(0) = -400*x*(y-x*x) - 2*(1-x); // derivative of rosen() with respect to x
res(1) = 200*(y-x*x); // derivative of rosen() with respect to y
return res;
}
template <typename T>
const matrix<T,2,2> rosen_hessian ( const matrix<T,2,1>& m)
{
const T x = m(0);
const T y = m(1);
// make us a column vector of length 2
matrix<T,2,2> res;
// now compute the gradient vector
res(0,0) = -400*y + 3*400*x*x + 2;
res(1,1) = 200;
res(0,1) = -400*x;
res(1,0) = -400*x;
return res;
}
// ----------------------------------------------------------------------------------------
template <typename T>
struct rosen_model
{
typedef matrix<T,2,1> column_vector;
typedef matrix<T,2,2> general_matrix;
T operator() ( column_vector x) const
{
return static_cast<T>(rosen(x));
}
void get_derivative_and_hessian (
const column_vector& x,
column_vector& d,
general_matrix& h
) const
{
d = rosen_derivative(x);
h = rosen_hessian(x);
}
};
// ----------------------------------------------------------------------------------------
template <typename T>
struct neg_rosen_model
{
typedef matrix<T,0,1> column_vector;
typedef matrix<T,0,0> general_matrix;
T operator() ( column_vector x) const
{
return -static_cast<T>(rosen<T>(x));
}
void get_derivative_and_hessian (
const column_vector& x,
column_vector& d,
general_matrix& h
) const
{
d = -matrix_cast<T>(rosen_derivative<T>(x));
h = -matrix_cast<T>(rosen_hessian<T>(x));
}
};
// ----------------------------------------------------------------------------------------
dlib::rand::float_1a rnd;
template <typename T>
void test_with_rosen()
{
print_spinner();
matrix<T,2,1> ans;
ans = 1,1;
matrix<T,2,1> p = 100*matrix_cast<T>(randm(2,1,rnd)) - 50;
T obj = find_min_trust_region(objective_delta_stop_strategy(0, 100), rosen_model<T>(), p);
DLIB_TEST_MSG(obj == 0, "obj: " << obj);
DLIB_TEST_MSG(length(p-ans) == 0, "length(p): " << length(p-ans));
matrix<T,0,1> p2 = 100*matrix_cast<T>(randm(2,1,rnd)) - 50;
obj = find_max_trust_region(objective_delta_stop_strategy(0, 100), neg_rosen_model<T>(), p2);
DLIB_TEST_MSG(obj == 0, "obj: " << obj);
DLIB_TEST_MSG(length(p2-ans) == 0, "length(p2): " << length(p2-ans));
}
// ----------------------------------------------------------------------------------------
void test_trust_region_sub_problem()
{
dlog << LINFO << "subproblem test 1";
{
matrix<double,2,2> B;
B = 1, 0,
0, 1;
matrix<double,2,1> g, p, ans;
g = 0;
ans = 0;
solve_trust_region_subproblem(B,g,1,p, 0.001, 10);
DLIB_TEST(length(p-ans) < 1e-10);
solve_trust_region_subproblem(B,g,1,p, 0.001, 1);
DLIB_TEST(length(p-ans) < 1e-10);
}
dlog << LINFO << "subproblem test 2";
{
matrix<double,2,2> B;
B = 1, 0,
0, 1;
B *= 0.1;
matrix<double,2,1> g, p, ans;
g = 1;
ans = -g / length(g);
solve_trust_region_subproblem(B,g,1,p, 1e-6, 20);
DLIB_TEST(length(p-ans) < 1e-4);
}
dlog << LINFO << "subproblem test 3";
{
matrix<double,2,2> B;
B = 0, 0,
0, 0;
matrix<double,2,1> g, p, ans;
g = 1;
ans = -g / length(g);
solve_trust_region_subproblem(B,g,1,p, 1e-6, 20);
dlog << LINFO << "ans: " << trans(ans);
dlog << LINFO << "p: " << trans(p);
DLIB_TEST(length(p-ans) < 1e-4);
}
return;
dlog << LINFO << "subproblem test 4";
{
matrix<double,2,2> B;
B = 2, 0,
0, -1;
matrix<double,2,1> g, p, ans;
g = 0;
ans = 0, -1;
solve_trust_region_subproblem(B,g,1,p, 1e-6, 20);
DLIB_TEST(length(p-ans) < 1e-4);
}
dlog << LINFO << "subproblem test 5";
{
matrix<double,2,2> B;
B = 2, 0,
0, -1;
matrix<double,2,1> g, p, ans;
g = 0, 1;
ans = 0, -1;
solve_trust_region_subproblem(B,g,1,p, 1e-6, 20);
DLIB_TEST(length(p-ans) < 1e-4);
}
dlog << LINFO << "subproblem test 6";
for (int i = 0; i < 10; ++i)
{
matrix<double,10,10> B;
B = randm(10,10, rnd);
B = 0.01*B*trans(B);
matrix<double,10,1> g, p, ans;
g = 1;
solve_trust_region_subproblem(B,g,1,p, 1e-6, 20);
DLIB_TEST(std::abs(length(p) - 1) < 1e-4);
}
}
// ----------------------------------------------------------------------------------------
class optimization_tester : public tester
{
public:
optimization_tester (
) :
tester ("test_trust_region",
"Runs tests on the trust region optimization component.")
{}
void perform_test (
)
{
dlog << LINFO << "test with rosen<float>";
for (int i = 0; i < 50; ++i)
test_with_rosen<float>();
dlog << LINFO << "test with rosen<double>";
for (int i = 0; i < 50; ++i)
test_with_rosen<double>();
test_trust_region_sub_problem();
}
} a;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment