Commit 9fb35897 authored by Davis King's avatar Davis King

Split the code up into multiple files and setup the abstracts for each of them.

--HG--
rename : dlib/optimization/optimization.h => dlib/optimization/optimization_line_search.h
rename : dlib/optimization/optimization_abstract.h => dlib/optimization/optimization_line_search_abstract.h
rename : dlib/optimization/optimization.h => dlib/optimization/optimization_search_strategies.h
rename : dlib/optimization/optimization_abstract.h => dlib/optimization/optimization_search_strategies_abstract.h
rename : dlib/optimization/optimization.h => dlib/optimization/optimization_stop_strategies.h
rename : dlib/optimization/optimization_abstract.h => dlib/optimization/optimization_stop_strategies_abstract.h
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403190
parent 75b3839a
...@@ -5,329 +5,14 @@ ...@@ -5,329 +5,14 @@
#include <cmath> #include <cmath>
#include <limits> #include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_abstract.h" #include "optimization_abstract.h"
#include "../sequence.h" #include "optimization_search_strategies.h"
#include "optimization_stop_strategies.h"
#include "optimization_line_search.h"
namespace dlib namespace dlib
{ {
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class objective_delta_stop_strategy
{
public:
objective_delta_stop_strategy (
double min_delta = 1e-7
) : _been_used(false), _min_delta(min_delta), _max_iter(0), _cur_iter(0), _prev_funct_value(0) {}
objective_delta_stop_strategy (
double min_delta,
unsigned long max_iter
) : _been_used(false), _min_delta(min_delta), _max_iter(max_iter), _cur_iter(0), _prev_funct_value(0) {}
template <typename T>
bool should_continue_search (
const T& ,
const double funct_value,
const T&
)
{
++_cur_iter;
if (_been_used)
{
// Check if we have hit the max allowable number of iterations. (but only
// check if _max_iter is enabled (i.e. not 0)).
if (_max_iter != 0 && _cur_iter > _max_iter)
return false;
// check if the function change was too small
if (std::abs(funct_value - _prev_funct_value) < _min_delta)
return false;
}
_been_used = true;
_prev_funct_value = funct_value;
return true;
}
private:
bool _been_used;
double _min_delta;
unsigned long _max_iter;
unsigned long _cur_iter;
double _prev_funct_value;
};
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class cg_search_strategy
{
public:
cg_search_strategy() : been_used(false) {}
double get_wolfe_rho (
) const { return 0.001; }
double get_wolfe_sigma (
) const { return 0.01; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& ,
const double ,
const T& funct_derivative
)
/*!
requires
- for some function f():
- funct_value == f(x)
- funct_derivative == derivative(f)(x)
ensures
- returns the next search direction (from the point x). This
direction is chosen using the Polak-Ribiere conjugate gradient
method.
!*/
{
if (been_used == false)
{
been_used = true;
prev_direction = -funct_derivative;
}
else
{
// Use the Polak-Ribiere (4.1.12) conjugate gradient described by Fletcher on page 83
const double temp = trans(prev_derivative)*prev_derivative;
// If this value hits zero then just use the direction of steepest descent.
if (std::abs(temp) < std::numeric_limits<double>::epsilon())
{
prev_derivative = funct_derivative;
prev_direction = -funct_derivative;
return prev_direction;
}
double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
prev_direction = -funct_derivative + b*prev_direction;
}
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
};
// ----------------------------------------------------------------------------------------
class bfgs_search_strategy
{
public:
bfgs_search_strategy() : been_used(false), been_used_twice(false) {}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double ,
const T& funct_derivative
)
{
if (been_used == false)
{
been_used = true;
H = identity_matrix<double>(x.size());
}
else
{
// update H with the BFGS formula from (3.2.12) on page 55 of Fletcher
delta = (x-prev_x);
gamma = funct_derivative-prev_derivative;
double dg = dot(delta,gamma);
// Try to set the initial value of the H matrix to something reasonable if we are still
// in the early stages of figuring out what it is. This formula below is what is suggested
// in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods.
if (been_used_twice == false)
{
double gg = trans(gamma)*gamma;
if (std::abs(gg) > std::numeric_limits<double>::epsilon())
{
const double temp = put_in_range(0.01, 100, dg/gg);
H = diagm(uniform_matrix<double>(x.size(),1, temp));
been_used_twice = true;
}
}
Hg = H*gamma;
gH = trans(trans(gamma)*H);
double gHg = trans(gamma)*H*gamma;
if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
dg != 0)
{
H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
}
else
{
H = identity_matrix<double>(H.nr());
been_used_twice = false;
}
}
prev_x = x;
prev_direction = -H*funct_derivative;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
bool been_used_twice;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
matrix<double> H;
matrix<double,0,1> delta, gamma, Hg, gH;
};
// ----------------------------------------------------------------------------------------
class lbfgs_search_strategy
{
public:
lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false) {}
lbfgs_search_strategy(const lbfgs_search_strategy& item)
{
max_size = item.max_size;
been_used = item.been_used;
prev_x = item.prev_x;
prev_derivative = item.prev_derivative;
prev_direction = item.prev_direction;
alpha = item.alpha;
dh_temp = item.dh_temp;
}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double ,
const T& funct_derivative
)
{
if (been_used == false)
{
prev_direction = -funct_derivative;
been_used = true;
}
else
{
// add an element into the stored data sequence
dh_temp.s = x - prev_x;
dh_temp.y = funct_derivative - prev_derivative;
double temp = dlib::dot(dh_temp.s, dh_temp.y);
// only accept this bit of data if temp isn't zero
if (std::abs(temp) > std::numeric_limits<double>::epsilon())
{
dh_temp.rho = 1/temp;
data.add(data.size(), dh_temp);
}
else
{
data.clear();
}
if (data.size() > 0)
{
// This block of code is from algorithm 7.4 in the Nocedal book.
prev_direction = -funct_derivative;
alpha.resize(data.size());
for (unsigned long i = data.size()-1; i < data.size(); --i)
{
alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
prev_direction -= alpha[i]*data[i].y;
}
// Take a guess at what the first H matrix should be. This formula below is what is suggested
// in the book Numerical Optimization by Nocedal and Wright in the chapter on Large Scale
// Unconstrained Optimization (in the L-BFGS section).
double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
H_0 = put_in_range(0.001, 1000.0, H_0);
prev_direction *= H_0;
for (unsigned long i = 0; i < data.size(); ++i)
{
double beta = data[i].rho*dot(data[i].y, prev_direction);
prev_direction += data[i].s * (alpha[i] - beta);
}
}
else
{
prev_derivative = -funct_derivative;
}
}
if (data.size() > max_size)
{
// remove the oldest element in the data sequence
data.remove(0, dh_temp);
}
prev_x = x;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
struct data_helper
{
matrix<double,0,1> s;
matrix<double,0,1> y;
double rho;
friend void swap(data_helper& a, data_helper& b)
{
a.s.swap(b.s);
a.y.swap(b.y);
std::swap(a.rho, b.rho);
}
};
sequence<data_helper>::kernel_2a data;
unsigned long max_size;
bool been_used;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
std::vector<double> alpha;
data_helper dh_temp;
};
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// Functions that transform other functions // Functions that transform other functions
...@@ -432,393 +117,10 @@ namespace dlib ...@@ -432,393 +117,10 @@ namespace dlib
template <typename funct> template <typename funct>
const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); } const negate_function_object<funct> negate_function(const funct& f) { return negate_function_object<funct>(f); }
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
class line_search_funct
{
public:
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
line_search_funct(const funct& f_, const T& start_, const T& direction_)
: f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0)
{}
line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r)
: f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0)
{}
line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r)
: f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r)
{}
double operator()(const double& x) const
{
return get_value(f(start + x*direction));
}
// TODO figure out some requirements for these two functions and add them to the abstract
const T& get_last_scalar_eval (
) const { return *scalar_r; }
const T& get_last_gradient_eval (
) const { return *matrix_r; }
private:
double get_value (const double& r) const
{
// save a copy of this value for later
if (scalar_r)
*scalar_r = r;
return r;
}
template <typename U>
double get_value (const U& r) const
{
// U should be a matrix type
COMPILE_TIME_ASSERT(is_matrix<U>::value);
// save a copy of this value for later
if (matrix_r)
*matrix_r = r;
return trans(r)*direction;
}
const funct& f;
const T& start;
const T& direction;
T* matrix_r;
double* scalar_r;
};
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction);
}
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction, f_out);
}
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction,grad_out);
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// Functions that perform unconstrained optimization // Functions that perform unconstrained optimization
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
inline double poly_min_extrap (
double f0,
double d0,
double f1,
double d1
)
{
const double n = 3*(f1 - f0) - 2*d0 - d1;
const double e = d0 + d1 - 2*(f1 - f0);
// find the minimum of the derivative of the polynomial
double temp = std::max(n*n - 3*e*d0,0.0);
if (temp < 0)
return 0.5;
temp = std::sqrt(temp);
if (std::abs(e) <= std::numeric_limits<double>::epsilon())
return 0.5;
// figure out the two possible min values
double x1 = (temp - n)/(3*e);
double x2 = -(temp + n)/(3*e);
// compute the value of the interpolating polynomial at these two points
double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1;
double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2;
// pick the best point
double x;
if (y1 < y2)
x = x1;
else
x = x2;
// now make sure the minimum is within the allowed range of (0,1)
return put_in_range(0,1,x);
}
// ----------------------------------------------------------------------------------------
template <
typename funct,
typename funct_der
>
double line_search (
const funct& f,
const double f0,
const funct_der& der,
const double d0,
double rho,
double sigma,
double minf
)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_function<funct_der>::value == false);
DLIB_ASSERT (
1 > sigma && sigma > rho && rho > 0,
"\tdouble line_search()"
<< "\n\tYou have given invalid arguments to this function"
<< "\n\tsigma: " << sigma
<< "\n\trho: " << rho
);
// The bracketing phase of this function is implemented according to block 2.6.2 from
// the book Practical Methods of Optimization by R. Fletcher. The sectioning
// phase is an implementation of 2.6.4 from the same book.
// tau1 > 1. Controls the alpha jump size during the search
const double tau1 = 9;
// it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function
// correctly but the specific values of tau2 and tau3 aren't super important.
const double tau2 = 1.0/10.0;
const double tau3 = 1.0/2.0;
// Stop right away and return a step size of 0 if the gradient is 0 at the starting point
if (std::abs(d0) < std::numeric_limits<double>::epsilon())
return 0;
// Figure out a reasonable upper bound on how large alpha can get.
const double mu = (minf-f0)/(rho*d0);
double alpha = 1;
if (mu < 0)
alpha = -alpha;
alpha = put_in_range(0, 0.65*mu, alpha);
double last_alpha = 0;
double last_val = f0;
double last_val_der = d0;
// the bracketing stage will find a find a range of points [a,b]
// that contains a reasonable solution to the line search
double a, b;
// These variables will hold the values and derivatives of f(a) and f(b)
double a_val, b_val, a_val_der, b_val_der;
// This thresh value represents the Wolfe curvature condition
const double thresh = std::abs(sigma*d0);
// do the bracketing stage to find the bracket range [a,b]
while (true)
{
const double val = f(alpha);
const double val_der = der(alpha);
// we are done with the line search since we found a value smaller
// than the minimum f value
if (val <= minf)
return alpha;
if (val > f0 + rho*alpha*d0 || val >= last_val)
{
a_val = last_val;
a_val_der = last_val_der;
b_val = val;
b_val_der = val_der;
a = last_alpha;
b = alpha;
break;
}
if (std::abs(val_der) <= thresh)
return alpha;
// if we are stuck not making progress then quit with the current alpha
if (last_alpha == alpha)
return alpha;
if (val_der >= 0)
{
a_val = val;
a_val_der = val_der;
b_val = last_val;
b_val_der = last_val_der;
a = alpha;
b = last_alpha;
break;
}
if (mu <= 2*alpha - last_alpha)
{
last_alpha = alpha;
alpha = mu;
}
else
{
const double temp = alpha;
double first = 2*alpha - last_alpha;
double last;
if (mu > 0)
last = std::min(mu, alpha + tau1*(alpha - last_alpha));
else
last = std::max(mu, alpha + tau1*(alpha - last_alpha));
// pick a point between first and last by doing some kind of interpolation
if (last_alpha < alpha)
alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der);
else
alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der);
alpha = put_in_range(first,last,alpha);
last_alpha = temp;
}
last_val = val;
last_val_der = val_der;
}
// Now do the sectioning phase from 2.6.4
while (true)
{
double first = a + tau2*(b-a);
double last = b - tau3*(b-a);
// use interpolation to pick alpha between first and last
alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der);
alpha = put_in_range(first,last,alpha);
const double val = f(alpha);
const double val_der = der(alpha);
// stop if the interval gets so small that it isn't shrinking any more due to rounding error
if (a == first || b == last)
{
return b;
}
if (val > f0 + rho*alpha*d0 || val >= a_val)
{
b = alpha;
b_val = val;
b_val_der = val_der;
}
else
{
if (std::abs(val_der) <= thresh)
return alpha;
if ( (b-a)*val_der >= 0)
{
b = a;
b_val = a_val;
b_val_der = a_val_der;
}
a = alpha;
a_val = val;
a_val_der = val_der;
}
}
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
...@@ -859,7 +161,7 @@ namespace dlib ...@@ -859,7 +161,7 @@ namespace dlib
double f_value = f(x); double f_value = f(x);
g = der(x); g = der(x);
while(stop_strategy.should_continue_search(x, f_value, g)) while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
{ {
s = search_strategy.get_next_direction(x, f_value, g); s = search_strategy.get_next_direction(x, f_value, g);
...@@ -870,6 +172,7 @@ namespace dlib ...@@ -870,6 +172,7 @@ namespace dlib
dot(g,s), // compute initial gradient for the line search dot(g,s), // compute initial gradient for the line search
search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f); search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f);
// Take the search step indicated by the above line search
x += alpha*s; x += alpha*s;
} }
} }
...@@ -910,10 +213,13 @@ namespace dlib ...@@ -910,10 +213,13 @@ namespace dlib
T g, s; T g, s;
// This function is basically just a copy of find_min() but with - put in the right places
// to flip things around so that it ends up looking for the max rather than the min.
double f_value = -f(x); double f_value = -f(x);
g = -der(x); g = -der(x);
while(stop_strategy.should_continue_search(x, f_value, g)) while(stop_strategy.should_continue_search(x, f_value, g) && f_value > -max_f)
{ {
s = search_strategy.get_next_direction(x, f_value, g); s = search_strategy.get_next_direction(x, f_value, g);
...@@ -924,9 +230,11 @@ namespace dlib ...@@ -924,9 +230,11 @@ namespace dlib
dot(g,s), // compute initial gradient for the line search dot(g,s), // compute initial gradient for the line search
search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), -max_f); search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), -max_f);
// Take the search step indicated by the above line search
x += alpha*s; x += alpha*s;
// we have to negate the outputs from the line search // Don't forget to negate these outputs from the line search since they are
// from the unnegated versions of f() and der()
g *= -1; g *= -1;
f_value *= -1; f_value *= -1;
} }
...@@ -971,7 +279,7 @@ namespace dlib ...@@ -971,7 +279,7 @@ namespace dlib
double f_value = f(x); double f_value = f(x);
g = derivative(f,derivative_eps)(x); g = derivative(f,derivative_eps)(x);
while(stop_strategy.should_continue_search(x, f_value, g)) while(stop_strategy.should_continue_search(x, f_value, g) && f_value > min_f)
{ {
s = search_strategy.get_next_direction(x, f_value, g); s = search_strategy.get_next_direction(x, f_value, g);
...@@ -983,6 +291,7 @@ namespace dlib ...@@ -983,6 +291,7 @@ namespace dlib
//derivative(make_line_search_function(f,x,s),derivative_eps)(0), //derivative(make_line_search_function(f,x,s),derivative_eps)(0),
search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f); search_strategy.get_wolfe_rho(), search_strategy.get_wolfe_sigma(), min_f);
// Take the search step indicated by the above line search
x += alpha*s; x += alpha*s;
g = derivative(f,derivative_eps)(x); g = derivative(f,derivative_eps)(x);
...@@ -1023,6 +332,7 @@ namespace dlib ...@@ -1023,6 +332,7 @@ namespace dlib
<< "\n\tderivative_eps: " << derivative_eps << "\n\tderivative_eps: " << derivative_eps
); );
// Just negate the necessary things and call the find_min version of this function.
find_min_using_approximate_derivatives( find_min_using_approximate_derivatives(
search_strategy, search_strategy,
stop_strategy, stop_strategy,
......
...@@ -7,6 +7,9 @@ ...@@ -7,6 +7,9 @@
#include <limits> #include <limits>
#include "../matrix/matrix_abstract.h" #include "../matrix/matrix_abstract.h"
#include "../algs.h" #include "../algs.h"
#include "optimization_search_strategies_abstract.h"
#include "optimization_stop_strategies_abstract.h"
#include "optimization_line_search_abstract.h"
namespace dlib namespace dlib
...@@ -28,7 +31,8 @@ namespace dlib ...@@ -28,7 +31,8 @@ namespace dlib
Note that if funct is a function of a double then the derivative of Note that if funct is a function of a double then the derivative of
funct is just a double but if funct is a function of a dlib::matrix (i.e. a funct is just a double but if funct is a function of a dlib::matrix (i.e. a
function of many variables) then its derivative is a gradient vector. function of many variables) then its derivative is a gradient vector (a column
vector in particular).
!*/ !*/
template < template <
...@@ -60,228 +64,173 @@ namespace dlib ...@@ -60,228 +64,173 @@ namespace dlib
- returns derivative(f, 1e-7) - returns derivative(f, 1e-7)
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
typename funct, typename funct
typename T
> >
class line_search_funct; class negate_function_object;
/*! /*!
This object is a function object that represents a line search function. This is a function object that represents the negative of some other
function.
It represents a function with the signature:
double l(double x)
!*/ !*/
template < template <
typename funct, typename funct
typename T
> >
const line_search_funct<funct,T> make_line_search_function ( const negate_function_object<funct> negate_function(
const funct& f, const funct& f
const T& start, );
const T& direction
);
/*! /*!
requires requires
- is_matrix<T>::value == true (i.e. T must be a dlib::matrix) - f == a function that returns a scalar
- f must take a dlib::matrix that is a column vector
- is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size()
(i.e. start and direction should be column vectors of the same size)
- f must return either a double or a column vector the same length and
type as start
- f(start + 1.5*direction) should be a valid expression
ensures ensures
- if (f returns a double) then - returns a function that represents the negation of f. That is,
- returns a line search function that computes l(x) == f(start + x*direction) the returned function object represents g(x) == -f(x)
- else
- returns a line search function that computes l(x) == trans(f(start + x*direction))*direction
- We assume that f is the derivative of some other function and that what
f returns is a gradient vector.
So the following two expressions both create the derivative of l(x):
- derivative(make_line_search_function(funct,start,direction))
- make_line_search_function(derivative(funct),start,direction)
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// Functions that perform unconstrained optimization // Functions that perform unconstrained optimization
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
inline double poly_min_extrap (
double f0,
double d0,
double f1,
double d1
);
/*!
ensures
- let c(x) be a 3rd degree polynomial such that:
- c(0) == f0
- c(1) == f1
- derivative of c(x) at x==0 is d0
- derivative of c(x) at x==1 is d1
- returns the point in the range [0,1] that minimizes the polynomial c(x)
!*/
// ----------------------------------------------------------------------------------------
template <
typename funct,
typename funct_der
>
double line_search (
const funct& f,
const funct_der& der,
double rho,
double sigma,
double minf,
double& f0_out
);
/*!
requires
- 1 > sigma > rho > 0
- f and der are scalar functions of scalars
(e.g. line_search_funct objects)
- der is the derivative of f
ensures
- returns a value alpha such that f(alpha) is
significantly closer to the minimum of f than f(0).
- bigger values of sigma result in a less accurate but faster line search
- f0_out == f(0)
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
typename search_strategy_type,
typename stop_strategy_type,
typename funct, typename funct,
typename funct_der, typename funct_der,
typename T typename T
> >
void find_min_quasi_newton ( void find_min (
search_strategy_type search_strategy,
stop_strategy_type stop_strategy,
const funct& f, const funct& f,
const funct_der& der, const funct_der& der,
T& x, T& x,
double min_f, double min_f
double min_delta = 1e-7
); );
/*! /*!
requires requires
- min_delta >= 0 - search_strategy == an object that defines a search strategy such as one
of the objects from dlib/optimization/optimization_search_strategies_abstract.h
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- f(x) must be a valid expression that evaluates to a double - f(x) must be a valid expression that evaluates to a double
- der(x) must be a valid expression that evaluates to the derivative of - der(x) must be a valid expression that evaluates to the derivative of
f() at x. f() at x.
- is_matrix<T>::value == true (i.e. T must be a dlib::matrix type) - is_col_vector(x) == true
- x.nc() == 1 (i.e. x must be a column vector)
ensures ensures
- Performs an unconstrained minimization of the function f() using the BFGS - Performs an unconstrained minimization of the function f() using the given
quasi newton method. The optimization stops when any of the following search_strategy.
conditions are satisfied: - The function is optimized until stop_strategy decides that an acceptable
- the change in f() from one iteration to the next is less than min_delta point has been found or f(#x) < min_f.
- f(#x) <= min_f
- #x == the value of x that was found to minimize f() - #x == the value of x that was found to minimize f()
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
typename search_strategy_type,
typename stop_strategy_type,
typename funct, typename funct,
typename funct_der,
typename T typename T
> >
void find_min_quasi_newton2 ( void find_max (
search_strategy_type search_strategy,
stop_strategy_type stop_strategy,
const funct& f, const funct& f,
const funct_der& der,
T& x, T& x,
double min_f, double max_f
double min_delta = 1e-7,
const double derivative_eps = 1e-7
); );
/*! /*!
requires requires
- min_delta >= 0 - search_strategy == an object that defines a search strategy such as one
- derivative_eps > 0 of the objects from dlib/optimization/optimization_search_strategies_abstract.h
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- f(x) must be a valid expression that evaluates to a double - f(x) must be a valid expression that evaluates to a double
- is_matrix<T>::value == true (i.e. T must be a dlib::matrix type) - der(x) must be a valid expression that evaluates to the derivative of
- x.nc() == 1 (i.e. x must be a column vector) f() at x.
- is_col_vector(x) == true
ensures ensures
- Performs an unconstrained minimization of the function f() using a - Performs an unconstrained maximization of the function f() using the given
quasi newton method. The optimization stops when any of the following search_strategy.
conditions are satisfied: - The function is optimized until stop_strategy decides that an acceptable
- the change in f() from one iteration to the next is less than min_delta point has been found or f(#x) > max_f.
- f(#x) <= min_f - #x == the value of x that was found to maximize f()
- Uses the dlib::derivative(f,derivative_eps) function to compute gradient
information
- #x == the value of x that was found to minimize f()
!*/ !*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
typename funct, typename search_strategy_type,
typename funct_der, typename stop_strategy_type,
typename funct,
typename T typename T
> >
void find_min_conjugate_gradient ( void find_min_using_approximate_derivatives (
const funct& f, search_strategy_type search_strategy,
const funct_der& der, stop_strategy_type stop_strategy,
T& x, const funct& f,
double min_f, T& x,
double min_delta = 1e-7 double min_f,
double derivative_eps = 1e-7
); );
/*! /*!
requires requires
- min_delta >= 0 - search_strategy == an object that defines a search strategy such as one
of the objects from dlib/optimization/optimization_search_strategies_abstract.h
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- f(x) must be a valid expression that evaluates to a double - f(x) must be a valid expression that evaluates to a double
- der(x) must be a valid expression that evaluates to the derivative of - is_col_vector(x) == true
f() at x. - derivative_eps > 0
- is_matrix<T>::value == true (i.e. T must be a dlib::matrix type)
- x.nc() == 1 (i.e. x must be a column vector)
ensures ensures
- Performs an unconstrained minimization of the function f() using a - Performs an unconstrained minimization of the function f() using the given
conjugate gradient method. The optimization stops when any of the following search_strategy.
conditions are satisfied: - The function is optimized until stop_strategy decides that an acceptable
- the change in f() from one iteration to the next is less than min_delta point has been found or f(#x) < min_f.
- f(#x) <= min_f
- #x == the value of x that was found to minimize f() - #x == the value of x that was found to minimize f()
- Uses the dlib::derivative(f,derivative_eps) function to compute gradient
information.
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
typename funct, typename search_strategy_type,
typename stop_strategy_type,
typename funct,
typename T typename T
> >
void find_min_conjugate_gradient2 ( void find_max_using_approximate_derivatives (
const funct& f, search_strategy_type search_strategy,
T& x, stop_strategy_type stop_strategy,
double min_f, const funct& f,
double min_delta = 1e-7, T& x,
const double derivative_eps = 1e-7 double max_f,
double derivative_eps = 1e-7
); );
/*! /*!
requires requires
- min_delta >= 0 - search_strategy == an object that defines a search strategy such as one
- derivative_eps > 0 of the objects from dlib/optimization/optimization_search_strategies_abstract.h
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- f(x) must be a valid expression that evaluates to a double - f(x) must be a valid expression that evaluates to a double
- der(x) must be a valid expression that evaluates to the derivative of - is_col_vector(x) == true
f() at x. - derivative_eps > 0
- is_matrix<T>::value == true (i.e. T must be a dlib::matrix type)
- x.nc() == 1 (i.e. x must be a column vector)
ensures ensures
- Performs an unconstrained minimization of the function f() using a - Performs an unconstrained maximization of the function f() using the given
conjugate gradient method. The optimization stops when any of the following search_strategy.
conditions are satisfied: - The function is optimized until stop_strategy decides that an acceptable
- the change in f() from one iteration to the next is less than min_delta point has been found or f(#x) > max_f.
- f(#x) <= min_f - #x == the value of x that was found to maximize f()
- Uses the dlib::derivative(f,derivative_eps) function to compute gradient - Uses the dlib::derivative(f,derivative_eps) function to compute gradient
information information.
- #x == the value of x that was found to minimize f()
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
#define DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
#include <cmath>
#include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_line_search_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
class line_search_funct
{
public:
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
line_search_funct(const funct& f_, const T& start_, const T& direction_)
: f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(0)
{}
line_search_funct(const funct& f_, const T& start_, const T& direction_, T& r)
: f(f_),start(start_), direction(direction_), matrix_r(&r), scalar_r(0)
{}
line_search_funct(const funct& f_, const T& start_, const T& direction_, double& r)
: f(f_),start(start_), direction(direction_), matrix_r(0), scalar_r(&r)
{}
double operator()(const double& x) const
{
return get_value(f(start + x*direction));
}
private:
double get_value (const double& r) const
{
// save a copy of this value for later
if (scalar_r)
*scalar_r = r;
return r;
}
template <typename U>
double get_value (const U& r) const
{
// U should be a matrix type
COMPILE_TIME_ASSERT(is_matrix<U>::value);
// save a copy of this value for later
if (matrix_r)
*matrix_r = r;
return dot(r,direction);
}
const funct& f;
const T& start;
const T& direction;
T* matrix_r;
double* scalar_r;
};
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction);
}
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, double& f_out)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction, f_out);
}
// ----------------------------------------------------------------------------------------
template <typename funct, typename T>
const line_search_funct<funct,T> make_line_search_function(const funct& f, const T& start, const T& direction, T& grad_out)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_matrix<T>::value);
DLIB_ASSERT (
is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size(),
"\tline_search_funct make_line_search_function(f,start,direction)"
<< "\n\tYou have to supply column vectors to this function"
<< "\n\tstart.nc(): " << start.nc()
<< "\n\tdirection.nc(): " << direction.nc()
<< "\n\tstart.nr(): " << start.nr()
<< "\n\tdirection.nr(): " << direction.nr()
);
return line_search_funct<funct,T>(f,start,direction,grad_out);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
inline double poly_min_extrap (
double f0,
double d0,
double f1,
double d1
)
{
const double n = 3*(f1 - f0) - 2*d0 - d1;
const double e = d0 + d1 - 2*(f1 - f0);
// find the minimum of the derivative of the polynomial
double temp = std::max(n*n - 3*e*d0,0.0);
if (temp < 0)
return 0.5;
temp = std::sqrt(temp);
if (std::abs(e) <= std::numeric_limits<double>::epsilon())
return 0.5;
// figure out the two possible min values
double x1 = (temp - n)/(3*e);
double x2 = -(temp + n)/(3*e);
// compute the value of the interpolating polynomial at these two points
double y1 = f0 + d0*x1 + n*x1*x1 + e*x1*x1*x1;
double y2 = f0 + d0*x2 + n*x2*x2 + e*x2*x2*x2;
// pick the best point
double x;
if (y1 < y2)
x = x1;
else
x = x2;
// now make sure the minimum is within the allowed range of (0,1)
return put_in_range(0,1,x);
}
// ----------------------------------------------------------------------------------------
template <
typename funct,
typename funct_der
>
double line_search (
const funct& f,
const double f0,
const funct_der& der,
const double d0,
double rho,
double sigma,
double min_f
)
{
// You get an error on this line when you pass in a global function to this function.
// You have to either use a function object or pass a pointer to your global function
// by taking its address using the & operator. (This check is here because gcc 4.0
// has a bug that causes it to silently corrupt return values from functions that
// invoked through a reference)
COMPILE_TIME_ASSERT(is_function<funct>::value == false);
COMPILE_TIME_ASSERT(is_function<funct_der>::value == false);
DLIB_ASSERT (
1 > sigma && sigma > rho && rho > 0,
"\tdouble line_search()"
<< "\n\tYou have given invalid arguments to this function"
<< "\n\tsigma: " << sigma
<< "\n\trho: " << rho
);
// The bracketing phase of this function is implemented according to block 2.6.2 from
// the book Practical Methods of Optimization by R. Fletcher. The sectioning
// phase is an implementation of 2.6.4 from the same book.
// tau1 > 1. Controls the alpha jump size during the search
const double tau1 = 9;
// it must be the case that 0 < tau2 < tau3 <= 1/2 for the algorithm to function
// correctly but the specific values of tau2 and tau3 aren't super important.
const double tau2 = 1.0/10.0;
const double tau3 = 1.0/2.0;
// Stop right away and return a step size of 0 if the gradient is 0 at the starting point
if (std::abs(d0) < std::numeric_limits<double>::epsilon())
return 0;
// Stop right away if the current value is good enough according to min_f
if (f0 <= min_f)
return 0;
// Figure out a reasonable upper bound on how large alpha can get.
const double mu = (min_f-f0)/(rho*d0);
double alpha = 1;
if (mu < 0)
alpha = -alpha;
alpha = put_in_range(0, 0.65*mu, alpha);
double last_alpha = 0;
double last_val = f0;
double last_val_der = d0;
// the bracketing stage will find a find a range of points [a,b]
// that contains a reasonable solution to the line search
double a, b;
// These variables will hold the values and derivatives of f(a) and f(b)
double a_val, b_val, a_val_der, b_val_der;
// This thresh value represents the Wolfe curvature condition
const double thresh = std::abs(sigma*d0);
// do the bracketing stage to find the bracket range [a,b]
while (true)
{
const double val = f(alpha);
const double val_der = der(alpha);
// we are done with the line search since we found a value smaller
// than the minimum f value
if (val <= min_f)
return alpha;
if (val > f0 + rho*alpha*d0 || val >= last_val)
{
a_val = last_val;
a_val_der = last_val_der;
b_val = val;
b_val_der = val_der;
a = last_alpha;
b = alpha;
break;
}
if (std::abs(val_der) <= thresh)
return alpha;
// if we are stuck not making progress then quit with the current alpha
if (last_alpha == alpha)
return alpha;
if (val_der >= 0)
{
a_val = val;
a_val_der = val_der;
b_val = last_val;
b_val_der = last_val_der;
a = alpha;
b = last_alpha;
break;
}
if (mu <= 2*alpha - last_alpha)
{
last_alpha = alpha;
alpha = mu;
}
else
{
const double temp = alpha;
double first = 2*alpha - last_alpha;
double last;
if (mu > 0)
last = std::min(mu, alpha + tau1*(alpha - last_alpha));
else
last = std::max(mu, alpha + tau1*(alpha - last_alpha));
// pick a point between first and last by doing some kind of interpolation
if (last_alpha < alpha)
alpha = last_alpha + (alpha-last_alpha)*poly_min_extrap(last_val, last_val_der, val, val_der);
else
alpha = alpha + (last_alpha-alpha)*poly_min_extrap(val, val_der, last_val, last_val_der);
alpha = put_in_range(first,last,alpha);
last_alpha = temp;
}
last_val = val;
last_val_der = val_der;
}
// Now do the sectioning phase from 2.6.4
while (true)
{
double first = a + tau2*(b-a);
double last = b - tau3*(b-a);
// use interpolation to pick alpha between first and last
alpha = a + (b-a)*poly_min_extrap(a_val, a_val_der, b_val, b_val_der);
alpha = put_in_range(first,last,alpha);
const double val = f(alpha);
const double val_der = der(alpha);
// we are done with the line search since we found a value smaller
// than the minimum f value
if (val <= min_f)
return alpha;
// stop if the interval gets so small that it isn't shrinking any more due to rounding error
if (a == first || b == last)
{
return b;
}
if (val > f0 + rho*alpha*d0 || val >= a_val)
{
b = alpha;
b_val = val;
b_val_der = val_der;
}
else
{
if (std::abs(val_der) <= thresh)
return alpha;
if ( (b-a)*val_der >= 0)
{
b = a;
b_val = a_val;
b_val_der = a_val_der;
}
a = alpha;
a_val = val;
a_val_der = val_der;
}
}
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_LINE_SEARCH_H_
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATIOn_ABSTRACT_
#ifdef DLIB_OPTIMIZATIOn_ABSTRACT_
#include <cmath>
#include <limits>
#include "../matrix/matrix_abstract.h"
#include "../algs.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename funct,
typename T
>
class line_search_funct;
/*!
This object is a function object that represents a line search function.
Moreover, it represents a function with the signature:
double l(double x)
!*/
template <
typename funct,
typename T
>
const line_search_funct<funct,T> make_line_search_function (
const funct& f,
const T& start,
const T& direction
);
/*!
requires
- is_col_vector(start) && is_col_vector(direction) && start.size() == direction.size()
(i.e. start and direction should be column vectors of the same size)
- f must return either a double or a column vector the same length as start
- f(start + 1.5*direction) should be a valid expression
ensures
- if (f returns a double) then
- returns a line search function that computes l(x) == f(start + x*direction)
- else
- returns a line search function that computes l(x) == dot(f(start + x*direction),direction).
That is, we assume f is the derivative of some other function and that what
f returns is a gradient vector.
So the following two expressions both create the derivative of l(x):
- derivative(make_line_search_function(funct,start,direction))
- make_line_search_function(derivative(funct),start,direction)
!*/
template <
typename funct,
typename T
>
const line_search_funct<funct,T> make_line_search_function (
const funct& f,
const T& start,
const T& direction,
double& f_out
);
/*!
This function is identical to the above three argument version of make_line_search_function()
except that, if f() outputs a double, every time f() is evaluated its output is also stored
into f_out.
!*/
template <
typename funct,
typename T
>
const line_search_funct<funct,T> make_line_search_function (
const funct& f,
const T& start,
const T& direction,
T& gradient_out
);
/*!
This function is identical to the above three argument version of make_line_search_function()
except that, if f() outputs a column vector, every time f() is evaluated its output is also
stored into gradient_out.
!*/
// ----------------------------------------------------------------------------------------
inline double poly_min_extrap (
double f0,
double d0,
double f1,
double d1
);
/*!
ensures
- let c(x) be a 3rd degree polynomial such that:
- c(0) == f0
- c(1) == f1
- derivative of c(x) at x==0 is d0
- derivative of c(x) at x==1 is d1
- returns the point in the range [0,1] that minimizes the polynomial c(x)
!*/
// ----------------------------------------------------------------------------------------
template <
typename funct,
typename funct_der
>
double line_search (
const funct& f,
const double f0,
const funct_der& der,
const double d0,
double rho,
double sigma,
double min_f
)
/*!
requires
- 0 < rho < sigma < 1
- f and der are scalar functions of scalars
(e.g. line_search_funct objects)
- der is the derivative of f
- f0 == f(0)
- d0 == der(0)
ensures
- Performs a line search and uses the strong Wolfe conditions to decide when
the search can stop.
- rho == the parameter of the Wolfe sufficient decrease condition
- sigma == the parameter of the Wolfe curvature condition
- returns a value alpha such that f(alpha) is significantly closer to
the minimum of f than f(0).
- It is assumed that the minimum possible value of f(x) is min_f. So if
an alpha is found such that f(alpha) <= min_f then the search stops
immediately.
!*/
/*
A good discussion of the Wolfe conditions and line search algorithms in
general can be found in the book Practical Methods of Optimization by R. Fletcher
and also in the more recent book Numerical Optimization by Nocedal and Wright.
*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_ABSTRACT_
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
#include <cmath>
#include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_search_strategies_abstract.h"
#include "../sequence.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class cg_search_strategy
{
public:
cg_search_strategy() : been_used(false) {}
double get_wolfe_rho (
) const { return 0.001; }
double get_wolfe_sigma (
) const { return 0.01; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& ,
const double ,
const T& funct_derivative
)
{
if (been_used == false)
{
been_used = true;
prev_direction = -funct_derivative;
}
else
{
// Use the Polak-Ribiere (4.1.12) conjugate gradient described by Fletcher on page 83
const double temp = trans(prev_derivative)*prev_derivative;
// If this value hits zero then just use the direction of steepest descent.
if (std::abs(temp) < std::numeric_limits<double>::epsilon())
{
prev_derivative = funct_derivative;
prev_direction = -funct_derivative;
return prev_direction;
}
double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
prev_direction = -funct_derivative + b*prev_direction;
}
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
};
// ----------------------------------------------------------------------------------------
class bfgs_search_strategy
{
public:
bfgs_search_strategy() : been_used(false), been_used_twice(false) {}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double ,
const T& funct_derivative
)
{
if (been_used == false)
{
been_used = true;
H = identity_matrix<double>(x.size());
}
else
{
// update H with the BFGS formula from (3.2.12) on page 55 of Fletcher
delta = (x-prev_x);
gamma = funct_derivative-prev_derivative;
double dg = dot(delta,gamma);
// Try to set the initial value of the H matrix to something reasonable if we are still
// in the early stages of figuring out what it is. This formula below is what is suggested
// in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods.
if (been_used_twice == false)
{
double gg = trans(gamma)*gamma;
if (std::abs(gg) > std::numeric_limits<double>::epsilon())
{
const double temp = put_in_range(0.01, 100, dg/gg);
H = diagm(uniform_matrix<double>(x.size(),1, temp));
been_used_twice = true;
}
}
Hg = H*gamma;
gH = trans(trans(gamma)*H);
double gHg = trans(gamma)*H*gamma;
if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
dg != 0)
{
H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
}
else
{
H = identity_matrix<double>(H.nr());
been_used_twice = false;
}
}
prev_x = x;
prev_direction = -H*funct_derivative;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
bool been_used;
bool been_used_twice;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
matrix<double> H;
matrix<double,0,1> delta, gamma, Hg, gH;
};
// ----------------------------------------------------------------------------------------
class lbfgs_search_strategy
{
public:
lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false) {}
lbfgs_search_strategy(const lbfgs_search_strategy& item)
{
max_size = item.max_size;
been_used = item.been_used;
prev_x = item.prev_x;
prev_derivative = item.prev_derivative;
prev_direction = item.prev_direction;
alpha = item.alpha;
dh_temp = item.dh_temp;
}
double get_wolfe_rho (
) const { return 0.01; }
double get_wolfe_sigma (
) const { return 0.9; }
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double ,
const T& funct_derivative
)
{
if (been_used == false)
{
prev_direction = -funct_derivative;
been_used = true;
}
else
{
// add an element into the stored data sequence
dh_temp.s = x - prev_x;
dh_temp.y = funct_derivative - prev_derivative;
double temp = dlib::dot(dh_temp.s, dh_temp.y);
// only accept this bit of data if temp isn't zero
if (std::abs(temp) > std::numeric_limits<double>::epsilon())
{
dh_temp.rho = 1/temp;
data.add(data.size(), dh_temp);
}
else
{
data.clear();
}
if (data.size() > 0)
{
// This block of code is from algorithm 7.4 in the Nocedal book.
prev_direction = -funct_derivative;
alpha.resize(data.size());
for (unsigned long i = data.size()-1; i < data.size(); --i)
{
alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
prev_direction -= alpha[i]*data[i].y;
}
// Take a guess at what the first H matrix should be. This formula below is what is suggested
// in the book Numerical Optimization by Nocedal and Wright in the chapter on Large Scale
// Unconstrained Optimization (in the L-BFGS section).
double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
H_0 = put_in_range(0.001, 1000.0, H_0);
prev_direction *= H_0;
for (unsigned long i = 0; i < data.size(); ++i)
{
double beta = data[i].rho*dot(data[i].y, prev_direction);
prev_direction += data[i].s * (alpha[i] - beta);
}
}
else
{
prev_derivative = -funct_derivative;
}
}
if (data.size() > max_size)
{
// remove the oldest element in the data sequence
data.remove(0, dh_temp);
}
prev_x = x;
prev_derivative = funct_derivative;
return prev_direction;
}
private:
struct data_helper
{
matrix<double,0,1> s;
matrix<double,0,1> y;
double rho;
friend void swap(data_helper& a, data_helper& b)
{
a.s.swap(b.s);
a.y.swap(b.y);
std::swap(a.rho, b.rho);
}
};
sequence<data_helper>::kernel_2a data;
unsigned long max_size;
bool been_used;
matrix<double,0,1> prev_x;
matrix<double,0,1> prev_derivative;
matrix<double,0,1> prev_direction;
std::vector<double> alpha;
data_helper dh_temp;
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
#ifdef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
#include <cmath>
#include <limits>
#include "../matrix/matrix_abstract.h"
#include "../algs.h"
namespace dlib
{
/*
A good discussion of the search strategies in this file can be found in the
following book: Numerical Optimization by Nocedal and Wright.
*/
// ----------------------------------------------------------------------------------------
class cg_search_strategy
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a strategy for determining which direction
a line search should be carried out along. This particular object
is an implementation of the Polak-Ribiere conjugate gradient method
for determining this direction.
This method uses an amount of memory that is linear in the number
of variables to be optimized. So it is capable of handling problems
with a very large number of variables. However, it is generally
not as good as the L-BFGS algorithm (which is defined below in
the lbfgs_search_strategy class).
!*/
public:
cg_search_strategy(
);
/*!
ensures
- This object is properly initialized and ready to generate
search directions.
!*/
double get_wolfe_rho (
) const;
/*!
ensures
- returns the value of the Wolfe rho parameter that should be used when
this search strategy is used with the line_search() function.
!*/
double get_wolfe_sigma (
) const;
/*!
ensures
- returns the value of the Wolfe sigma parameter that should be used when
this search strategy is used with the line_search() function.
!*/
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double funct_value,
const T& funct_derivative
);
/*!
requires
- for some function f():
- funct_value == f(x)
- funct_derivative == derivative(f)(x)
ensures
- Assuming that a line search is going to be conducted starting from the point x,
this function returns the direction in which the search should proceed.
!*/
};
// ----------------------------------------------------------------------------------------
class bfgs_search_strategy
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a strategy for determining which direction
a line search should be carried out along. This particular object
is an implementation of the BFGS quasi-newton method for determining
this direction.
This method uses an amount of memory that is quadratic in the number
of variables to be optimized. It is generally very effective but
if your problem has a very large number of variables then it isn't
appropriate. Instead You should try the lbfgs_search_strategy.
!*/
public:
bfgs_search_strategy(
);
/*!
ensures
- This object is properly initialized and ready to generate
search directions.
!*/
double get_wolfe_rho (
) const;
/*!
ensures
- returns the value of the Wolfe rho parameter that should be used when
this search strategy is used with the line_search() function.
!*/
double get_wolfe_sigma (
) const;
/*!
ensures
- returns the value of the Wolfe sigma parameter that should be used when
this search strategy is used with the line_search() function.
!*/
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double funct_value,
const T& funct_derivative
);
/*!
requires
- for some function f():
- funct_value == f(x)
- funct_derivative == derivative(f)(x)
ensures
- Assuming that a line search is going to be conducted starting from the point x,
this function returns the direction in which the search should proceed.
!*/
};
// ----------------------------------------------------------------------------------------
class lbfgs_search_strategy
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a strategy for determining which direction
a line search should be carried out along. This particular object
is an implementation of the L-BFGS quasi-newton method for determining
this direction.
This method uses an amount of memory that is linear in the number
of variables to be optimized. This makes it an excellent method
to use when an optimization problem has a large number of variables.
!*/
public:
lbfgs_search_strategy(
unsigned long max_size
);
/*!
requires
- max_size > 0
ensures
- This object is properly initialized and ready to generate
search directions.
- L-BFGS works by remembering a certain number of position and gradient
pairs. It uses this remembered information to compute search directions.
The max_size argument determines how many of these pairs will be remembered.
Typically, using between 3 and 30 pairs performs well for many problems.
!*/
double get_wolfe_rho (
) const;
/*!
ensures
- returns the value of the Wolfe rho parameter that should be used when
this search strategy is used with the line_search() function.
!*/
double get_wolfe_sigma (
) const;
/*!
ensures
- returns the value of the Wolfe sigma parameter that should be used when
this search strategy is used with the line_search() function.
!*/
template <typename T>
const matrix<double,0,1>& get_next_direction (
const T& x,
const double funct_value,
const T& funct_derivative
);
/*!
requires
- for some function f():
- funct_value == f(x)
- funct_derivative == derivative(f)(x)
ensures
- Assuming that a line search is going to be conducted starting from the point x,
this function returns the direction in which the search should proceed.
!*/
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_ABSTRACT_
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
#define DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
#include <cmath>
#include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_stop_strategies_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class objective_delta_stop_strategy
{
public:
objective_delta_stop_strategy (
double min_delta = 1e-7
) : _been_used(false), _min_delta(min_delta), _max_iter(0), _cur_iter(0), _prev_funct_value(0) {}
objective_delta_stop_strategy (
double min_delta,
unsigned long max_iter
) : _been_used(false), _min_delta(min_delta), _max_iter(max_iter), _cur_iter(0), _prev_funct_value(0) {}
template <typename T>
bool should_continue_search (
const T& ,
const double funct_value,
const T&
)
{
++_cur_iter;
if (_been_used)
{
// Check if we have hit the max allowable number of iterations. (but only
// check if _max_iter is enabled (i.e. not 0)).
if (_max_iter != 0 && _cur_iter > _max_iter)
return false;
// check if the function change was too small
if (std::abs(funct_value - _prev_funct_value) < _min_delta)
return false;
}
_been_used = true;
_prev_funct_value = funct_value;
return true;
}
private:
bool _been_used;
double _min_delta;
unsigned long _max_iter;
unsigned long _cur_iter;
double _prev_funct_value;
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_H_
// Copyright (C) 2008 Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
#ifdef DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
#include <cmath>
#include <limits>
#include "../matrix/matrix_abstract.h"
#include "../algs.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class objective_delta_stop_strategy
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a strategy for deciding if an optimization
algorithm should terminate. This particular object looks at the
change in the objective function from one iteration to the next and
bases its decision on how large this change is. If the change
is below a user given threshold then the search stops.
!*/
public:
objective_delta_stop_strategy (
double min_delta = 1e-7
);
/*!
requires
- min_delta >= 0
ensures
- This stop strategy object will only consider a search to be complete
if a change in an objective function from one iteration to the next
is less than min_delta.
!*/
objective_delta_stop_strategy (
double min_delta,
unsigned long max_iter
);
/*!
requires
- min_delta >= 0
- max_iter > 0
ensures
- This stop strategy object will only consider a search to be complete
if a change in an objective function from one iteration to the next
is less than min_delta or more than max_iter iterations has been
executed.
!*/
template <typename T>
bool should_continue_search (
const T& x,
const double funct_value,
const T& funct_derivative
);
/*!
requires
- for some function f():
- funct_value == f(x)
- funct_derivative == derivative(f)(x)
ensures
- returns true if the point x doest not satisfy the stopping condition and
false otherwise.
!*/
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATIOn_STOP_STRATEGIES_ABSTRACT_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment