Commit 82fb3682 authored by Davis King

Refactored the code in the reduced_decision_function_trainer2. Part of it has been turned into

a global function called approximate_distance_function() which performs the main optimization.  The
reduced_decision_function_trainer2 now depends on this global function.  This change makes this
function optimizer available for other purposes besides use in the reduced_decision_function_trainer2
object.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404125
parent 543e289f
......@@ -144,94 +144,35 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename trainer_type
>
class reduced_decision_function_trainer2
namespace red_impl
{
public:
typedef typename trainer_type::kernel_type kernel_type;
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef typename trainer_type::trained_function_type trained_function_type;
reduced_decision_function_trainer2 () : num_bv(0) {}
reduced_decision_function_trainer2 (
const trainer_type& trainer_,
const long num_sb_,
const double eps_ = 1e-3
) :
trainer(trainer_),
num_bv(num_sb_),
eps(eps_)
{
COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0 && eps > 0,
"\t reduced_decision_function_trainer2()"
<< "\n\t you have given invalid arguments to this function"
<< "\n\t num_bv: " << num_bv
<< "\n\t eps: " << eps
);
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0,
"\t reduced_decision_function_trainer2::train(x,y)"
<< "\n\t You have tried to use an uninitialized version of this object"
<< "\n\t num_bv: " << num_bv );
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
private:
// ------------------------------------------------------------------------------------
template <typename kernel_type>
class objective
{
/*
This object represents the objective function we will try to
minimize in the final stage of this reduced set method.
minimize in approximate_distance_function().
The objective is the distance, in kernel induced feature space, between
the original decision function and the approximated version.
the original distance function and the approximated version.
*/
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
public:
objective(
const decision_function<kernel_type>& dec_funct_,
const distance_function<kernel_type>& dist_funct_,
matrix<scalar_type,0,1,mem_manager_type>& b_,
matrix<sample_type,0,1,mem_manager_type>& out_vectors_
) :
dec_funct(dec_funct_),
dist_funct(dist_funct_),
b(b_),
out_vectors(out_vectors_)
{
const kernel_type k(dec_funct.kernel_function);
// here we compute a term in the objective function that is a constant. So
// do it in the constructor so we don't have to recompute it every time
// the objective is evaluated.
bias = 0;
for (long i = 0; i < dec_funct.alpha.size(); ++i)
{
for (long j = 0; j < dec_funct.alpha.size(); ++j)
{
bias += dec_funct.alpha(i)*dec_funct.alpha(j)*
k(dec_funct.basis_vectors(i), dec_funct.basis_vectors(j));
}
}
}
const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector (
......@@ -239,7 +180,7 @@ namespace dlib
/*!
ensures
- returns a vector that contains all the information necessary to
reproduce the current state of the approximated decision function
reproduce the current state of the approximated distance function
!*/
{
matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr());
......@@ -270,7 +211,7 @@ namespace dlib
- z came from the state_to_vector() function or has a compatible format
ensures
- loads the vector z into the state variables of the approximate
decision function (i.e. b and out_vectors)
distance function (i.e. b and out_vectors)
!*/
{
long i = 0;
......@@ -295,20 +236,20 @@ namespace dlib
) const
/*!
ensures
- loads the current approximate decision function with z
- returns the distance between the original decision function
- loads the current approximate distance function with z
- returns the distance between the original distance function
and the approximate one.
!*/
{
vector_to_state(z);
const kernel_type k(dec_funct.kernel_function);
const kernel_type k(dist_funct.get_kernel());
double temp = 0;
for (long i = 0; i < out_vectors.size(); ++i)
{
for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j)
for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
{
temp -= b(i)*dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j));
temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
}
}
......@@ -322,14 +263,12 @@ namespace dlib
}
}
return temp + bias;
return temp + dist_funct.get_squared_norm();
}
private:
scalar_type bias;
const decision_function<kernel_type>& dec_funct;
const distance_function<kernel_type>& dist_funct;
matrix<scalar_type,0,1,mem_manager_type>& b;
matrix<sample_type,0,1,mem_manager_type>& out_vectors;
......@@ -337,20 +276,24 @@ namespace dlib
// ------------------------------------------------------------------------------------
template <typename kernel_type>
class objective_derivative
{
/*!
This object represents the derivative of the objective object
!*/
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
public:
objective_derivative(
const decision_function<kernel_type>& dec_funct_,
const distance_function<kernel_type>& dist_funct_,
matrix<scalar_type,0,1,mem_manager_type>& b_,
matrix<sample_type,0,1,mem_manager_type>& out_vectors_
) :
dec_funct(dec_funct_),
dist_funct(dist_funct_),
b(b_),
out_vectors(out_vectors_)
{
......@@ -364,7 +307,7 @@ namespace dlib
- z came from the state_to_vector() function or has a compatible format
ensures
- loads the vector z into the state variables of the approximate
decision function (i.e. b and out_vectors)
distance function (i.e. b and out_vectors)
!*/
{
long i = 0;
......@@ -389,15 +332,15 @@ namespace dlib
) const
/*!
ensures
- loads the current approximate decision function with z
- loads the current approximate distance function with z
- returns the derivative of the distance between the original
decision function and the approximate one.
distance function and the approximate one.
!*/
{
vector_to_state(z);
res.set_size(z.nr());
set_all_elements(res,0);
const kernel_type k(dec_funct.kernel_function);
const kernel_type k(dist_funct.get_kernel());
const kernel_derivative<kernel_type> K_der(k);
// first compute the gradient for the beta weights
......@@ -410,9 +353,9 @@ namespace dlib
}
for (long i = 0; i < out_vectors.size(); ++i)
{
for (long j = 0; j < dec_funct.basis_vectors.size(); ++j)
for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j)
{
res(i) -= dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j));
res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
}
}
......@@ -428,12 +371,12 @@ namespace dlib
{
temp += b(j)*K_der(out_vectors(j), out_vectors(i));
}
for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j)
for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
{
temp -= dec_funct.alpha(j)*K_der(dec_funct.basis_vectors(j), out_vectors(i) );
temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) );
}
// store the gradient for out_vectors[i] into result in the proper spot
// store the gradient for out_vectors(i) into result in the proper spot
set_subm(res,pos,0,num,1) = b(i)*temp;
pos += num;
}
......@@ -448,7 +391,7 @@ namespace dlib
mutable matrix<scalar_type, 0, 1, mem_manager_type> res;
mutable sample_type temp;
const decision_function<kernel_type>& dec_funct;
const distance_function<kernel_type>& dist_funct;
matrix<scalar_type,0,1,mem_manager_type>& b;
matrix<sample_type,0,1,mem_manager_type>& out_vectors;
......@@ -456,6 +399,142 @@ namespace dlib
// ------------------------------------------------------------------------------------
}
// Finds a distance_function that approximates target, beginning the search from the
// basis vectors stored in starting_point.
//
// Parameters:
//   stop_strategy  - stop strategy object handed to find_min() to decide when the
//                    optimization ends.
//   target         - the distance_function to approximate.
//   starting_point - supplies the initial basis vectors.  Only its basis vectors and
//                    kernel are read here; its weights are not used because the
//                    weights are recomputed from scratch below.
//
// Returns a distance_function built from the optimized weights (beta), target's
// kernel, and the optimized basis vectors.  Basis vectors whose weight is
// essentially zero are removed, so the result may hold fewer basis vectors than
// starting_point, but it always keeps at least one.
template <
typename K,
typename stop_strategy_type
>
distance_function<K> approximate_distance_function (
stop_strategy_type stop_strategy,
const distance_function<K>& target,
const distance_function<K>& starting_point
)
{
// make sure requires clause is not broken: both inputs need at least one basis
// vector and they must share the same kernel.
DLIB_ASSERT(target.get_basis_vectors().size() > 0 &&
starting_point.get_basis_vectors().size() > 0 &&
target.get_kernel() == starting_point.get_kernel(),
"\t distance_function approximate_distance_function()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t target.get_basis_vectors().size(): " << target.get_basis_vectors().size()
<< "\n\t starting_point.get_basis_vectors().size(): " << starting_point.get_basis_vectors().size()
<< "\n\t target.kernel_function == starting_point.kernel_function: " << (target.get_kernel() == starting_point.get_kernel())
);
// brings objective and objective_derivative into scope
using namespace red_impl;
// The next few statements just find the best weights with which to approximate
// the target object with the set of vectors in the starting_point object.  This
// is really just a simple application of some linear algebra.  For the details
// see page 554 of Learning with kernels by Scholkopf and Smola where they talk
// about "Optimal Expansion Coefficients."
const K kern(target.get_kernel());
typedef typename K::scalar_type scalar_type;
typedef typename K::sample_type sample_type;
typedef typename K::mem_manager_type mem_manager_type;
matrix<scalar_type,0,1,mem_manager_type> beta;
// Now we compute the first approximate distance function.
beta = pinv(kernel_matrix(kern,starting_point.get_basis_vectors())) *
(kernel_matrix(kern,starting_point.get_basis_vectors(),target.get_basis_vectors())*target.get_alpha());
matrix<sample_type,0,1,mem_manager_type> out_vectors(starting_point.get_basis_vectors());
// Now setup to do a global optimization of all the parameters in the approximate
// distance function.  Note that obj and obj_der hold references to beta and
// out_vectors, so evaluating them overwrites these two matrices.
const objective<K> obj(target, beta, out_vectors);
const objective_derivative<K> obj_der(target, beta, out_vectors);
matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
// perform a full optimization of all the parameters (i.e. both beta and the basis vectors together)
// NOTE(review): the trailing 0 is presumably find_min()'s lower bound on the
// objective value -- confirm against find_min()'s specification.
find_min(lbfgs_search_strategy(20),
stop_strategy,
obj, obj_der, opt_starting_point, 0);
// now make sure that the final optimized value is loaded into the beta and
// out_vectors matrices
obj.vector_to_state(opt_starting_point);
// Do a final reoptimization of beta just to make sure it is optimal given the new
// set of basis vectors.
beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,target.get_basis_vectors())*target.get_alpha());
// It is possible that some of the beta weights will be very close to zero.  Let's remove
// the basis vectors with these essentially zero weights.  The threshold is relative
// to the largest weight magnitude and is computed once, before any row is removed.
const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
for (long i = 0; i < beta.size(); ++i)
{
// if beta(i) is zero (but leave at least one beta no matter what)
if (std::abs(beta(i)) < eps && beta.size() > 1)
{
beta = remove_row(beta, i);
out_vectors = remove_row(out_vectors, i);
// the rows after i just moved up by one, so revisit index i on the next pass
--i;
}
}
return distance_function<K>(beta, kern, out_vectors);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename trainer_type
>
class reduced_decision_function_trainer2
{
public:
typedef typename trainer_type::kernel_type kernel_type;
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef typename trainer_type::trained_function_type trained_function_type;
// Default constructor.  Leaves the object in an "uninitialized" state: num_bv is 0,
// which the DLIB_ASSERT in train() rejects.
// NOTE(review): eps is not given a value here -- confirm that is intended.
reduced_decision_function_trainer2 () : num_bv(0) {}
// Constructs an initialized reduced_decision_function_trainer2.
//   trainer_ - used to initialize the trainer member.
//   num_sb_  - stored in num_bv; must be > 0.
//   eps_     - stored in eps; must be > 0 (defaults to 1e-3).
// Both numeric requirements are checked by the DLIB_ASSERT in the body.
reduced_decision_function_trainer2 (
const trainer_type& trainer_,
const long num_sb_,
const double eps_ = 1e-3
) :
trainer(trainer_),
num_bv(num_sb_),
eps(eps_)
{
// this class only compiles when sample_type is a dlib matrix type
COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0 && eps > 0,
"\t reduced_decision_function_trainer2()"
<< "\n\t you have given invalid arguments to this function"
<< "\n\t num_bv: " << num_bv
<< "\n\t eps: " << eps
);
}
// Trains on the samples x and the labels y and returns the resulting
// decision_function.  Requires num_bv > 0, i.e. the object must not have been
// default constructed.  x and y are wrapped with vector_to_matrix() and forwarded
// to do_train(), which does the actual work.
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0,
"\t reduced_decision_function_trainer2::train(x,y)"
<< "\n\t You have tried to use an uninitialized version of this object"
<< "\n\t num_bv: " << num_bv );
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
private:
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
......@@ -467,64 +546,21 @@ namespace dlib
{
// get the decision function object we are going to try and approximate
const decision_function<kernel_type>& dec_funct = trainer.train(x,y);
const kernel_type kern(dec_funct.kernel_function);
// now find a linearly independent subset of the training points of num_bv points.
linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv);
linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv);
fill_lisf(lisf,x);
// The next few statements just find the best weights with which to approximate
// the dec_funct object with the smaller set of vectors in the lisf dictionary. This
// is really just a simple application of some linear algebra. For the details
// see page 554 of Learning with kernels by Scholkopf and Smola where they talk
// about "Optimal Expansion Coefficients."
const kernel_type kern(dec_funct.kernel_function);
matrix<scalar_type,0,1,mem_manager_type> beta;
// Now we compute the fist approximate decision function.
beta = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha);
matrix<sample_type,0,1,mem_manager_type> out_vectors(lisf.get_dictionary());
distance_function<kernel_type> approx(ones_matrix<scalar_type>(lisf.size(),1), kern, vector_to_matrix(lisf));
const distance_function<kernel_type> target = dec_funct;
// Now setup to do a global optimization of all the parameters in the approximate
// decision function.
const objective obj(dec_funct, beta, out_vectors);
const objective_derivative obj_der(dec_funct, beta, out_vectors);
matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
// perform the actual optimization
find_min(lbfgs_search_strategy(20),
objective_delta_stop_strategy(eps),
obj, obj_der, opt_starting_point, 0);
// now make sure that the final optimized value is loaded into the beta and
// out_vectors matrices
obj.vector_to_state(opt_starting_point);
// Do a final reoptimization of beta just to make sure it is optimal given the new
// set of basis vectors.
beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,dec_funct.basis_vectors)*dec_funct.alpha);
// It is possible that some of the beta weights will be very close to zero. Lets remove
// the basis vectors with these essentially zero weights.
const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
for (long i = 0; i < beta.size(); ++i)
{
// if beta(i) is zero
if (std::abs(beta(i)) < eps)
{
beta = remove_row(beta, i);
out_vectors = remove_row(out_vectors, i);
--i;
}
}
approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, approx);
decision_function<kernel_type> new_df(beta,
decision_function<kernel_type> new_df(approx.get_alpha(),
0,
kern,
out_vectors);
approx.get_basis_vectors());
// now we have to figure out what the new bias should be. It might be a little
// different since we just messed with all the weights and vectors.
......
......@@ -106,6 +106,42 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K,
typename stop_strategy_type
>
distance_function<K> approximate_distance_function (
stop_strategy_type stop_strategy,
const distance_function<K>& target,
const distance_function<K>& starting_point
);
/*!
requires
- stop_strategy == an object that defines a stop strategy such as one of
  the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- target.get_basis_vectors().size() > 0 && starting_point.get_basis_vectors().size() > 0
  (i.e. target and starting_point have to have some basis vectors in them)
- target.get_kernel() == starting_point.get_kernel()
  (i.e. both distance functions must use the same kernel)
- kernel_derivative<K> is defined
  (i.e. The analytic derivative for the given kernel must be defined)
- K::sample_type must be a dlib::matrix object and the basis_vectors inside the
  distance_functions must be column vectors.
ensures
- This function attempts to find a distance function object which is close
  to the given target.  That is, it searches for an X such that target(X)
  (i.e. the distance, in kernel induced feature space, between target and X)
  is minimized.  The optimization begins with the initial guess contained in
  starting_point and searches for an X which locally minimizes target(X).  Since
  this problem can have many local minima the quality of the starting point
  can significantly influence the results.
- Only the basis vectors of starting_point are used as the initial guess.  The
  weights of the approximation are computed by this function.
- The returned distance_function will contain at most as many basis vectors
  as the given starting_point object.  Basis vectors whose optimized weights
  are essentially zero are removed from the result, but at least one basis
  vector is always kept.
- The optimization is carried out until the stop_strategy indicates it
  should stop.
!*/
// ----------------------------------------------------------------------------------------
template <
......@@ -179,6 +215,9 @@ namespace dlib
const in_scalar_vector_type& y
) const;
/*!
requires
- x must be a list of objects which are each some kind of dlib::matrix
which represents column or row vectors.
ensures
- trains a decision_function using the trainer that was supplied to
this object's constructor and then finds a reduced representation
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment