Commit 82fb3682 authored by Davis King's avatar Davis King

Refactored the code in the reduced_decision_function_trainer2. Part of it has been turned into

a global function called approximate_distance_function() which performs the main optimization.  The
reduced_decision_function_trainer2 now depends on this global function. This change makes this
function optimizer available for other purposes besides use in the reduced_decision_function_trainer2
object.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404125
parent 543e289f
...@@ -144,94 +144,35 @@ namespace dlib ...@@ -144,94 +144,35 @@ namespace dlib
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
namespace red_impl
template <
typename trainer_type
>
class reduced_decision_function_trainer2
{ {
public:
typedef typename trainer_type::kernel_type kernel_type;
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef typename trainer_type::trained_function_type trained_function_type;
reduced_decision_function_trainer2 () : num_bv(0) {}
reduced_decision_function_trainer2 (
const trainer_type& trainer_,
const long num_sb_,
const double eps_ = 1e-3
) :
trainer(trainer_),
num_bv(num_sb_),
eps(eps_)
{
COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0 && eps > 0,
"\t reduced_decision_function_trainer2()"
<< "\n\t you have given invalid arguments to this function"
<< "\n\t num_bv: " << num_bv
<< "\n\t eps: " << eps
);
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0,
"\t reduced_decision_function_trainer2::train(x,y)"
<< "\n\t You have tried to use an uninitialized version of this object"
<< "\n\t num_bv: " << num_bv );
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
private:
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
template <typename kernel_type>
class objective class objective
{ {
/* /*
This object represents the objective function we will try to This object represents the objective function we will try to
minimize in the final stage of this reduced set method. minimize in approximate_distance_function().
The objective is the distance, in kernel induced feature space, between The objective is the distance, in kernel induced feature space, between
the original decision function and the approximated version. the original distance function and the approximated version.
*/ */
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
public: public:
objective( objective(
const decision_function<kernel_type>& dec_funct_, const distance_function<kernel_type>& dist_funct_,
matrix<scalar_type,0,1,mem_manager_type>& b_, matrix<scalar_type,0,1,mem_manager_type>& b_,
matrix<sample_type,0,1,mem_manager_type>& out_vectors_ matrix<sample_type,0,1,mem_manager_type>& out_vectors_
) : ) :
dec_funct(dec_funct_), dist_funct(dist_funct_),
b(b_), b(b_),
out_vectors(out_vectors_) out_vectors(out_vectors_)
{ {
const kernel_type k(dec_funct.kernel_function);
// here we compute a term in the objective function that is a constant. So
// do it in the constructor so we don't have to recompute it every time
// the objective is evaluated.
bias = 0;
for (long i = 0; i < dec_funct.alpha.size(); ++i)
{
for (long j = 0; j < dec_funct.alpha.size(); ++j)
{
bias += dec_funct.alpha(i)*dec_funct.alpha(j)*
k(dec_funct.basis_vectors(i), dec_funct.basis_vectors(j));
}
}
} }
const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector ( const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector (
...@@ -239,7 +180,7 @@ namespace dlib ...@@ -239,7 +180,7 @@ namespace dlib
/*! /*!
ensures ensures
- returns a vector that contains all the information necessary to - returns a vector that contains all the information necessary to
reproduce the current state of the approximated decision function reproduce the current state of the approximated distance function
!*/ !*/
{ {
matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr()); matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr());
...@@ -270,7 +211,7 @@ namespace dlib ...@@ -270,7 +211,7 @@ namespace dlib
- z came from the state_to_vector() function or has a compatible format - z came from the state_to_vector() function or has a compatible format
ensures ensures
- loads the vector z into the state variables of the approximate - loads the vector z into the state variables of the approximate
decision function (i.e. b and out_vectors) distance function (i.e. b and out_vectors)
!*/ !*/
{ {
long i = 0; long i = 0;
...@@ -295,20 +236,20 @@ namespace dlib ...@@ -295,20 +236,20 @@ namespace dlib
) const ) const
/*! /*!
ensures ensures
- loads the current approximate decision function with z - loads the current approximate distance function with z
- returns the distance between the original decision function - returns the distance between the original distance function
and the approximate one. and the approximate one.
!*/ !*/
{ {
vector_to_state(z); vector_to_state(z);
const kernel_type k(dec_funct.kernel_function); const kernel_type k(dist_funct.get_kernel());
double temp = 0; double temp = 0;
for (long i = 0; i < out_vectors.size(); ++i) for (long i = 0; i < out_vectors.size(); ++i)
{ {
for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j) for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
{ {
temp -= b(i)*dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j)); temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
} }
} }
...@@ -322,14 +263,12 @@ namespace dlib ...@@ -322,14 +263,12 @@ namespace dlib
} }
} }
return temp + bias; return temp + dist_funct.get_squared_norm();
} }
private: private:
scalar_type bias; const distance_function<kernel_type>& dist_funct;
const decision_function<kernel_type>& dec_funct;
matrix<scalar_type,0,1,mem_manager_type>& b; matrix<scalar_type,0,1,mem_manager_type>& b;
matrix<sample_type,0,1,mem_manager_type>& out_vectors; matrix<sample_type,0,1,mem_manager_type>& out_vectors;
...@@ -337,20 +276,24 @@ namespace dlib ...@@ -337,20 +276,24 @@ namespace dlib
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
template <typename kernel_type>
class objective_derivative class objective_derivative
{ {
/*! /*!
This object represents the derivative of the objective object This object represents the derivative of the objective object
!*/ !*/
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
public: public:
objective_derivative( objective_derivative(
const decision_function<kernel_type>& dec_funct_, const distance_function<kernel_type>& dist_funct_,
matrix<scalar_type,0,1,mem_manager_type>& b_, matrix<scalar_type,0,1,mem_manager_type>& b_,
matrix<sample_type,0,1,mem_manager_type>& out_vectors_ matrix<sample_type,0,1,mem_manager_type>& out_vectors_
) : ) :
dec_funct(dec_funct_), dist_funct(dist_funct_),
b(b_), b(b_),
out_vectors(out_vectors_) out_vectors(out_vectors_)
{ {
...@@ -364,7 +307,7 @@ namespace dlib ...@@ -364,7 +307,7 @@ namespace dlib
- z came from the state_to_vector() function or has a compatible format - z came from the state_to_vector() function or has a compatible format
ensures ensures
- loads the vector z into the state variables of the approximate - loads the vector z into the state variables of the approximate
decision function (i.e. b and out_vectors) distance function (i.e. b and out_vectors)
!*/ !*/
{ {
long i = 0; long i = 0;
...@@ -389,15 +332,15 @@ namespace dlib ...@@ -389,15 +332,15 @@ namespace dlib
) const ) const
/*! /*!
ensures ensures
- loads the current approximate decision function with z - loads the current approximate distance function with z
- returns the derivative of the distance between the original - returns the derivative of the distance between the original
decision function and the approximate one. distance function and the approximate one.
!*/ !*/
{ {
vector_to_state(z); vector_to_state(z);
res.set_size(z.nr()); res.set_size(z.nr());
set_all_elements(res,0); set_all_elements(res,0);
const kernel_type k(dec_funct.kernel_function); const kernel_type k(dist_funct.get_kernel());
const kernel_derivative<kernel_type> K_der(k); const kernel_derivative<kernel_type> K_der(k);
// first compute the gradient for the beta weights // first compute the gradient for the beta weights
...@@ -410,9 +353,9 @@ namespace dlib ...@@ -410,9 +353,9 @@ namespace dlib
} }
for (long i = 0; i < out_vectors.size(); ++i) for (long i = 0; i < out_vectors.size(); ++i)
{ {
for (long j = 0; j < dec_funct.basis_vectors.size(); ++j) for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j)
{ {
res(i) -= dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j)); res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
} }
} }
...@@ -428,12 +371,12 @@ namespace dlib ...@@ -428,12 +371,12 @@ namespace dlib
{ {
temp += b(j)*K_der(out_vectors(j), out_vectors(i)); temp += b(j)*K_der(out_vectors(j), out_vectors(i));
} }
for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j) for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
{ {
temp -= dec_funct.alpha(j)*K_der(dec_funct.basis_vectors(j), out_vectors(i) ); temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) );
} }
// store the gradient for out_vectors[i] into result in the proper spot // store the gradient for out_vectors(i) into result in the proper spot
set_subm(res,pos,0,num,1) = b(i)*temp; set_subm(res,pos,0,num,1) = b(i)*temp;
pos += num; pos += num;
} }
...@@ -448,7 +391,7 @@ namespace dlib ...@@ -448,7 +391,7 @@ namespace dlib
mutable matrix<scalar_type, 0, 1, mem_manager_type> res; mutable matrix<scalar_type, 0, 1, mem_manager_type> res;
mutable sample_type temp; mutable sample_type temp;
const decision_function<kernel_type>& dec_funct; const distance_function<kernel_type>& dist_funct;
matrix<scalar_type,0,1,mem_manager_type>& b; matrix<scalar_type,0,1,mem_manager_type>& b;
matrix<sample_type,0,1,mem_manager_type>& out_vectors; matrix<sample_type,0,1,mem_manager_type>& out_vectors;
...@@ -456,6 +399,142 @@ namespace dlib ...@@ -456,6 +399,142 @@ namespace dlib
// ------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------
}
template <
    typename K,
    typename stop_strategy_type
    >
distance_function<K> approximate_distance_function (
    stop_strategy_type stop_strategy,
    const distance_function<K>& target,
    const distance_function<K>& starting_point
)
/*
    Builds a distance_function that approximates target, using the basis
    vectors of starting_point as the initial guess.  The work happens in
    four steps:
        1. solve for the best weights given the starting basis vectors
        2. jointly optimize the weights and the basis vectors with L-BFGS
        3. re-solve for the best weights given the optimized basis vectors
        4. drop basis vectors whose weights came out essentially zero
*/
{
    // make sure requires clause is not broken
    DLIB_ASSERT(target.get_basis_vectors().size() > 0 &&
                starting_point.get_basis_vectors().size() > 0 &&
                target.get_kernel() == starting_point.get_kernel(),
                "\t distance_function approximate_distance_function()"
                << "\n\t Invalid inputs were given to this function."
                << "\n\t target.get_basis_vectors().size(): " << target.get_basis_vectors().size()
                << "\n\t starting_point.get_basis_vectors().size(): " << starting_point.get_basis_vectors().size()
                << "\n\t target.kernel_function == starting_point.kernel_function: " << (target.get_kernel() == starting_point.get_kernel())
    );

    using namespace red_impl;

    typedef typename K::scalar_type scalar_type;
    typedef typename K::sample_type sample_type;
    typedef typename K::mem_manager_type mem_manager_type;
    typedef matrix<scalar_type,0,1,mem_manager_type> scalar_column;
    typedef matrix<sample_type,0,1,mem_manager_type> sample_column;

    const K kern(target.get_kernel());

    // Step 1: compute the first approximate distance function.  With the basis
    // vectors held fixed, the best weights follow from a bit of linear algebra.
    // See the discussion of "Optimal Expansion Coefficients" on page 554 of
    // Learning with kernels by Scholkopf and Smola.
    scalar_column weights;
    weights = pinv(kernel_matrix(kern,starting_point.get_basis_vectors())) *
        (kernel_matrix(kern,starting_point.get_basis_vectors(),target.get_basis_vectors())*target.get_alpha());
    sample_column basis(starting_point.get_basis_vectors());

    // Step 2: optimize every parameter of the approximation at once (i.e. both
    // the weights and the basis vectors together).  The two functors are handed
    // weights and basis so that they can load candidate states into them.
    const objective<K> dist_obj(target, weights, basis);
    const objective_derivative<K> dist_obj_der(target, weights, basis);
    scalar_column state(dist_obj.state_to_vector());

    find_min(lbfgs_search_strategy(20),
             stop_strategy,
             dist_obj, dist_obj_der, state, 0);

    // Load the final optimized state back into weights and basis.
    dist_obj.vector_to_state(state);

    // Step 3: the basis vectors have moved, so solve for the weights one more
    // time to be sure they are optimal for the new basis.
    weights = pinv(kernel_matrix(kern,basis))*(kernel_matrix(kern,basis,target.get_basis_vectors())*target.get_alpha());

    // Step 4: discard basis vectors whose weights are essentially zero, always
    // keeping at least one.  The index only advances when nothing was removed
    // because a removal shifts the next candidate into the current slot.
    const scalar_type zero_tol = max(abs(weights))*std::numeric_limits<scalar_type>::epsilon();
    long idx = 0;
    while (idx < weights.size())
    {
        if (std::abs(weights(idx)) < zero_tol && weights.size() > 1)
        {
            weights = remove_row(weights, idx);
            basis = remove_row(basis, idx);
        }
        else
        {
            ++idx;
        }
    }

    return distance_function<K>(weights, kern, basis);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename trainer_type
>
class reduced_decision_function_trainer2
{
public:
typedef typename trainer_type::kernel_type kernel_type;
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef typename trainer_type::trained_function_type trained_function_type;
reduced_decision_function_trainer2 () : num_bv(0) {}
reduced_decision_function_trainer2 (
const trainer_type& trainer_,
const long num_sb_,
const double eps_ = 1e-3
) :
trainer(trainer_),
num_bv(num_sb_),
eps(eps_)
{
COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0 && eps > 0,
"\t reduced_decision_function_trainer2()"
<< "\n\t you have given invalid arguments to this function"
<< "\n\t num_bv: " << num_bv
<< "\n\t eps: " << eps
);
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(num_bv > 0,
"\t reduced_decision_function_trainer2::train(x,y)"
<< "\n\t You have tried to use an uninitialized version of this object"
<< "\n\t num_bv: " << num_bv );
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
private:
template < template <
typename in_sample_vector_type, typename in_sample_vector_type,
typename in_scalar_vector_type typename in_scalar_vector_type
...@@ -467,64 +546,21 @@ namespace dlib ...@@ -467,64 +546,21 @@ namespace dlib
{ {
// get the decision function object we are going to try and approximate // get the decision function object we are going to try and approximate
const decision_function<kernel_type>& dec_funct = trainer.train(x,y); const decision_function<kernel_type>& dec_funct = trainer.train(x,y);
const kernel_type kern(dec_funct.kernel_function);
// now find a linearly independent subset of the training points of num_bv points. // now find a linearly independent subset of the training points of num_bv points.
linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv); linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv);
fill_lisf(lisf,x); fill_lisf(lisf,x);
// The next few statements just find the best weights with which to approximate distance_function<kernel_type> approx(ones_matrix<scalar_type>(lisf.size(),1), kern, vector_to_matrix(lisf));
// the dec_funct object with the smaller set of vectors in the lisf dictionary. This const distance_function<kernel_type> target = dec_funct;
// is really just a simple application of some linear algebra. For the details
// see page 554 of Learning with kernels by Scholkopf and Smola where they talk
// about "Optimal Expansion Coefficients."
const kernel_type kern(dec_funct.kernel_function);
matrix<scalar_type,0,1,mem_manager_type> beta;
// Now we compute the fist approximate decision function.
beta = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha);
matrix<sample_type,0,1,mem_manager_type> out_vectors(lisf.get_dictionary());
// Now setup to do a global optimization of all the parameters in the approximate approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, approx);
// decision function.
const objective obj(dec_funct, beta, out_vectors);
const objective_derivative obj_der(dec_funct, beta, out_vectors);
matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
// perform the actual optimization
find_min(lbfgs_search_strategy(20),
objective_delta_stop_strategy(eps),
obj, obj_der, opt_starting_point, 0);
// now make sure that the final optimized value is loaded into the beta and
// out_vectors matrices
obj.vector_to_state(opt_starting_point);
// Do a final reoptimization of beta just to make sure it is optimal given the new
// set of basis vectors.
beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,dec_funct.basis_vectors)*dec_funct.alpha);
// It is possible that some of the beta weights will be very close to zero. Lets remove
// the basis vectors with these essentially zero weights.
const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
for (long i = 0; i < beta.size(); ++i)
{
// if beta(i) is zero
if (std::abs(beta(i)) < eps)
{
beta = remove_row(beta, i);
out_vectors = remove_row(out_vectors, i);
--i;
}
}
decision_function<kernel_type> new_df(beta, decision_function<kernel_type> new_df(approx.get_alpha(),
0, 0,
kern, kern,
out_vectors); approx.get_basis_vectors());
// now we have to figure out what the new bias should be. It might be a little // now we have to figure out what the new bias should be. It might be a little
// different since we just messed with all the weights and vectors. // different since we just messed with all the weights and vectors.
......
...@@ -106,6 +106,42 @@ namespace dlib ...@@ -106,6 +106,42 @@ namespace dlib
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K,
typename stop_strategy_type
>
distance_function<K> approximate_distance_function (
stop_strategy_type stop_strategy,
const distance_function<K>& target,
const distance_function<K>& starting_point
);
/*!
requires
- stop_strategy == an object that defines a stop strategy such as one of
the objects from dlib/optimization/optimization_stop_strategies_abstract.h
- target.get_basis_vectors().size() > 0 && starting_point.get_basis_vectors().size() > 0
(i.e. target and starting_point have to have some basis vectors in them)
- target.get_kernel() == starting_point.get_kernel()
(i.e. both distance functions must use the same kernel)
- kernel_derivative<K> is defined
(i.e. The analytic derivative for the given kernel must be defined)
- K::sample_type must be a dlib::matrix object and the basis_vectors inside the
distance_functions must be column vectors.
ensures
- This function attempts to find a distance function object which is close
to the given target. That is, it searches for an X such that target(X) is
minimized. The optimization begins with the initial guess contained in
starting_point and searches for an X which locally minimizes target(X). Since
this problem can have many local minima the quality of the starting point
can significantly influence the results.
- The returned distance_function will contain at most as many basis vectors
as the given starting_point object. Basis vectors whose final weights are
essentially zero are removed, but at least one basis vector is always kept.
- The optimization is carried out until the stop_strategy indicates it
should stop.
!*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
...@@ -179,6 +215,9 @@ namespace dlib ...@@ -179,6 +215,9 @@ namespace dlib
const in_scalar_vector_type& y const in_scalar_vector_type& y
) const; ) const;
/*! /*!
requires
- x must be a list of objects which are each some kind of dlib::matrix
which represents column or row vectors.
ensures ensures
- trains a decision_function using the trainer that was supplied to - trains a decision_function using the trainer that was supplied to
this object's constructor and then finds a reduced representation this object's constructor and then finds a reduced representation
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment