Commit 67f3f463 authored by Davis King

Refactored the krr_trainer into two objects: an rr_trainer, which just does
linear ridge regression, and the krr_trainer, which uses the empirical_kernel_map
to do non-linear ridge regression.  No changes were made to the behavior of the
krr_trainer.  This update just allows linear ridge regression to be used
without a superfluous empirical_kernel_map running at the beginning of the
training process.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404114
parent c729e2ea
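To make the intent concrete, here is a minimal usage sketch (not part of the commit) of the new rr_trainer. The trainer interface is the one shown in the diff below; the toy data, the chosen lambda value, and the assumption that including dlib/svm.h pulls in rr_trainer.h are mine.

#include <dlib/svm.h>
#include <iostream>
#include <vector>

int main()
{
    using namespace dlib;

    // rr_trainer only accepts dlib::linear_kernel (enforced by a COMPILE_TIME_ASSERT below).
    typedef matrix<double,0,1> sample_type;
    typedef linear_kernel<sample_type> kernel_type;

    // toy regression problem: y = 2*x + 1
    std::vector<sample_type> samples;
    std::vector<double> targets;
    for (int i = 0; i < 20; ++i)
    {
        sample_type s(1);
        s(0) = i;
        samples.push_back(s);
        targets.push_back(2.0*i + 1.0);
    }

    // Plain linear ridge regression; no empirical_kernel_map is built.
    rr_trainer<kernel_type> trainer;
    trainer.set_lambda(0.001);  // or leave at 0 to have lambda picked by LOO cross-validation
    decision_function<kernel_type> df = trainer.train(samples, targets);

    std::cout << "prediction at x = 5: " << df(samples[5]) << std::endl;
}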
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include "empirical_kernel_map.h" #include "empirical_kernel_map.h"
#include "linearly_independent_subset_finder.h" #include "linearly_independent_subset_finder.h"
#include "../statistics.h" #include "../statistics.h"
#include "rr_trainer.h"
#include "krr_trainer_abstract.h" #include "krr_trainer_abstract.h"
#include <vector> #include <vector>
#include <iostream> #include <iostream>
...@@ -31,43 +32,41 @@ namespace dlib ...@@ -31,43 +32,41 @@ namespace dlib
krr_trainer ( krr_trainer (
) : ) :
verbose(false), verbose(false),
use_regression_loss(true),
lambda(0),
max_basis_size(400), max_basis_size(400),
ekm_stale(true) ekm_stale(true)
{ {
// default lambda search list
lams = matrix_cast<scalar_type>(logspace(-9, 2, 50));
} }
void be_verbose ( void be_verbose (
) )
{ {
verbose = true; verbose = true;
trainer.be_verbose();
} }
void be_quiet ( void be_quiet (
) )
{ {
verbose = false; verbose = false;
trainer.be_quiet();
} }
void use_regression_loss_for_loo_cv ( void use_regression_loss_for_loo_cv (
) )
{ {
use_regression_loss = true; trainer.use_regression_loss_for_loo_cv();
} }
void use_classification_loss_for_loo_cv ( void use_classification_loss_for_loo_cv (
) )
{ {
use_regression_loss = false; trainer.use_classification_loss_for_loo_cv();
} }
bool will_use_regression_loss_for_loo_cv ( bool will_use_regression_loss_for_loo_cv (
) const ) const
{ {
return use_regression_loss; return trainer.will_use_regression_loss_for_loo_cv();
} }
const kernel_type get_kernel ( const kernel_type get_kernel (
...@@ -148,13 +147,13 @@ namespace dlib ...@@ -148,13 +147,13 @@ namespace dlib
<< "\n\t this: " << this << "\n\t this: " << this
); );
lambda = lambda_; trainer.set_lambda(lambda_);
} }
const scalar_type get_lambda ( const scalar_type get_lambda (
) const ) const
{ {
return lambda; return trainer.get_lambda();
} }
template <typename EXP> template <typename EXP>
...@@ -172,14 +171,13 @@ namespace dlib ...@@ -172,14 +171,13 @@ namespace dlib
<< "\n\t this: " << this << "\n\t this: " << this
); );
trainer.set_search_lambdas(lambdas);
lams = matrix_cast<scalar_type>(lambdas);
} }
const matrix<scalar_type,0,0,mem_manager_type>& get_search_lambdas ( const matrix<scalar_type,0,0,mem_manager_type>& get_search_lambdas (
) const ) const
{ {
return lams; return trainer.get_search_lambdas();
} }
template < template <
...@@ -304,210 +302,24 @@ namespace dlib ...@@ -304,210 +302,24 @@ namespace dlib
} }
} }
const long dims = ekm.out_vector_size();
if (verbose) if (verbose)
{ {
std::cout << "Mean EKM projection error: " << rs.mean() << std::endl; std::cout << "Mean EKM projection error: " << rs.mean() << std::endl;
std::cout << "Standard deviation of EKM projection error: " << rs.stddev() << std::endl; std::cout << "Standard deviation of EKM projection error: " << rs.stddev() << std::endl;
} }
/*
Notes on the solution of KRR
Let A = an proj_x.size() by ekm.out_vector_size() matrix which contains
all the projected data samples.
Let I = an identity matrix
Let C = trans(A)*A
Let L = trans(A)*y
Then the optimal w is given by:
w = inv(C + lambda*I) * L
There is a trick to compute leave one out cross validation results for many different
lambda values quickly. The following paper has a detailed discussion of various
approaches:
Notes on Regularized Least Squares by Ryan M. Rifkin and Ross A. Lippert.
In the implementation of the krr_trainer I'm only using two simple equations
from the above paper.
decision_function<linear_kernel<matrix<scalar_type,0,0,mem_manager_type> > > lin_df;
First note that inv(C + lambda*I) can be computed for many different lambda
values in an efficient way by using an eigen decomposition of C. So we use
the fact that:
inv(C + lambda*I) == V*inv(D + lambda*I)*trans(V)
where V*D*trans(V) == C
Also, via some simple linear algebra the above paper works out that the leave one out
value for a sample x(i) is equal to the following (we refer to proj_x(i) as x(i) for brevity):
Let G = inv(C + lambda*I)
let val = trans(x(i))*G*x(i);
leave one out value for sample x(i):
LOOV = (trans(w)*x(i) - y(i)*val) / (1 - val)
leave one out error for sample x(i):
LOOE = loss(y(i), LOOV)
Finally, note that we will pretend there was a 1 appended to the end of each
vector in proj_x. We won't actually do that though because we don't want to
have to make a copy of all the samples. So throughout the following code
I have explicitly dealt with this.
*/
general_matrix_type C, tempm, G;
column_matrix_type L, tempv, w;
// compute C and L
for (long i = 0; i < proj_x.size(); ++i)
{
C += proj_x(i)*trans(proj_x(i));
L += y(i)*proj_x(i);
tempv += proj_x(i);
}
// Make C = [C tempv
// tempv' proj_x.size()]
C = join_cols(join_rows(C, tempv),
join_rows(trans(tempv), uniform_matrix<scalar_type>(1,1, proj_x.size())));
L = join_cols(L, uniform_matrix<scalar_type>(1,1, sum(y)));
eigenvalue_decomposition<general_matrix_type> eig(make_symmetric(C));
const general_matrix_type V = eig.get_pseudo_v();
const column_matrix_type D = eig.get_real_eigenvalues();
// We can save some work by pre-multiplying the proj_x vectors by trans(V)
// and saving the result so we don't have to recompute it over and over later.
matrix<column_matrix_type,0,1,mem_manager_type > Vx;
if (lambda == 0 || output_looe)
{
// Save the transpose of V into a temporary because the subsequent matrix
// vector multiplies will be faster (because of better cache locality).
const general_matrix_type transV( colm(trans(V),range(0,dims-1)) );
// Remember the pretend 1 at the end of proj_x(*). We want to multiply trans(V)*proj_x(*)
// so to do this we pull the last column off trans(V) and store it separately.
const column_matrix_type lastV = colm(trans(V), dims);
Vx.set_size(proj_x.size());
for (long i = 0; i < proj_x.size(); ++i)
{
Vx(i) = transV*proj_x(i);
Vx(i) = squared(Vx(i) + lastV);
}
}
the_lambda = lambda;
// If we need to automatically select a lambda then do so using the LOOE trick described
// above.
if (lambda == 0)
{
best_looe = std::numeric_limits<scalar_type>::max();
// Compute leave one out errors for a bunch of different lambdas and pick the best one.
for (long idx = 0; idx < lams.size(); ++idx)
{
// first compute G
tempv = 1.0/(D + lams(idx));
tempm = scale_columns(V,tempv);
G = tempm*trans(V);
// compute the solution w for the current lambda
w = G*L;
// make w have the same length as the x_proj vectors.
const scalar_type b = w(dims);
w = colm(w,0,dims);
scalar_type looe = 0;
for (long i = 0; i < proj_x.size(); ++i)
{
// perform equivalent of: val = trans(proj_x(i))*G*proj_x(i);
const scalar_type val = dot(tempv, Vx(i));
const scalar_type temp = (1 - val);
scalar_type loov;
if (temp != 0)
loov = (trans(w)*proj_x(i) + b - y(i)*val) / temp;
else
loov = 0;
looe += loss(loov, y(i));
}
// Keep track of the lambda which gave the lowest looe. If two lambdas
// have the same looe then pick the biggest lambda.
if (looe < best_looe || (looe == best_looe && lams(idx) > the_lambda))
{
best_looe = looe;
the_lambda = lams(idx);
}
}
// mark that we saved the looe to best_looe already
output_looe = false;
best_looe /= proj_x.size();
if (verbose)
{
using namespace std;
cout << "Using lambda: " << the_lambda << endl;
cout << "LOO Error: " << best_looe << endl;
}
}
// Now perform the main training. That is, find w.
// first, compute G = inv(C + the_lambda*I)
tempv = 1.0/(D + the_lambda);
tempm = scale_columns(V,tempv);
G = tempm*trans(V);
w = G*L;
// make w have the same length as the x_proj vectors.
const scalar_type b = w(dims);
w = colm(w,0,dims);
// If we haven't done this already and we are supposed to then compute the LOO error rate for
// the current lambda and store the result in best_looe.
if (output_looe) if (output_looe)
{ lin_df = trainer.train(proj_x,y, best_looe, the_lambda);
best_looe = 0;
for (long i = 0; i < proj_x.size(); ++i)
{
// perform equivalent of: val = trans(proj_x(i))*G*proj_x(i);
const scalar_type val = dot(tempv, Vx(i));
const scalar_type temp = (1 - val);
scalar_type loov;
if (temp != 0)
loov = (trans(w)*proj_x(i) + b - y(i)*val) / temp;
else else
loov = 0; lin_df = trainer.train(proj_x,y);
best_looe += loss(loov, y(i));
}
best_looe /= proj_x.size();
if (verbose)
{
using namespace std;
cout << "Using lambda: " << the_lambda << endl;
cout << "LOO Error: " << best_looe << endl;
}
}
// convert w into a proper decision function // convert the linear decision function into a kernelized one.
decision_function<kernel_type> df; decision_function<kernel_type> df;
df = ekm.convert_to_decision_function(w); df = ekm.convert_to_decision_function(lin_df.basis_vectors(0));
df.b = -b; // don't forget about the bias we stuck onto all the vectors df.b = lin_df.b;
// If we used an automatically derived basis then there isn't any point in // If we used an automatically derived basis then there isn't any point in
// keeping the ekm around. So free its memory. // keeping the ekm around. So free its memory.
...@@ -519,25 +331,6 @@ namespace dlib ...@@ -519,25 +331,6 @@ namespace dlib
return df; return df;
} }
inline scalar_type loss (
const scalar_type& a,
const scalar_type& b
) const
{
if (use_regression_loss)
{
return (a-b)*(a-b);
}
else
{
// if a and b have the same sign then no loss
if (a*b >= 0)
return 0;
else
return 1;
}
}
/*! /*!
CONVENTION CONVENTION
...@@ -545,19 +338,19 @@ namespace dlib ...@@ -545,19 +338,19 @@ namespace dlib
- kern or basis have changed since the last time - kern or basis have changed since the last time
they were loaded into the ekm they were loaded into the ekm
- get_lambda() == lambda - get_lambda() == trainer.get_lambda()
- get_kernel() == kern - get_kernel() == kern
- get_max_basis_size() == max_basis_size - get_max_basis_size() == max_basis_size
- will_use_regression_loss_for_loo_cv() == use_regression_loss - will_use_regression_loss_for_loo_cv() == trainer.will_use_regression_loss_for_loo_cv()
- get_search_lambdas() == lams - get_search_lambdas() == trainer.get_search_lambdas()
- basis_loaded() == (basis.size() != 0) - basis_loaded() == (basis.size() != 0)
!*/ !*/
rr_trainer<linear_kernel<matrix<scalar_type,0,0,mem_manager_type> > > trainer;
bool verbose; bool verbose;
bool use_regression_loss;
scalar_type lambda;
kernel_type kern; kernel_type kern;
unsigned long max_basis_size; unsigned long max_basis_size;
...@@ -566,7 +359,6 @@ namespace dlib ...@@ -566,7 +359,6 @@ namespace dlib
mutable empirical_kernel_map<kernel_type> ekm; mutable empirical_kernel_map<kernel_type> ekm;
mutable bool ekm_stale; mutable bool ekm_stale;
matrix<scalar_type,0,0,mem_manager_type> lams;
}; };
} }
......
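For context (not part of the commit), the refactored krr_trainer::train above is essentially the following composition, written here as a hedged user-level sketch: project the samples with an empirical_kernel_map, run plain linear ridge regression on the projections, then convert the linear solution back into a kernelized decision_function. The ekm.load/project, rr_trainer, convert_to_decision_function, and lin_df.b steps are the ones visible in the diff; the toy data, kernel parameter, and includes are assumptions (krr_trainer itself also handles basis selection via linearly_independent_subset_finder, which is skipped here).

#include <dlib/svm.h>
#include <cmath>
#include <vector>

int main()
{
    using namespace dlib;

    typedef matrix<double,0,1> sample_type;
    typedef radial_basis_kernel<sample_type> kernel_type;
    typedef linear_kernel<matrix<double,0,1> > lin_kernel_type;

    // toy data: y = sin(x)
    std::vector<sample_type> samples;
    std::vector<double> targets;
    for (int i = 0; i < 30; ++i)
    {
        sample_type s(1);
        s(0) = i/10.0;
        samples.push_back(s);
        targets.push_back(std::sin(i/10.0));
    }

    // 1. build an empirical kernel map from the samples
    empirical_kernel_map<kernel_type> ekm;
    ekm.load(kernel_type(2.0), samples);

    // 2. project every sample into the EKM's feature space
    std::vector<matrix<double,0,1> > proj_x;
    for (unsigned long i = 0; i < samples.size(); ++i)
        proj_x.push_back(ekm.project(samples[i]));

    // 3. plain linear ridge regression on the projected samples
    rr_trainer<lin_kernel_type> trainer;
    decision_function<lin_kernel_type> lin_df = trainer.train(proj_x, targets);

    // 4. map the linear solution back into a kernelized decision function
    decision_function<kernel_type> df = ekm.convert_to_decision_function(lin_df.basis_vectors(0));
    df.b = lin_df.b;

    return 0;
}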
// Copyright (C) 2010 Davis E. King (davis@dlib.net) // Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license. // License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_KRR_TRAInER_H__ #ifndef DLIB_RR_TRAInER_H__
#define DLIB_KRR_TRAInER_H__ #define DLIB_RR_TRAInER_H__
#include "../algs.h" #include "../algs.h"
#include "function.h" #include "function.h"
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include "empirical_kernel_map.h" #include "empirical_kernel_map.h"
#include "linearly_independent_subset_finder.h" #include "linearly_independent_subset_finder.h"
#include "../statistics.h" #include "../statistics.h"
#include "krr_trainer_abstract.h" #include "rr_trainer_abstract.h"
#include <vector> #include <vector>
#include <iostream> #include <iostream>
...@@ -18,7 +18,7 @@ namespace dlib ...@@ -18,7 +18,7 @@ namespace dlib
template < template <
typename K typename K
> >
class krr_trainer class rr_trainer
{ {
public: public:
...@@ -28,13 +28,15 @@ namespace dlib ...@@ -28,13 +28,15 @@ namespace dlib
typedef typename kernel_type::mem_manager_type mem_manager_type; typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type; typedef decision_function<kernel_type> trained_function_type;
krr_trainer ( // You are getting a compiler error on this line because you supplied a non-linear or
// sparse kernel to the rr_trainer object. You have to use dlib::linear_kernel with this trainer.
COMPILE_TIME_ASSERT((is_same_type<K, linear_kernel<sample_type> >::value));
rr_trainer (
) : ) :
verbose(false), verbose(false),
use_regression_loss(true), use_regression_loss(true),
lambda(0), lambda(0)
max_basis_size(400),
ekm_stale(true)
{ {
// default lambda search list // default lambda search list
lams = matrix_cast<scalar_type>(logspace(-9, 2, 50)); lams = matrix_cast<scalar_type>(logspace(-9, 2, 50));
...@@ -73,67 +75,7 @@ namespace dlib ...@@ -73,67 +75,7 @@ namespace dlib
const kernel_type get_kernel ( const kernel_type get_kernel (
) const ) const
{ {
return kern; return kernel_type();
}
void set_kernel (
const kernel_type& k
)
{
kern = k;
}
template <typename T>
void set_basis (
const T& basis_samples
)
{
// make sure requires clause is not broken
DLIB_ASSERT(basis_samples.size() > 0 && is_vector(vector_to_matrix(basis_samples)),
"\tvoid krr_trainer::set_basis(basis_samples)"
<< "\n\t You have to give a non-empty set of basis_samples and it must be a vector"
<< "\n\t basis_samples.size(): " << basis_samples.size()
<< "\n\t is_vector(vector_to_matrix(basis_samples)): " << is_vector(vector_to_matrix(basis_samples))
<< "\n\t this: " << this
);
basis = vector_to_matrix(basis_samples);
ekm_stale = true;
}
bool basis_loaded (
) const
{
return (basis.size() != 0);
}
void clear_basis (
)
{
basis.set_size(0);
ekm.clear();
ekm_stale = true;
}
unsigned long get_max_basis_size (
) const
{
return max_basis_size;
}
void set_max_basis_size (
unsigned long max_basis_size_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(max_basis_size_ > 0,
"\t void krr_trainer::set_max_basis_size()"
<< "\n\t max_basis_size_ must be greater than 0"
<< "\n\t max_basis_size_: " << max_basis_size_
<< "\n\t this: " << this
);
max_basis_size = max_basis_size_;
} }
void set_lambda ( void set_lambda (
...@@ -142,7 +84,7 @@ namespace dlib ...@@ -142,7 +84,7 @@ namespace dlib
{ {
// make sure requires clause is not broken // make sure requires clause is not broken
DLIB_ASSERT(lambda_ >= 0, DLIB_ASSERT(lambda_ >= 0,
"\t void krr_trainer::set_lambda()" "\t void rr_trainer::set_lambda()"
<< "\n\t lambda must be greater than or equal to 0" << "\n\t lambda must be greater than or equal to 0"
<< "\n\t lambda: " << lambda << "\n\t lambda: " << lambda
<< "\n\t this: " << this << "\n\t this: " << this
...@@ -164,7 +106,7 @@ namespace dlib ...@@ -164,7 +106,7 @@ namespace dlib
{ {
// make sure requires clause is not broken // make sure requires clause is not broken
DLIB_ASSERT(is_vector(lambdas) && lambdas.size() > 0 && min(lambdas) > 0, DLIB_ASSERT(is_vector(lambdas) && lambdas.size() > 0 && min(lambdas) > 0,
"\t void krr_trainer::set_search_lambdas()" "\t void rr_trainer::set_search_lambdas()"
<< "\n\t lambdas must be a non-empty vector of values" << "\n\t lambdas must be a non-empty vector of values"
<< "\n\t is_vector(lambdas): " << is_vector(lambdas) << "\n\t is_vector(lambdas): " << is_vector(lambdas)
<< "\n\t lambdas.size(): " << lambdas.size() << "\n\t lambdas.size(): " << lambdas.size()
...@@ -240,7 +182,7 @@ namespace dlib ...@@ -240,7 +182,7 @@ namespace dlib
{ {
// make sure requires clause is not broken // make sure requires clause is not broken
DLIB_ASSERT(is_learning_problem(x,y), DLIB_ASSERT(is_learning_problem(x,y),
"\t decision_function krr_trainer::train(x,y)" "\t decision_function rr_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function" << "\n\t invalid inputs were given to this function"
<< "\n\t is_vector(x): " << is_vector(x) << "\n\t is_vector(x): " << is_vector(x)
<< "\n\t is_vector(y): " << is_vector(y) << "\n\t is_vector(y): " << is_vector(y)
...@@ -253,69 +195,21 @@ namespace dlib ...@@ -253,69 +195,21 @@ namespace dlib
{ {
// make sure requires clause is not broken // make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y), DLIB_ASSERT(is_binary_classification_problem(x,y),
"\t decision_function krr_trainer::train(x,y)" "\t decision_function rr_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function" << "\n\t invalid inputs were given to this function"
); );
} }
#endif #endif
// The first thing we do is make sure we have an appropriate ekm ready for use below.
if (basis_loaded())
{
if (ekm_stale)
{
ekm.load(kern, basis);
ekm_stale = false;
}
}
else
{
linearly_independent_subset_finder<kernel_type> lisf(kern, max_basis_size);
fill_lisf(lisf, x);
ekm.load(lisf);
}
if (verbose)
{
std::cout << "\nNumber of basis vectors used: " << ekm.out_vector_size() << std::endl;
}
typedef matrix<scalar_type,0,1,mem_manager_type> column_matrix_type; typedef matrix<scalar_type,0,1,mem_manager_type> column_matrix_type;
typedef matrix<scalar_type,0,0,mem_manager_type> general_matrix_type; typedef matrix<scalar_type,0,0,mem_manager_type> general_matrix_type;
running_stats<scalar_type> rs; const long dims = x(0).size();
// Now we project all the x samples into kernel space using our EKM
matrix<column_matrix_type,0,1,mem_manager_type > proj_x;
proj_x.set_size(x.size());
for (long i = 0; i < proj_x.size(); ++i)
{
scalar_type err;
// Note that we also append a 1 to the end of the vectors because this is
// a convenient way of dealing with the bias term later on.
if (verbose == false)
{
proj_x(i) = ekm.project(x(i));
}
else
{
proj_x(i) = ekm.project(x(i),err);
rs.add(err);
}
}
const long dims = ekm.out_vector_size();
if (verbose)
{
std::cout << "Mean EKM projection error: " << rs.mean() << std::endl;
std::cout << "Standard deviation of EKM projection error: " << rs.stddev() << std::endl;
}
/* /*
Notes on the solution of KRR Notes on the solution of ridge regression
Let A = an proj_x.size() by ekm.out_vector_size() matrix which contains Let A = an x.size() by dims matrix which contains
all the projected data samples. all the projected data samples.
Let I = an identity matrix Let I = an identity matrix
...@@ -333,7 +227,7 @@ namespace dlib ...@@ -333,7 +227,7 @@ namespace dlib
Notes on Regularized Least Squares by Ryan M. Rifkin and Ross A. Lippert. Notes on Regularized Least Squares by Ryan M. Rifkin and Ross A. Lippert.
In the implementation of the krr_trainer I'm only using two simple equations In the implementation of the rr_trainer I'm only using two simple equations
from the above paper. from the above paper.
...@@ -344,7 +238,7 @@ namespace dlib ...@@ -344,7 +238,7 @@ namespace dlib
where V*D*trans(V) == C where V*D*trans(V) == C
Also, via some simple linear algebra the above paper works out that the leave one out Also, via some simple linear algebra the above paper works out that the leave one out
value for a sample x(i) is equal to the following (we refer to proj_x(i) as x(i) for brevity): value for a sample x(i) is equal to the following (we refer to x(i) as x(i) for brevity):
Let G = inv(C + lambda*I) Let G = inv(C + lambda*I)
let val = trans(x(i))*G*x(i); let val = trans(x(i))*G*x(i);
...@@ -356,7 +250,7 @@ namespace dlib ...@@ -356,7 +250,7 @@ namespace dlib
Finally, note that we will pretend there was a 1 appended to the end of each Finally, note that we will pretend there was a 1 appended to the end of each
vector in proj_x. We won't actually do that though because we don't want to vector in x. We won't actually do that though because we don't want to
have to make a copy of all the samples. So throughout the following code have to make a copy of all the samples. So throughout the following code
I have explicitly dealt with this. I have explicitly dealt with this.
*/ */
...@@ -365,24 +259,25 @@ namespace dlib ...@@ -365,24 +259,25 @@ namespace dlib
column_matrix_type L, tempv, w; column_matrix_type L, tempv, w;
// compute C and L // compute C and L
for (long i = 0; i < proj_x.size(); ++i) for (long i = 0; i < x.size(); ++i)
{ {
C += proj_x(i)*trans(proj_x(i)); C += x(i)*trans(x(i));
L += y(i)*proj_x(i); L += y(i)*x(i);
tempv += proj_x(i); tempv += x(i);
} }
// Account for the extra 1 that we pretend is appended to x
// Make C = [C tempv // Make C = [C tempv
// tempv' proj_x.size()] // tempv' x.size()]
C = join_cols(join_rows(C, tempv), C = join_cols(join_rows(C, tempv),
join_rows(trans(tempv), uniform_matrix<scalar_type>(1,1, proj_x.size()))); join_rows(trans(tempv), uniform_matrix<scalar_type>(1,1, x.size())));
L = join_cols(L, uniform_matrix<scalar_type>(1,1, sum(y))); L = join_cols(L, uniform_matrix<scalar_type>(1,1, sum(y)));
eigenvalue_decomposition<general_matrix_type> eig(make_symmetric(C)); eigenvalue_decomposition<general_matrix_type> eig(make_symmetric(C));
const general_matrix_type V = eig.get_pseudo_v(); const general_matrix_type V = eig.get_pseudo_v();
const column_matrix_type D = eig.get_real_eigenvalues(); const column_matrix_type D = eig.get_real_eigenvalues();
// We can save some work by pre-multiplying the proj_x vectors by trans(V) // We can save some work by pre-multiplying the x vectors by trans(V)
// and saving the result so we don't have to recompute it over and over later. // and saving the result so we don't have to recompute it over and over later.
matrix<column_matrix_type,0,1,mem_manager_type > Vx; matrix<column_matrix_type,0,1,mem_manager_type > Vx;
if (lambda == 0 || output_looe) if (lambda == 0 || output_looe)
...@@ -390,13 +285,13 @@ namespace dlib ...@@ -390,13 +285,13 @@ namespace dlib
// Save the transpose of V into a temporary because the subsequent matrix // Save the transpose of V into a temporary because the subsequent matrix
// vector multiplies will be faster (because of better cache locality). // vector multiplies will be faster (because of better cache locality).
const general_matrix_type transV( colm(trans(V),range(0,dims-1)) ); const general_matrix_type transV( colm(trans(V),range(0,dims-1)) );
// Remember the pretend 1 at the end of proj_x(*). We want to multiply trans(V)*proj_x(*) // Remember the pretend 1 at the end of x(*). We want to multiply trans(V)*x(*)
// so to do this we pull the last column off trans(V) and store it separately. // so to do this we pull the last column off trans(V) and store it separately.
const column_matrix_type lastV = colm(trans(V), dims); const column_matrix_type lastV = colm(trans(V), dims);
Vx.set_size(proj_x.size()); Vx.set_size(x.size());
for (long i = 0; i < proj_x.size(); ++i) for (long i = 0; i < x.size(); ++i)
{ {
Vx(i) = transV*proj_x(i); Vx(i) = transV*x(i);
Vx(i) = squared(Vx(i) + lastV); Vx(i) = squared(Vx(i) + lastV);
} }
} }
...@@ -425,14 +320,14 @@ namespace dlib ...@@ -425,14 +320,14 @@ namespace dlib
w = colm(w,0,dims); w = colm(w,0,dims);
scalar_type looe = 0; scalar_type looe = 0;
for (long i = 0; i < proj_x.size(); ++i) for (long i = 0; i < x.size(); ++i)
{ {
// perform equivalent of: val = trans(proj_x(i))*G*proj_x(i); // perform equivalent of: val = trans(x(i))*G*x(i);
const scalar_type val = dot(tempv, Vx(i)); const scalar_type val = dot(tempv, Vx(i));
const scalar_type temp = (1 - val); const scalar_type temp = (1 - val);
scalar_type loov; scalar_type loov;
if (temp != 0) if (temp != 0)
loov = (trans(w)*proj_x(i) + b - y(i)*val) / temp; loov = (trans(w)*x(i) + b - y(i)*val) / temp;
else else
loov = 0; loov = 0;
...@@ -450,7 +345,7 @@ namespace dlib ...@@ -450,7 +345,7 @@ namespace dlib
// mark that we saved the looe to best_looe already // mark that we saved the looe to best_looe already
output_looe = false; output_looe = false;
best_looe /= proj_x.size(); best_looe /= x.size();
if (verbose) if (verbose)
{ {
...@@ -479,21 +374,21 @@ namespace dlib ...@@ -479,21 +374,21 @@ namespace dlib
if (output_looe) if (output_looe)
{ {
best_looe = 0; best_looe = 0;
for (long i = 0; i < proj_x.size(); ++i) for (long i = 0; i < x.size(); ++i)
{ {
// perform equivalent of: val = trans(proj_x(i))*G*proj_x(i); // perform equivalent of: val = trans(x(i))*G*x(i);
const scalar_type val = dot(tempv, Vx(i)); const scalar_type val = dot(tempv, Vx(i));
const scalar_type temp = (1 - val); const scalar_type temp = (1 - val);
scalar_type loov; scalar_type loov;
if (temp != 0) if (temp != 0)
loov = (trans(w)*proj_x(i) + b - y(i)*val) / temp; loov = (trans(w)*x(i) + b - y(i)*val) / temp;
else else
loov = 0; loov = 0;
best_looe += loss(loov, y(i)); best_looe += loss(loov, y(i));
} }
best_looe /= proj_x.size(); best_looe /= x.size();
if (verbose) if (verbose)
{ {
...@@ -506,16 +401,12 @@ namespace dlib ...@@ -506,16 +401,12 @@ namespace dlib
// convert w into a proper decision function // convert w into a proper decision function
decision_function<kernel_type> df; decision_function<kernel_type> df;
df = ekm.convert_to_decision_function(w); df.alpha.set_size(1);
df.alpha = 1;
df.basis_vectors.set_size(1);
df.basis_vectors(0) = w;
df.b = -b; // don't forget about the bias we stuck onto all the vectors df.b = -b; // don't forget about the bias we stuck onto all the vectors
// If we used an automatically derived basis then there isn't any point in
// keeping the ekm around. So free its memory.
if (basis_loaded() == false)
{
ekm.clear();
}
return df; return df;
} }
...@@ -541,17 +432,10 @@ namespace dlib ...@@ -541,17 +432,10 @@ namespace dlib
/*! /*!
CONVENTION CONVENTION
- if (ekm_stale) then
- kern or basis have changed since the last time
they were loaded into the ekm
- get_lambda() == lambda - get_lambda() == lambda
- get_kernel() == kern - get_kernel() == kernel_type()
- get_max_basis_size() == max_basis_size
- will_use_regression_loss_for_loo_cv() == use_regression_loss - will_use_regression_loss_for_loo_cv() == use_regression_loss
- get_search_lambdas() == lams - get_search_lambdas() == lams
- basis_loaded() == (basis.size() != 0)
!*/ !*/
bool verbose; bool verbose;
...@@ -559,18 +443,11 @@ namespace dlib ...@@ -559,18 +443,11 @@ namespace dlib
scalar_type lambda; scalar_type lambda;
kernel_type kern;
unsigned long max_basis_size;
matrix<sample_type,0,1,mem_manager_type> basis;
mutable empirical_kernel_map<kernel_type> ekm;
mutable bool ekm_stale;
matrix<scalar_type,0,0,mem_manager_type> lams; matrix<scalar_type,0,0,mem_manager_type> lams;
}; };
} }
#endif // DLIB_KRR_TRAInER_H__ #endif // DLIB_RR_TRAInER_H__
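Restated in conventional notation, the solution notes in rr_trainer.h above say the following (this is only a transcription of the comments in the diff, not new math). With $A$ the $n \times d$ matrix of bias-augmented samples, $C = A^\top A$ and $L = A^\top y$, the ridge solution is

$$ w = (C + \lambda I)^{-1} L. $$

Using the eigendecomposition $C = V D V^\top$,

$$ (C + \lambda I)^{-1} = V (D + \lambda I)^{-1} V^\top, $$

so $w$ can be recomputed cheaply for every candidate $\lambda$ in the search list. With $G = (C + \lambda I)^{-1}$ and $v_i = x_i^\top G\, x_i$, the leave-one-out prediction and error for sample $x_i$ are

$$ \mathrm{LOOV}_i = \frac{w^\top x_i - y_i v_i}{1 - v_i}, \qquad \mathrm{LOOE}_i = \mathrm{loss}(y_i, \mathrm{LOOV}_i), $$

exactly as computed in the loops above.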
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_RR_TRAInER_ABSTRACT_H__
#ifdef DLIB_RR_TRAInER_ABSTRACT_H__
#include "../algs.h"
#include "function_abstract.h"
namespace dlib
{
template <
typename K
>
class rr_trainer
{
/*!
REQUIREMENTS ON K
is the dlib::linear_kernel instantiated with some kind of column vector.
INITIAL VALUE
- get_lambda() == 0
- will_use_regression_loss_for_loo_cv() == true
- get_search_lambdas() == logspace(-9, 2, 50)
- this object will not be verbose unless be_verbose() is called
WHAT THIS OBJECT REPRESENTS
This object represents a tool for performing linear ridge regression
(This basic algorithm is also known by many other names, e.g. regularized
least squares or least squares SVM).
The exact definition of what this algorithm does is this:
Find w and b that minimize the following (x_i are input samples and y_i are target values):
lambda*dot(w,w) + sum_over_i( (f(x_i) - y_i)^2 )
where f(x) == dot(x,w) - b
So this algorithm is just regular old least squares regression but
with the addition of a regularization term which encourages small w.
It is capable of estimating the lambda parameter using leave-one-out cross-validation.
The leave-one-out cross-validation implementation is based on the techniques
discussed in this paper:
Notes on Regularized Least Squares by Ryan M. Rifkin and Ross A. Lippert.
!*/
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
rr_trainer (
);
/*!
ensures
- This object is properly initialized and ready to be used.
!*/
void be_verbose (
);
/*!
ensures
- This object will print status messages to standard out.
!*/
void be_quiet (
);
/*!
ensures
- this object will not print anything to standard out
!*/
const kernel_type get_kernel (
) const;
/*!
ensures
- returns a copy of the kernel function in use by this object
!*/
void set_lambda (
scalar_type lambda
);
/*!
requires
- lambda >= 0
ensures
- #get_lambda() == lambda
!*/
const scalar_type get_lambda (
) const;
/*!
ensures
- returns the regularization parameter. It is the parameter that
determines the trade off between trying to fit the training data
exactly or allowing more errors but hopefully improving the
generalization ability of the resulting function. Smaller values
encourage exact fitting while larger values of lambda may encourage
better generalization.
Note that a lambda of 0 has a special meaning. It indicates to this
object that it should automatically determine an appropriate lambda
value. This is done using leave-one-out cross-validation.
!*/
void use_regression_loss_for_loo_cv (
);
/*!
ensures
- #will_use_regression_loss_for_loo_cv() == true
!*/
void use_classification_loss_for_loo_cv (
);
/*!
ensures
- #will_use_regression_loss_for_loo_cv() == false
!*/
bool will_use_regression_loss_for_loo_cv (
) const;
/*!
ensures
- returns true if the automatic lambda estimation will attempt to estimate a lambda
appropriate for a regression task. Otherwise it will try and find one which
minimizes the number of classification errors.
!*/
template <typename EXP>
void set_search_lambdas (
const matrix_exp<EXP>& lambdas
);
/*!
requires
- is_vector(lambdas) == true
- lambdas.size() > 0
- min(lambdas) > 0
- lambdas must contain floating point numbers
ensures
- #get_search_lambdas() == lambdas
!*/
const matrix<scalar_type,0,0,mem_manager_type>& get_search_lambdas (
) const;
/*!
ensures
- returns a matrix M such that:
- is_vector(M) == true
- M == a list of all the lambda values which will be tried when performing
LOO cross-validation for determining the best lambda.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const;
/*!
requires
- x == a matrix or something convertible to a matrix via vector_to_matrix().
Also, x should contain sample_type objects.
- y == a matrix or something convertible to a matrix via vector_to_matrix().
Also, y should contain scalar_type objects.
- is_learning_problem(x,y) == true
- if (get_lambda() == 0 && will_use_regression_loss_for_loo_cv() == false) then
- is_binary_classification_problem(x,y) == true
(i.e. if you want this algorithm to estimate a lambda appropriate for
classification functions then you had better give a valid classification
problem)
ensures
- performs linear ridge regression given the training samples in x and target values in y.
- returns a decision_function F with the following properties:
- F(new_x) == predicted y value
- F.alpha.size() == 1
- F.basis_vectors.size() == 1
- F.alpha(0) == 1
- if (get_lambda() == 0) then
- This object will perform internal leave-one-out cross-validation to determine an
appropriate lambda automatically. It will compute the LOO error for each lambda
in get_search_lambdas() and select the best one.
- if (will_use_regression_loss_for_loo_cv()) then
- the lambda selected will be the one that minimizes the mean squared error.
- else
- the lambda selected will be the one that minimizes the number of classification
mistakes. We say a point is classified correctly if the output of the
decision_function has the same sign as its label.
- #get_lambda() == 0
(i.e. we don't change the get_lambda() value. If you want to know what the
automatically selected lambda value was then call the version of train()
defined below)
- else
- The user supplied value of get_lambda() will be used to perform the ridge regression.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
scalar_type& looe
) const;
/*!
requires
- all the requirements for train(x,y) must be satisfied
ensures
- returns train(x,y)
(i.e. executes train(x,y) and returns its result)
- if (will_use_regression_loss_for_loo_cv())
- #looe == the mean squared error as determined by leave-one-out
cross-validation.
- else
- #looe == the fraction of samples misclassified as determined by
leave-one-out cross-validation.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
scalar_type& looe,
scalar_type& lambda_used
) const;
/*!
requires
- all the requirements for train(x,y) must be satisfied
ensures
- returns train(x,y)
(i.e. executes train(x,y) and returns its result)
- if (will_use_regression_loss_for_loo_cv())
- #looe == the mean squared error as determined by leave-one-out
cross-validation.
- else
- #looe == the fraction of samples misclassified as determined by
leave-one-out cross-validation.
- #lambda_used == the value of lambda used to generate the
decision_function. Note that this lambda value is always
equal to get_lambda() if get_lambda() isn't 0.
!*/
};
}
#endif // DLIB_RR_TRAInER_ABSTRACT_H__
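Finally, a hedged sketch (again not from the commit) of the automatic lambda selection described above, using the four-argument train() overload documented in this abstract to read back the LOO error and the lambda that was chosen; the data and includes are placeholders.

#include <dlib/svm.h>
#include <iostream>
#include <vector>

int main()
{
    using namespace dlib;

    typedef matrix<double,0,1> sample_type;
    typedef linear_kernel<sample_type> kernel_type;

    // toy data: an exact linear function of two features
    std::vector<sample_type> x;
    std::vector<double> y;
    for (int i = 0; i < 50; ++i)
    {
        sample_type s(2);
        s(0) = i;
        s(1) = (i*i)/50.0;
        x.push_back(s);
        y.push_back(3*s(0) - 2*s(1) + 0.5);
    }

    rr_trainer<kernel_type> trainer;
    // get_lambda() == 0 by default, so train() searches get_search_lambdas()
    // with leave-one-out cross-validation and reports what it found.
    double looe = 0, lambda_used = 0;
    decision_function<kernel_type> df = trainer.train(x, y, looe, lambda_used);

    std::cout << "LOO mean squared error: " << looe << "\n"
              << "lambda selected:        " << lambda_used << "\n"
              << "prediction for first sample: " << df(x[0]) << std::endl;
}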