Commit de39ebdf authored by Davis King's avatar Davis King

Added the option to use the elastic net regularizer to the OCA solver.

parent 2889e520
......@@ -117,7 +117,21 @@ namespace dlib
) const
{
matrix_type empty_prior;
return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1);
return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
}
template <
typename matrix_type
>
typename matrix_type::type solve_with_elastic_net (
const oca_problem<matrix_type>& problem,
matrix_type& w,
double lasso_lambda,
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
) const
{
matrix_type empty_prior;
return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
}
template <
......@@ -141,7 +155,7 @@ namespace dlib
// disable the force weight to 1 option for this mode. We also disable the
// non-negative constraints.
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
return oca_impl(problem, w, prior, true, 0, force_weight_to_1);
return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
}
private:
......@@ -152,24 +166,37 @@ namespace dlib
typename matrix_type::type oca_impl (
const oca_problem<matrix_type>& problem,
matrix_type& w,
const matrix_type prior,
const matrix_type& prior,
bool have_prior,
unsigned long num_nonnegative,
unsigned long force_weight_to_1
unsigned long force_weight_to_1,
const double lasso_lambda
) const
{
const unsigned long num_dims = problem.get_num_dimensions();
// make sure requires clause is not broken
DLIB_ASSERT(problem.get_c() > 0 &&
problem.get_num_dimensions() > 0,
problem.get_num_dimensions() > 0 &&
0 <= lasso_lambda && lasso_lambda < 1,
"\t scalar_type oca::operator()"
<< "\n\t The oca_problem is invalid"
<< "\n\t problem.get_c(): " << problem.get_c()
<< "\n\t problem.get_num_dimensions(): " << num_dims
<< "\n\t lasso_lambda: " << lasso_lambda
<< "\n\t this: " << this
);
if (have_prior)
{
DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
}
else if (lasso_lambda != 0)
{
DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
}
const double ridge_lambda = 1-lasso_lambda;
if (num_nonnegative > num_dims)
num_nonnegative = num_dims;
......@@ -184,7 +211,7 @@ namespace dlib
typename sequence<vect_type>::kernel_2a planes;
std::vector<scalar_type> bs, miss_count;
vect_type new_plane, alpha;
vect_type new_plane, alpha, btemp;
w.set_size(num_dims, 1);
w = 0;
......@@ -198,6 +225,12 @@ namespace dlib
scalar_type cp_obj = 0;
matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
if (lasso_lambda != 0)
d.set_size(num_dims);
else
d.set_size(num_nonnegative);
d = lasso_lambda*ones_matrix(d);
scalar_type R_lower_bound;
if (problem.risk_has_lower_bound(R_lower_bound))
......@@ -253,7 +286,7 @@ namespace dlib
else
alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));
const scalar_type wnorm = 0.5*trans(w)*w;
const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
const double prior_part = have_prior? dot(w,prior) : 0;
cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;
......@@ -280,21 +313,36 @@ namespace dlib
// solve the cutting plane subproblem for the next w. We solve it to an
// accuracy that is related to how big the error gap is
scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*(cur_obj-cp_obj)) ;
// accuracy that is related to how big the error gap is. Also, we multiply
// by ridge_lambda because the objective function for the QP we solve was
// implicitly scaled by ridge_lambda. That is, we want to ask the QP
// solver to solve the problem until the duality gap is 0.1 times smaller
// than what it is now. So the factor of ridge_lambda is necessary to make
// this happen.
scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
// just a sanity check
if (eps < 1e-16)
eps = 1e-16;
// Note that we warm start this optimization by using the alpha from the last
// iteration as the starting point.
if (num_nonnegative != 0)
if (lasso_lambda != 0)
{
// copy planes into a matrix so we can call solve_qp4_using_smo()
matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
for (unsigned long i = 0; i < planes.size(); ++i)
set_colm(planes_mat,i) = planes[i];
btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda));
}
else if (num_nonnegative != 0)
{
// copy planes into a matrix so we can call solve_qp4_using_smo()
matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
for (unsigned long i = 0; i < planes.size(); ++i)
set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);
solve_qp4_using_smo(planes_mat, K, mat(bs), alpha, eps, sub_max_iter);
solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter);
}
else
{
......@@ -305,8 +353,9 @@ namespace dlib
w = -alpha(0)*planes[0];
for (unsigned long i = 1; i < planes.size(); ++i)
w -= alpha(i)*planes[i];
// threshold the first num_nonnegative w elements if necessary.
if (num_nonnegative != 0)
if (lasso_lambda != 0)
w = (lambda-d+w)/ridge_lambda;
else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);
for (long i = 0; i < alpha.size(); ++i)
......@@ -319,7 +368,7 @@ namespace dlib
// Compute the lower bound on the true objective given to us by the cutting
// plane subproblem.
cp_obj = -0.5*trans(w)*w + trans(alpha)*mat(bs);
cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
if (have_prior)
w += prior;
......
......@@ -31,6 +31,13 @@ namespace dlib
Where prior is a user supplied vector and R(w) has the same
interpretation as above.
Or it can use the elastic net regularizer:
Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
Where lasso_lambda is a number in the range [0, 1) and controls
trade-off between doing L2 and L2 regularization. R(w) has the same
interpretation as above.
Note that the stopping condition must be provided by the user
in the form of the optimization_status() function.
......@@ -142,6 +149,13 @@ namespace dlib
Where prior is a user supplied vector and R(w) has the same
interpretation as above.
Or it can use the elastic net regularizer:
Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
Where lasso_lambda is a number in the range [0, 1) and controls
trade-off between doing L2 and L2 regularization. R(w) has the same
interpretation as above.
For a detailed discussion you should consult the following papers
from the Journal of Machine Learning Research:
......@@ -221,6 +235,39 @@ namespace dlib
- returns the objective value at the solution #w
!*/
template <
typename matrix_type
>
typename matrix_type::type solve_with_elastic_net (
const oca_problem<matrix_type>& problem,
matrix_type& w,
scalar_type lasso_lambda,
unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
) const;
/*!
requires
- problem.get_c() > 0
- problem.get_num_dimensions() > 0
- 0 <= lasso_lambda < 1
ensures
- Solves the given oca problem and stores the solution in #w, but uses an
elastic net regularizer instead of the normal L2 regularizer. In
particular, this function solves:
Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
- The optimization algorithm runs until problem.optimization_status()
indicates it is time to stop.
- returns the objective value at the solution #w
- if (force_weight_to_1 < problem.get_num_dimensions()) then
- The optimizer enforces the following constraints:
- #w(force_weight_to_1) == 1
- for all i > force_weight_to_1:
- #w(i) == 0
- That is, the element in the weight vector at the index indicated
by force_weight_to_1 will have a value of 1 upon completion of
this function, while all subsequent elements of w will have
values of 0.
!*/
void set_subproblem_epsilon (
double eps
);
......
......@@ -215,15 +215,20 @@ namespace dlib
typename EXP1,
typename EXP2,
typename EXP3,
typename T, long NR, long NC, typename MM, typename L
typename EXP4,
typename T, long NR, long NC, typename MM, typename L,
long NR2, long NC2
>
unsigned long solve_qp4_using_smo (
const matrix_exp<EXP1>& A,
const matrix_exp<EXP2>& Q,
const matrix_exp<EXP3>& b,
const matrix_exp<EXP4>& d,
matrix<T,NR,NC,MM,L>& alpha,
matrix<T,NR2,NC2,MM,L>& lambda,
T eps,
unsigned long max_iter
unsigned long max_iter,
T max_lambda = std::numeric_limits<T>::infinity()
)
{
// make sure requires clause is not broken
......@@ -251,6 +256,15 @@ namespace dlib
<< "\n\t eps: " << eps
<< "\n\t max_iter: " << max_iter
);
DLIB_ASSERT(is_col_vector(d) == true &&
max_lambda >= 0 &&
d.size() == A.nr(),
"\t void solve_qp4_using_smo()"
<< "\n\t Invalid arguments were given to this function"
<< "\n\t A.nr(): " << A.nr()
<< "\n\t d.size(): " << d.size()
<< "\n\t max_lambda: " << max_lambda
);
const T C = sum(alpha);
......@@ -263,9 +277,14 @@ namespace dlib
solve_qp_using_smo() routine.
*/
const bool d_is_zero = d==zeros_matrix(d);
// compute optimal lambda for current alpha
matrix<T,NR,1,MM,L> lambda = A*alpha;
lambda = lowerbound(lambda, 0);
if (d_is_zero)
lambda = A*alpha;
else
lambda = A*alpha + d;
lambda = clamp(lambda, 0, max_lambda);
// Compute f'(alpha) (i.e. the gradient of f(alpha) with respect to alpha) for the current alpha.
matrix<T,NR,NC,MM,L> df = Q*alpha - b - trans(A)*lambda;
......@@ -308,8 +327,11 @@ namespace dlib
{
// compute optimal lambda and recheck the duality gap to make
// sure we have really converged.
lambda = A*alpha;
lambda = lowerbound(lambda, 0);
if (d_is_zero)
lambda = A*alpha;
else
lambda = A*alpha + d;
lambda = clamp(lambda, 0, max_lambda);
df = Q*alpha - b - trans(A)*lambda;
if (trans(alpha)*df - C*min(df) < eps)
......@@ -347,8 +369,11 @@ namespace dlib
if ((iter%300) == 299)
{
// compute the optimal lambda for the current alpha
lambda = A*alpha;
lambda = lowerbound(lambda, 0);
if (d_is_zero)
lambda = A*alpha;
else
lambda = A*alpha + d;
lambda = clamp(lambda, 0, max_lambda);
// Perform this form of the update every so often because doing so can help
// avoid the buildup of numerical errors you get with the alternate update
......
......@@ -58,46 +58,56 @@ namespace dlib
typename EXP1,
typename EXP2,
typename EXP3,
typename T, long NR, long NC, typename MM, typename L
typename T, long NR, long NC, typename MM, typename L,
long NR2, long NC2
>
unsigned long solve_qp4_using_smo (
const matrix_exp<EXP1>& A,
const matrix_exp<EXP2>& Q,
const matrix_exp<EXP3>& b,
const matrix_exp<EXP4>& d,
matrix<T,NR,NC,MM,L>& alpha,
matrix<T,NR2,NC2,MM,L>& lambda,
T eps,
unsigned long max_iter
unsigned long max_iter,
T max_lambda = std::numeric_limits<T>::infinity()
);
/*!
requires
- A.nc() == alpha.size()
- Q.nr() == Q.nc()
- is_col_vector(b) == true
- is_col_vector(d) == true
- is_col_vector(alpha) == true
- b.size() == alpha.size() == Q.nr()
- d.size() == A.nr()
- alpha.size() > 0
- min(alpha) >= 0
- eps > 0
- max_iter > 0
- max_lambda >= 0
ensures
- Let C == sum(alpha) (i.e. C is the sum of the alpha values you
supply to this function)
- This function solves the following quadratic program:
Minimize: f(alpha,lambda) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b +
0.5*trans(lambda)*lambda - trans(lambda)*A*alpha
0.5*trans(lambda)*lambda - trans(lambda)*A*alpha - trans(lambda)*d
subject to the following constraints:
- sum(alpha) == C (i.e. the sum of alpha values doesn't change)
- min(alpha) >= 0 (i.e. all alpha values are nonnegative)
- min(lambda) >= 0 (i.e. all lambda values are nonnegative)
- max(lambda) <= max_lambda (i.e. all lambda values are less than max_lambda)
Where f is convex. This means that Q should be positive-semidefinite.
- The solution to the above QP will be stored in #alpha. The optimal
lambda is not output since its value is given by the following expression:
lowerbound(A*alpha,0)
- If you don't want an upper limit on lambda then max_lambda can be set to
infinity.
- The solution to the above QP will be stored in #alpha and #lambda.
- This function uses a simple implementation of the sequential minimal
optimization algorithm. It starts the algorithm with the given alpha
and it works on the problem until the duality gap (i.e. how far away
we are from the optimum solution) is less than eps. So eps controls
how accurate the solution is and smaller values result in better solutions.
The initial value of lambda is ignored since the optimal lambda can be
obtained via a simple closed form expression given alpha.
- At most max_iter iterations of optimization will be performed.
- returns the number of iterations performed. If this method fails to
converge to eps accuracy then the number returned will be max_iter+1.
......
......@@ -72,6 +72,14 @@ namespace
dlog << LINFO << "error: "<< max(abs(w-true_w));
DLIB_TEST(max(abs(w-true_w)) < 1e-10);
solver.solve_with_elastic_net(make_oca_problem_c_svm<w_type>(2.0, 3.0, mat(x), mat(y), false, 1e-12, 40, max_index_plus_one(x)), w, 0.5);
dlog << LINFO << trans(w);
true_w = -0.5, 0.5, 0;
dlog << LINFO << "error: "<< max(abs(w-true_w));
DLIB_TEST(max(abs(w-true_w)) < 1e-10);
print_spinner();
w_type prior = true_w;
solver(make_oca_problem_c_svm<w_type>(20.0, 30.0, mat(x), mat(y), false, 1e-12, 40, max_index_plus_one(x)), w, prior);
dlog << LINFO << trans(w);
......
......@@ -99,10 +99,11 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(2), true_alpha(2);
matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
alpha = C/2, C/2;
d = 0;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
dlog << LINFO << "*******************************************************";
......@@ -136,10 +137,11 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(2), true_alpha(2);
matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
alpha = C/2, C/2;
d = 0;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
dlog << LINFO << "*******************************************************";
......@@ -173,10 +175,11 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(2), true_alpha(2);
matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
alpha = C/2, C/2;
d = 0;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
dlog << LINFO << "*******************************************************";
......@@ -211,10 +214,11 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(3), true_alpha(3);
matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
alpha = C/2, C/2, 0;
d = 0;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
......@@ -249,10 +253,10 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(2), true_alpha(2);
matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
alpha = C/2, C/2;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
dlog << LINFO << "*******************************************************";
......@@ -285,10 +289,10 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(3), true_alpha(3);
matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
alpha = C/2, C/2, 0;
solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
matrix<double,0,1> w = lowerbound(-A*alpha, 0);
dlog << LINFO << "*******************************************************";
......@@ -327,10 +331,11 @@ namespace
const double C = 2;
matrix<double,0,1> alpha(3), true_alpha(3);
matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
alpha = C/2, C/2, 0;
d = 0;
solve_qp4_using_smo(A, Q, b, alpha, 1e-9, 800);
solve_qp4_using_smo(A, Q, b, d, alpha, lambda, 1e-9, 800);
dlog << LINFO << "*******************************************************";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment