Added the option to use the elastic net regularizer to the OCA solver.

de39ebdf · Davis King · 2889e520 · de39ebdf · de39ebdf · de39ebdf
Commit de39ebdf authored Feb 22, 2016 by Davis King
6 changed files
--- a/dlib/optimization/optimization_oca.h
+++ b/dlib/optimization/optimization_oca.h
@@ -117,7 +117,21 @@ namespace dlib
        ) const
        {
            matrix_type empty_prior;
-            return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1);
+            return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
+        }
+
+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type solve_with_elastic_net (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            double lasso_lambda,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const
+        {
+            matrix_type empty_prior;
+            return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
        }

        template <
@@ -141,7 +155,7 @@ namespace dlib
            // disable the force weight to 1 option for this mode.  We also disable the
            // non-negative constraints.
            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
-            return oca_impl(problem, w, prior, true, 0, force_weight_to_1);
+            return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
        }

    private:
@@ -152,24 +166,37 @@ namespace dlib
        typename matrix_type::type oca_impl (
            const oca_problem<matrix_type>& problem,
            matrix_type& w,
-            const matrix_type prior,
+            const matrix_type& prior,
            bool have_prior,
            unsigned long num_nonnegative,
-            unsigned long force_weight_to_1
+            unsigned long force_weight_to_1,
+            const double lasso_lambda
        ) const
        {
            const unsigned long num_dims = problem.get_num_dimensions();

            // make sure requires clause is not broken
            DLIB_ASSERT(problem.get_c() > 0 &&
-                        problem.get_num_dimensions() > 0,
+                        problem.get_num_dimensions() > 0 && 
+                        0 <= lasso_lambda && lasso_lambda < 1,
                "\t scalar_type oca::operator()"
                << "\n\t The oca_problem is invalid"
                << "\n\t problem.get_c():              " << problem.get_c() 
                << "\n\t problem.get_num_dimensions(): " << num_dims 
+                << "\n\t lasso_lambda:                 " << lasso_lambda 
                << "\n\t this: " << this
                );
+            if (have_prior)
+            {
+                DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
+                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
+            }
+            else if (lasso_lambda != 0)
+            {
+                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
+            }

+            const double ridge_lambda = 1-lasso_lambda;

            if (num_nonnegative > num_dims)
                num_nonnegative = num_dims;
@@ -184,7 +211,7 @@ namespace dlib
            typename sequence<vect_type>::kernel_2a planes;
            std::vector<scalar_type> bs, miss_count;

-            vect_type new_plane, alpha;
+            vect_type new_plane, alpha, btemp;

            w.set_size(num_dims, 1);
            w = 0;
@@ -198,6 +225,12 @@ namespace dlib
            scalar_type cp_obj = 0;

            matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
+            matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
+            if (lasso_lambda != 0)
+                d.set_size(num_dims);
+            else
+                d.set_size(num_nonnegative);
+            d = lasso_lambda*ones_matrix(d);

            scalar_type R_lower_bound;
            if (problem.risk_has_lower_bound(R_lower_bound))
@@ -253,7 +286,7 @@ namespace dlib
                else
                    alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));

-                const scalar_type wnorm = 0.5*trans(w)*w;
+                const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
                const double prior_part = have_prior? dot(w,prior) : 0;
                cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;

@@ -280,21 +313,36 @@ namespace dlib


                // solve the cutting plane subproblem for the next w.   We solve it to an
-                // accuracy that is related to how big the error gap is
-                scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*(cur_obj-cp_obj)) ;
+                // accuracy that is related to how big the error gap is.  Also, we multiply
+                // by ridge_lambda because the objective function for the QP we solve was
+                // implicitly scaled by ridge_lambda.  That is, we want to ask the QP
+                // solver to solve the problem until the duality gap is 0.1 times smaller
+                // than what it is now.  So the factor of ridge_lambda is necessary to make
+                // this happen. 
+                scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
                // just a sanity check
                if (eps < 1e-16)
                    eps = 1e-16;
                // Note that we warm start this optimization by using the alpha from the last
                // iteration as the starting point.
-                if (num_nonnegative != 0)
+                if (lasso_lambda != 0)
+                {
+                    // copy planes into a matrix so we can call solve_qp4_using_smo()
+                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
+                    for (unsigned long i = 0; i < planes.size(); ++i)
+                        set_colm(planes_mat,i) = planes[i];
+
+                    btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
+                    solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda)); 
+                }
+                else if (num_nonnegative != 0)
                {
                    // copy planes into a matrix so we can call solve_qp4_using_smo()
                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
                    for (unsigned long i = 0; i < planes.size(); ++i)
                        set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);

-                    solve_qp4_using_smo(planes_mat, K, mat(bs), alpha, eps, sub_max_iter); 
+                    solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter); 
                }
                else
                {
@@ -305,8 +353,9 @@ namespace dlib
                w = -alpha(0)*planes[0];
                for (unsigned long i = 1; i < planes.size(); ++i)
                    w -= alpha(i)*planes[i];
-                // threshold the first num_nonnegative w elements if necessary.
-                if (num_nonnegative != 0)
+                if (lasso_lambda != 0)
+                    w = (lambda-d+w)/ridge_lambda;
+                else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
                    set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);

                for (long i = 0; i < alpha.size(); ++i)
@@ -319,7 +368,7 @@ namespace dlib

                // Compute the lower bound on the true objective given to us by the cutting 
                // plane subproblem.
-                cp_obj = -0.5*trans(w)*w + trans(alpha)*mat(bs);
+                cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
                if (have_prior)
                    w += prior;


--- a/dlib/optimization/optimization_oca_abstract.h
+++ b/dlib/optimization/optimization_oca_abstract.h
@@ -31,6 +31,13 @@ namespace dlib
                    Where prior is a user supplied vector and R(w) has the same
                    interpretation as above.
                       
+                Or it can use the elastic net regularizer:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                    Where lasso_lambda is a number in the range [0, 1) and controls
+                    trade-off between doing L2 and L2 regularization.  R(w) has the same
+                    interpretation as above.
+                       

                Note that the stopping condition must be provided by the user
                in the form of the optimization_status() function.
@@ -142,6 +149,13 @@ namespace dlib
                    Where prior is a user supplied vector and R(w) has the same
                    interpretation as above.
                       
+                Or it can use the elastic net regularizer:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+
+                    Where lasso_lambda is a number in the range [0, 1) and controls
+                    trade-off between doing L2 and L2 regularization.  R(w) has the same
+                    interpretation as above.
+                       

                For a detailed discussion you should consult the following papers
                from the Journal of Machine Learning Research:
@@ -221,6 +235,39 @@ namespace dlib
                - returns the objective value at the solution #w
        !*/

+        template <
+            typename matrix_type
+            >
+        typename matrix_type::type solve_with_elastic_net (
+            const oca_problem<matrix_type>& problem,
+            matrix_type& w,
+            scalar_type lasso_lambda,
+            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
+        ) const;
+        /*!
+            requires
+                - problem.get_c() > 0
+                - problem.get_num_dimensions() > 0
+                - 0 <= lasso_lambda < 1
+            ensures
+                - Solves the given oca problem and stores the solution in #w, but uses an
+                  elastic net regularizer instead of the normal L2 regularizer.  In
+                  particular, this function solves:
+                    Minimize: f(w) == 0.5*(1-lasso_lambda)*length_squared(w) + lasso_lambda*sum(abs(w)) + C*R(w)
+                - The optimization algorithm runs until problem.optimization_status() 
+                  indicates it is time to stop.
+                - returns the objective value at the solution #w
+                - if (force_weight_to_1 < problem.get_num_dimensions()) then
+                    - The optimizer enforces the following constraints:
+                        - #w(force_weight_to_1) == 1
+                        - for all i > force_weight_to_1:
+                            - #w(i) == 0 
+                        - That is, the element in the weight vector at the index indicated
+                          by force_weight_to_1 will have a value of 1 upon completion of
+                          this function, while all subsequent elements of w will have
+                          values of 0.
+        !*/
+
        void set_subproblem_epsilon (
            double eps
        ); 

--- a/dlib/optimization/optimization_solve_qp_using_smo.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo.h
@@ -215,15 +215,20 @@ namespace dlib
        typename EXP1,
        typename EXP2,
        typename EXP3,
-        typename T, long NR, long NC, typename MM, typename L
+        typename EXP4,
+        typename T, long NR, long NC, typename MM, typename L,
+        long NR2, long NC2
        >
    unsigned long solve_qp4_using_smo ( 
        const matrix_exp<EXP1>& A,
        const matrix_exp<EXP2>& Q,
        const matrix_exp<EXP3>& b,
+        const matrix_exp<EXP4>& d,
        matrix<T,NR,NC,MM,L>& alpha,
+        matrix<T,NR2,NC2,MM,L>& lambda,
        T eps,
-        unsigned long max_iter
+        unsigned long max_iter,
+        T max_lambda = std::numeric_limits<T>::infinity()
    )
    {
        // make sure requires clause is not broken
@@ -251,6 +256,15 @@ namespace dlib
                     << "\n\t eps:                  " << eps 
                     << "\n\t max_iter:             " << max_iter 
        );
+        DLIB_ASSERT(is_col_vector(d) == true &&
+                     max_lambda >= 0 &&
+                     d.size() == A.nr(),
+                     "\t void solve_qp4_using_smo()"
+                     << "\n\t Invalid arguments were given to this function"
+                     << "\n\t A.nr():     " << A.nr()
+                     << "\n\t d.size():   " << d.size()
+                     << "\n\t max_lambda: " << max_lambda
+        );

        const T C = sum(alpha);

@@ -263,9 +277,14 @@ namespace dlib
            solve_qp_using_smo() routine.  
        */

+        const bool d_is_zero = d==zeros_matrix(d);
+
        // compute optimal lambda for current alpha
-        matrix<T,NR,1,MM,L> lambda = A*alpha;
-        lambda = lowerbound(lambda, 0);
+        if (d_is_zero)
+            lambda = A*alpha;
+        else
+            lambda = A*alpha + d;
+        lambda = clamp(lambda, 0, max_lambda);

        // Compute f'(alpha) (i.e. the gradient of f(alpha) with respect to alpha) for the current alpha.  
        matrix<T,NR,NC,MM,L> df = Q*alpha - b - trans(A)*lambda;
@@ -308,8 +327,11 @@ namespace dlib
            {
                // compute optimal lambda and recheck the duality gap to make
                // sure we have really converged.
-                lambda = A*alpha;
-                lambda = lowerbound(lambda, 0);
+                if (d_is_zero)
+                    lambda = A*alpha;
+                else
+                    lambda = A*alpha + d;
+                lambda = clamp(lambda, 0, max_lambda);
                df = Q*alpha - b - trans(A)*lambda;

                if (trans(alpha)*df - C*min(df) < eps)
@@ -347,8 +369,11 @@ namespace dlib
            if ((iter%300) == 299)
            {
                // compute the optimal lambda for the current alpha
-                lambda = A*alpha;
-                lambda = lowerbound(lambda, 0);
+                if (d_is_zero)
+                    lambda = A*alpha;
+                else
+                    lambda = A*alpha + d;
+                lambda = clamp(lambda, 0, max_lambda);

                // Perform this form of the update every so often because doing so can help
                // avoid the buildup of numerical errors you get with the alternate update

--- a/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
+++ b/dlib/optimization/optimization_solve_qp_using_smo_abstract.h
@@ -58,46 +58,56 @@ namespace dlib
        typename EXP1,
        typename EXP2,
        typename EXP3,
-        typename T, long NR, long NC, typename MM, typename L
+        typename T, long NR, long NC, typename MM, typename L,
+        long NR2, long NC2
        >
    unsigned long solve_qp4_using_smo ( 
        const matrix_exp<EXP1>& A,
        const matrix_exp<EXP2>& Q,
        const matrix_exp<EXP3>& b,
+        const matrix_exp<EXP4>& d,
        matrix<T,NR,NC,MM,L>& alpha,
+        matrix<T,NR2,NC2,MM,L>& lambda,
        T eps,
-        unsigned long max_iter
+        unsigned long max_iter,
+        T max_lambda = std::numeric_limits<T>::infinity()
    );
    /*!
        requires
            - A.nc() == alpha.size()
            - Q.nr() == Q.nc()
            - is_col_vector(b) == true
+            - is_col_vector(d) == true
            - is_col_vector(alpha) == true
            - b.size() == alpha.size() == Q.nr()
+            - d.size() == A.nr()
            - alpha.size() > 0
            - min(alpha) >= 0
            - eps > 0
            - max_iter > 0
+            - max_lambda >= 0
        ensures
            - Let C == sum(alpha) (i.e. C is the sum of the alpha values you 
              supply to this function)
            - This function solves the following quadratic program:
                Minimize: f(alpha,lambda) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b + 
-                                             0.5*trans(lambda)*lambda - trans(lambda)*A*alpha
+                                             0.5*trans(lambda)*lambda - trans(lambda)*A*alpha - trans(lambda)*d
                subject to the following constraints:
                    - sum(alpha)  == C (i.e. the sum of alpha values doesn't change)
                    - min(alpha)  >= 0 (i.e. all alpha values are nonnegative)
                    - min(lambda) >= 0 (i.e. all lambda values are nonnegative)
+                    - max(lambda) <= max_lambda (i.e. all lambda values are less than max_lambda)
                Where f is convex.  This means that Q should be positive-semidefinite.
-            - The solution to the above QP will be stored in #alpha.  The optimal
-              lambda is not output since its value is given by the following expression:
-                lowerbound(A*alpha,0)
+            - If you don't want an upper limit on lambda then max_lambda can be set to
+              infinity.
+            - The solution to the above QP will be stored in #alpha and #lambda.  
            - This function uses a simple implementation of the sequential minimal
              optimization algorithm.  It starts the algorithm with the given alpha
              and it works on the problem until the duality gap (i.e. how far away
              we are from the optimum solution) is less than eps.  So eps controls 
              how accurate the solution is and smaller values result in better solutions.
+              The initial value of lambda is ignored since the optimal lambda can be
+              obtained via a simple closed form expression given alpha.
            - At most max_iter iterations of optimization will be performed.  
            - returns the number of iterations performed.  If this method fails to
              converge to eps accuracy then the number returned will be max_iter+1.

--- a/dlib/test/oca.cpp
+++ b/dlib/test/oca.cpp
@@ -72,6 +72,14 @@ namespace
            dlog << LINFO << "error: "<< max(abs(w-true_w));
            DLIB_TEST(max(abs(w-true_w)) < 1e-10);

+            solver.solve_with_elastic_net(make_oca_problem_c_svm<w_type>(2.0, 3.0, mat(x), mat(y), false, 1e-12, 40, max_index_plus_one(x)), w, 0.5);
+            dlog << LINFO << trans(w);
+            true_w = -0.5, 0.5, 0;
+            dlog << LINFO << "error: "<< max(abs(w-true_w));
+            DLIB_TEST(max(abs(w-true_w)) < 1e-10);
+
+            print_spinner();
+
            w_type prior = true_w;
            solver(make_oca_problem_c_svm<w_type>(20.0, 30.0, mat(x), mat(y), false, 1e-12, 40, max_index_plus_one(x)), w, prior);
            dlog << LINFO << trans(w);

--- a/dlib/test/opt_qp_solver.cpp
+++ b/dlib/test/opt_qp_solver.cpp
@@ -99,10 +99,11 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(2), true_alpha(2);
+        matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
        alpha = C/2, C/2;
+        d = 0;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);

        dlog << LINFO << "*******************************************************";
@@ -136,10 +137,11 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(2), true_alpha(2);
+        matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
        alpha = C/2, C/2;
+        d = 0;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);

        dlog << LINFO << "*******************************************************";
@@ -173,10 +175,11 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(2), true_alpha(2);
+        matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
        alpha = C/2, C/2;
+        d = 0;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);

        dlog << LINFO << "*******************************************************";
@@ -211,10 +214,11 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(3), true_alpha(3);
+        matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
        alpha = C/2, C/2, 0;
+        d = 0;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);


@@ -249,10 +253,10 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(2), true_alpha(2);
+        matrix<double,0,1> alpha(2), true_alpha(2), d(3), lambda;
        alpha = C/2, C/2;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);

        dlog << LINFO << "*******************************************************";
@@ -285,10 +289,10 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(3), true_alpha(3);
+        matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
        alpha = C/2, C/2, 0;

-        solve_qp4_using_smo(A, tmp(trans(A)*A), b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, tmp(trans(A)*A), b, d, alpha, lambda, 1e-9, 800);
        matrix<double,0,1> w = lowerbound(-A*alpha, 0);

        dlog << LINFO << "*******************************************************";
@@ -327,10 +331,11 @@ namespace

        const double C = 2;

-        matrix<double,0,1> alpha(3), true_alpha(3);
+        matrix<double,0,1> alpha(3), true_alpha(3), d(3), lambda;
        alpha = C/2, C/2, 0;
+        d = 0;

-        solve_qp4_using_smo(A, Q, b, alpha, 1e-9, 800);
+        solve_qp4_using_smo(A, Q, b, d, alpha, lambda, 1e-9, 800);

        dlog << LINFO << "*******************************************************";