dlib — Commit 47fbaff0
Authored Apr 25, 2016 by Davis King
Parent: 5ec306d2

    Added elastic_net solver.

Showing 4 changed files with 682 additions and 0 deletions:

    dlib/optimization/elastic_net.h           +380  -0
    dlib/optimization/elastic_net_abstract.h  +179  -0
    dlib/test/CMakeLists.txt                    +1  -0
    dlib/test/elastic_net.cpp                 +122  -0
dlib/optimization/elastic_net.h (new file, 0 → 100644)
// Copyright (C) 2016  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_ElASTIC_NET_Hh_
#define DLIB_ElASTIC_NET_Hh_

#include "../matrix.h"
#include "elastic_net_abstract.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class elastic_net
    {
    public:

        template <typename EXP>
        explicit elastic_net(
            const matrix_exp<EXP>& X_
        ) : eps(1e-5), max_iterations(50000), verbose(false)
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(X_.size() > 0,
                "\t elastic_net::elastic_net(X)"
                << "\n\t X can't be empty"
                << "\n\t this: " << this
            );

            // If the number of columns in X is big and in particular bigger than the number of
            // rows then we can get rid of them by doing some SVD magic.  Doing this doesn't
            // make the final results of anything change but makes all the matrices have
            // dimensions that are X.nr() in size, which can be much smaller.
            matrix<double,0,1> s;
            svd3(trans(X_), u, s, eig_vects);
            X = eig_vects*diagm(s);

            // Later on, we will use the eigenvalues of X*trans(X) to compute the solution
            // without lasso.  So we save them here.
            eig_vals = squared(s);

            samples.resize(X.nr()*2);
            for (size_t i = 0; i < samples.size(); ++i)
                index.push_back(i);
            active_size = index.size();

            // setup the training samples used in the SVM optimizer below
            for (size_t i = 0; i < samples.size(); ++i)
            {
                auto& x = samples[i];
                const long idx = i/2;

                if (i%2 == 0)
                    x.label = +1;
                else
                    x.label = -1;

                x.r = idx%X.nr();
            }
        }

        template <typename EXP1, typename EXP2>
        elastic_net(
            const matrix_exp<EXP1>& X_,
            const matrix_exp<EXP2>& Y_
        ) : elastic_net(X_)
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(X_.size() > 0 && is_col_vector(Y_) && X_.nc() == Y_.size(),
                "\t elastic_net::elastic_net(X,Y)"
                << "\n\t Invalid inputs were given to this function."
                << "\n\t X_.size(): " << X_.size()
                << "\n\t is_col_vector(Y_): " << is_col_vector(Y_)
                << "\n\t X_.nc(): " << X_.nc()
                << "\n\t Y_.size(): " << Y_.size()
                << "\n\t this: " << this
            );

            set_y(Y_);
        }

        long size(
        ) const
        {
            return u.nr();
        }

        template <typename EXP>
        void set_y(
            const matrix_exp<EXP>& Y_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(is_col_vector(Y_) && Y_.size() == size(),
                "\t void elastic_net::set_y(Y)"
                << "\n\t Invalid inputs were given to this function."
                << "\n\t is_col_vector(Y_): " << is_col_vector(Y_)
                << "\n\t size(): " << size()
                << "\n\t Y_.size(): " << Y_.size()
                << "\n\t this: " << this
            );

            ynorm = length_squared(Y_);
            Y = trans(u)*Y_;
            xdoty = X*Y;
            eig_vects_xdoty = trans(eig_vects)*xdoty;
            w.set_size(Y.size());

            // zero out any memory of previous solutions
            alpha.assign(X.nr()*2, 0);
        }

        bool have_target_values(
        ) const
        {
            return Y.size() != 0;
        }

        void set_epsilon(
            double eps_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(eps_ > 0,
                "\t void elastic_net::set_epsilon()"
                << "\n\t eps_ must be greater than 0"
                << "\n\t eps_: " << eps_
                << "\n\t this: " << this
            );
            eps = eps_;
        }

        unsigned long get_max_iterations(
        ) const
        {
            return max_iterations;
        }

        void set_max_iterations(
            unsigned long max_iter
        )
        {
            max_iterations = max_iter;
        }

        void be_verbose(
        )
        {
            verbose = true;
        }

        void be_quiet(
        )
        {
            verbose = false;
        }

        double get_epsilon(
        ) const
        {
            return eps;
        }

        matrix<double,0,1> operator()(
            double ridge_lambda,
            double lasso_budget = std::numeric_limits<double>::infinity()
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(have_target_values() && ridge_lambda > 0 && lasso_budget > 0,
                "\t matrix<double,0,1> elastic_net::operator()()"
                << "\n\t Invalid inputs were given to this function."
                << "\n\t have_target_values(): " << have_target_values()
                << "\n\t ridge_lambda: " << ridge_lambda
                << "\n\t lasso_budget: " << lasso_budget
                << "\n\t this: " << this
            );

            // First check if lasso_budget is so big that it isn't even active.  We do this
            // by doing just ridge regression and checking the result.
            matrix<double,0,1> betas = eig_vects*tmp(inv(diagm(eig_vals+ridge_lambda))*eig_vects_xdoty);
            if (sum(abs(betas)) <= lasso_budget)
                return betas;

            // Set w back to 0.  We will compute the w corresponding to what is currently
            // in alpha later on.  This way w and alpha are always in sync.
            w = 0;
            wy_mult = 0;
            wdoty = 0;

            // return dot(w,x)
            auto dot = [&](const matrix<double,0,1>& w, const en_sample2& x)
            {
                const double xmul = -x.label*(1/lasso_budget);
                // Do the base dot product but don't forget to add in the -(1/t)*y part
                // from the svm reduction paper.
                double val = rowm(X,x.r)*w + xmul*wdoty + wy_mult*xdoty(x.r) + xmul*wy_mult*ynorm;
                return val;
            };

            // perform w += scale*x;
            auto add_to = [&](matrix<double,0,1>& w, double scale, const en_sample2& x)
            {
                const double xmul = -x.label*(1/lasso_budget);
                wy_mult += scale*xmul;
                wdoty += scale*xdoty(x.r);
                w += scale*trans(rowm(X,x.r));
            };

            const double Dii = ridge_lambda;

            // setup the training samples used in the SVM optimizer below
            for (size_t i = 0; i < samples.size(); ++i)
            {
                auto& x = samples[i];
                const double xmul = -x.label*(1/lasso_budget);
                x.xdotx = xmul*xmul*ynorm;
                for (long c = 0; c < X.nc(); ++c)
                    x.xdotx += std::pow(X(x.r,c) + xmul*Y(c), 2.0) - std::pow(xmul*Y(c), 2.0);

                // compute the correct w given whatever might be in alpha.
                if (alpha[i] != 0)
                    add_to(w, x.label*alpha[i], samples[i]);
            }

            // Now run the optimizer
            double PG_max_prev = std::numeric_limits<double>::infinity();
            double PG_min_prev = -std::numeric_limits<double>::infinity();

            unsigned int iter;
            for (iter = 0; iter < max_iterations; ++iter)
            {
                // randomly shuffle the indices
                for (unsigned long i = 0; i < active_size; ++i)
                {
                    // pick a random index >= i
                    const long j = i + rnd.get_random_32bit_number()%(active_size-i);
                    std::swap(index[i], index[j]);
                }

                double PG_max = -std::numeric_limits<double>::infinity();
                double PG_min = std::numeric_limits<double>::infinity();
                for (size_t ii = 0; ii < active_size; ++ii)
                {
                    const auto i = index[ii];
                    const auto& x = samples[i];
                    double G = x.label*dot(w,x) - 1 + Dii*alpha[i];

                    double PG = 0;
                    if (alpha[i] == 0)
                    {
                        if (G > PG_max_prev)
                        {
                            // shrink the active set of training examples
                            --active_size;
                            std::swap(index[ii], index[active_size]);
                            --ii;
                            continue;
                        }

                        if (G < 0)
                            PG = G;
                    }
                    else
                    {
                        PG = G;
                    }

                    if (PG > PG_max)
                        PG_max = PG;
                    if (PG < PG_min)
                        PG_min = PG;

                    // if PG != 0
                    if (std::abs(PG) > 1e-12)
                    {
                        const double alpha_old = alpha[i];
                        alpha[i] = std::max(alpha[i] - G/(x.xdotx+Dii), (double)0.0);
                        const double delta = (alpha[i]-alpha_old)*x.label;
                        add_to(w, delta, x);
                    }
                }

                if (verbose)
                {
                    using namespace std;
                    cout << "gap: " << PG_max - PG_min << endl;
                    cout << "active_size: " << active_size << endl;
                    cout << "iter: " << iter << endl;
                    cout << endl;
                }

                if (PG_max - PG_min <= eps)
                {
                    // stop if we are within eps tolerance and the last iteration
                    // was over all the samples
                    if (active_size == index.size())
                        break;

                    // Turn off shrinking on the next iteration.  We will stop if the
                    // tolerance is still <= eps when shrinking is off.
                    active_size = index.size();
                    PG_max_prev = std::numeric_limits<double>::infinity();
                    PG_min_prev = -std::numeric_limits<double>::infinity();
                }
                else
                {
                    PG_max_prev = PG_max;
                    PG_min_prev = PG_min;
                    if (PG_max_prev <= 0)
                        PG_max_prev = std::numeric_limits<double>::infinity();
                    if (PG_min_prev >= 0)
                        PG_min_prev = -std::numeric_limits<double>::infinity();
                }

                // recalculate wdoty every so often to avoid drift.
                if (iter%100 == 0)
                    wdoty = dlib::dot(Y,w);
            }

            betas.set_size(alpha.size()/2);
            for (long i = 0; i < betas.size(); ++i)
                betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]);
            betas /= sum(mat(alpha));

            return betas;
        }

    private:

        struct en_sample2
        {
            // X location
            long r;

            double label;
            double xdotx;
        };

        std::vector<en_sample2> samples;
        std::vector<double> alpha;

        double ynorm;
        matrix<double> X;
        matrix<double,0,1> Y;
        matrix<double,0,1> xdoty;

        double wdoty;
        double wy_mult;
        // logically, the real w is what is in the w vector + wy_mult*Y
        matrix<double,0,1> w;

        std::vector<long> index;
        unsigned long active_size;

        matrix<double,0,1> eig_vects_xdoty;
        matrix<double,0,1> eig_vals;
        matrix<double> eig_vects;
        matrix<double> u;

        dlib::rand rnd;

        double eps;
        unsigned long max_iterations;
        bool verbose;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_ElASTIC_NET_Hh_
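The header above implements the elastic-net-to-SVM reduction cited in elastic_net_abstract.h. As a sketch of the mapping (my notation, not the author's: t = lasso_budget, lambda = ridge_lambda, and, following the unit test's convention, observations are the columns of X, so the data term is X^T w - Y):

    \min_w \; \lVert X^{\mathsf T} w - Y \rVert^2 + \lambda \lVert w \rVert^2
    \quad \text{subject to} \quad \sum_i \lvert w_i \rvert \le t

Each coordinate w_i contributes a pair of SVM training points built from the i-th row x_i^T of X,

    z_{2i} = x_i - \tfrac{1}{t} Y \;(\text{label } +1), \qquad
    z_{2i+1} = x_i + \tfrac{1}{t} Y \;(\text{label } -1),

which is the xmul = -x.label*(1/lasso_budget) term folded into the dot and add_to lambdas. An L2-SVM dual is then solved over these points (the Dii = ridge_lambda diagonal term above; equivalently C = 1/(2*lambda), as in the test file below), and the final loop recovers

    w_i = t \, \frac{\alpha_{2i} - \alpha_{2i+1}}{\sum_k \alpha_k},

matching betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]) followed by betas /= sum(mat(alpha)).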
dlib/optimization/elastic_net_abstract.h (new file, 0 → 100644)
// Copyright (C) 2016  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_ElASTIC_NET_ABSTRACT_Hh_
#ifdef DLIB_ElASTIC_NET_ABSTRACT_Hh_

#include "../matrix.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class elastic_net
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This object is a tool for solving the following optimization problem:

                    min_w: length_squared(X*w - Y) + ridge_lambda*length_squared(w)
                    such that: sum(abs(w)) <= lasso_budget

                That is, it solves the elastic net optimization problem.  This object
                also has the special property that you can quickly obtain different
                solutions for different settings of ridge_lambda, lasso_budget, and
                target Y values.

                This is because a large amount of work is precomputed in the
                constructor.  The solver will also remember the previous solution and
                will use that to warm start subsequent invocations.  Therefore, you can
                efficiently get solutions for a wide range of regularization parameters.

                The particular algorithm used to solve it is described in the paper:
                    Zhou, Quan, et al. "A reduction of the elastic net to support vector
                    machines with an application to GPU computing." arXiv preprint
                    arXiv:1409.1976 (2014).

                And for the SVM solver sub-component we use the algorithm from:
                    Hsieh, Cho-Jui, et al. "A dual coordinate descent method for
                    large-scale linear SVM." Proceedings of the 25th International
                    Conference on Machine Learning. ACM, 2008.
        !*/

    public:

        template <typename EXP>
        explicit elastic_net(
            const matrix_exp<EXP>& X
        );
        /*!
            requires
                - X.size() != 0
            ensures
                - #get_epsilon() == 1e-5
                - #get_max_iterations() == 50000
                - This object will not be verbose unless be_verbose() is called.
                - #size() == X.nc()
                - #have_target_values() == false
        !*/

        template <typename EXP1, typename EXP2>
        elastic_net(
            const matrix_exp<EXP1>& X,
            const matrix_exp<EXP2>& Y
        );
        /*!
            requires
                - X.size() != 0
                - is_col_vector(Y)
                - X.nc() == Y.size()
            ensures
                - constructs this object by calling the elastic_net(X) constructor and
                  then calling this->set_y(Y).
                - #have_target_values() == true
        !*/

        long size(
        ) const;
        /*!
            ensures
                - returns the number of samples loaded into this object.
        !*/

        bool have_target_values(
        ) const;
        /*!
            ensures
                - returns true if set_y() has been called and false otherwise.
        !*/

        template <typename EXP>
        void set_y(
            const matrix_exp<EXP>& Y
        );
        /*!
            requires
                - is_col_vector(Y)
                - Y.size() == size()
            ensures
                - #have_target_values() == true
                - Sets the target values, the Y variable in the objective function, to
                  the given Y.
        !*/

        void set_epsilon(
            double eps
        );
        /*!
            requires
                - eps > 0
            ensures
                - #get_epsilon() == eps
        !*/

        double get_epsilon(
        ) const;
        /*!
            ensures
                - returns the error epsilon that determines when the solver should stop.
                  Smaller values may result in a more accurate solution but take longer
                  to execute.
        !*/

        unsigned long get_max_iterations(
        ) const;
        /*!
            ensures
                - returns the maximum number of iterations the optimizer is allowed to
                  run before it is required to stop and return a result.
        !*/

        void set_max_iterations(
            unsigned long max_iter
        );
        /*!
            ensures
                - #get_max_iterations() == max_iter
        !*/

        void be_verbose(
        );
        /*!
            ensures
                - This object will print status messages to standard out so that a user
                  can observe the progress of the algorithm.
        !*/

        void be_quiet(
        );
        /*!
            ensures
                - This object will not print anything to standard out.
        !*/

        matrix<double,0,1> operator()(
            double ridge_lambda,
            double lasso_budget = std::numeric_limits<double>::infinity()
        );
        /*!
            requires
                - have_target_values() == true
                - ridge_lambda > 0
                - lasso_budget > 0
            ensures
                - Solves the optimization problem described in the WHAT THIS OBJECT
                  REPRESENTS section above and returns the optimal w.
                - if (lasso_budget == infinity) then
                    - The lasso constraint is ignored.
        !*/
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_ElASTIC_NET_ABSTRACT_Hh_
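To make the documented interface concrete, here is a minimal usage sketch. It is not part of this commit; the data is synthetic and the parameter values are illustrative only, but every call appears in the specification above.

    #include <dlib/optimization/elastic_net.h>
    #include <iostream>

    int main()
    {
        using namespace dlib;

        // Hypothetical problem: 15 weights, 1000 observations, one observation
        // per column of X (the convention used by the unit test below).
        matrix<double> X = randm(15, 1000);
        matrix<double,0,1> w_true = randm(15, 1);
        matrix<double,0,1> Y = trans(X)*w_true;

        elastic_net solver(X, Y);   // the expensive precomputation happens here
        solver.set_epsilon(1e-7);

        // Warm starting makes sweeping the regularization parameters cheap.
        const double ridge_lambda = 0.1;
        for (double budget = 10; budget > 0.1; budget *= 0.5)
        {
            matrix<double,0,1> w = solver(ridge_lambda, budget);
            std::cout << "budget: " << budget
                      << "   sum(abs(w)): " << sum(abs(w)) << std::endl;
        }
    }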
dlib/test/CMakeLists.txt
...
@@ -156,6 +156,7 @@ if (COMPILER_CAN_DO_CPP_11)
       dnn.cpp
       cublas.cpp
       find_optimal_parameters.cpp
+      elastic_net.cpp
    )
endif()
...
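With elastic_net.cpp added to this list, the new test is compiled into dlib's test harness. If I am reading the harness conventions correctly, the first argument passed to the tester constructor below ("test_elastic_net") doubles as the command line switch for running this test selectively.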
dlib/test/elastic_net.cpp (new file, 0 → 100644)
// Copyright (C) 2016  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

#include <dlib/optimization/elastic_net.h>
#include "tester.h"
#include <dlib/svm.h>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <vector>
#include <sstream>
#include <ctime>

namespace
{
    using namespace test;
    using namespace dlib;
    using namespace std;

    dlib::logger dlog("test.elastic_net");

// ----------------------------------------------------------------------------------------

    matrix<double,0,1> basic_elastic_net(
        const matrix<double>& X,
        const matrix<double,0,1>& Y,
        double ridge_lambda,
        double lasso_budget,
        double eps
    )
    {
        DLIB_CASSERT(X.nc() == Y.nr(), "");

        typedef matrix<double,0,1> sample_type;
        typedef linear_kernel<sample_type> kernel_type;

        svm_c_linear_dcd_trainer<kernel_type> trainer;
        trainer.solve_svm_l2_problem(true);
        const double C = 1/(2*ridge_lambda);
        trainer.set_c(C);
        trainer.set_epsilon(eps);
        trainer.enable_shrinking(true);
        trainer.include_bias(false);

        std::vector<sample_type> samples;
        std::vector<double> labels;
        for (long r = 0; r < X.nr(); ++r)
        {
            sample_type temp = trans(rowm(X,r));
            const double xmul = (1/lasso_budget);
            samples.push_back(temp - xmul*Y);
            labels.push_back(+1);
            samples.push_back(temp + xmul*Y);
            labels.push_back(-1);
        }

        svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;
        auto df = trainer.train(samples, labels, state);

        auto&& alpha = state.get_alpha();
        matrix<double,0,1> betas(alpha.size()/2);
        for (long i = 0; i < betas.size(); ++i)
            betas(i) = lasso_budget*(alpha[2*i] - alpha[2*i+1]);
        betas /= sum(mat(alpha));

        return betas;
    }

// ----------------------------------------------------------------------------------------

    class test_elastic_net : public tester
    {
    public:
        test_elastic_net(
        ) : tester("test_elastic_net",
                   "Run tests on the elastic_net object.", 0)
        {}

        void perform_test(
        )
        {
            matrix<double> w = {1, 2, 0, 4, 0, 0, 0, 0, 0, 6, 7, 8, 0, 9, 0};

            matrix<double> X = randm(w.size(), 1000);
            matrix<double> Y = trans(X)*w;
            Y += 0.1*(randm(Y.nr(), Y.nc()) - 0.5);

            double ridge_lambda = 0.1;
            double lasso_budget = sum(abs(w));
            double eps = 0.0000001;

            dlib::elastic_net solver(X, Y);
            solver.set_epsilon(eps);

            matrix<double,0,1> results;
            matrix<double,0,1> results2;
            for (double s = 1.2; s > 0.10; s *= 0.9)
            {
                print_spinner();
                dlog << LINFO << "s: " << s;
                // make sure the two solvers agree.
                results = basic_elastic_net(X, Y, ridge_lambda, lasso_budget*s, eps);
                results2 = solver(ridge_lambda, lasso_budget*s);
                dlog << LINFO << "error: " << max(abs(results - results2));
                DLIB_TEST(max(abs(results - results2)) < 1e-3);
            }
        }
    } a;

// ----------------------------------------------------------------------------------------

}
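The test above validates the new solver purely by agreement with an independent reduction through svm_c_linear_dcd_trainer. A further sanity check one could add (a hypothetical helper, not part of this commit) is to evaluate the objective from elastic_net_abstract.h directly, using the same column-observation convention as perform_test():

    // Hypothetical helper, not part of this commit.  X is d x n with one
    // observation per column and Y is n x 1, as in perform_test() above.
    double elastic_net_objective(
        const dlib::matrix<double>& X,
        const dlib::matrix<double,0,1>& Y,
        const dlib::matrix<double,0,1>& w,
        double ridge_lambda
    )
    {
        // length_squared(trans(X)*w - Y) + ridge_lambda*length_squared(w)
        return dlib::length_squared(dlib::trans(X)*w - Y)
             + ridge_lambda*dlib::length_squared(w);
    }

At the optimum one would expect this value for solver()'s output to match basic_elastic_net()'s to within the solver tolerance, with sum(abs(w)) <= lasso_budget holding (up to the same tolerance) whenever the budget constraint is active.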