Refactored the code in the reduced_decision_function_trainer2. Part of it has been turned into
a global function called approximate_distance_function() which performs the main optimization. The reduced_decision_function_trainer2 now depends on this global function. This change makes this function optimizer available for other purposes besides use in the reduced_decision_function_trainer2 object. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404125

Refactored the code in the reduced_decision_function_trainer2. Part of it has been turned into
a global function called approximate_distance_function() which performs the main optimization. The reduced_decision_function_trainer2 now depends on this global function. This change makes this function optimizer available for other purposes besides use in the reduced_decision_function_trainer2 object. --HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404125
82fb3682 · Davis King · 543e289f · 82fb3682 · 82fb3682
Commit 82fb3682 authored Feb 05, 2011 by Davis King
Hide whitespace changes
Inline Side-by-side

Showing with 216 additions and 141 deletions

reduced.h dlib/svm/reduced.h +177 -141

reduced_abstract.h dlib/svm/reduced_abstract.h +39 -0

No files found.
--- a/dlib/svm/reduced.h
+++ b/dlib/svm/reduced.h
@@ -144,94 +144,35 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
 // ----------------------------------------------------------------------------------------
+    namespace red_impl
-    template <
-        typename trainer_type 
-        >
-    class reduced_decision_function_trainer2
    {
-    public:
-        typedef typename trainer_type::kernel_type kernel_type;
-        typedef typename trainer_type::scalar_type scalar_type;
-        typedef typename trainer_type::sample_type sample_type;
-        typedef typename trainer_type::mem_manager_type mem_manager_type;
-        typedef typename trainer_type::trained_function_type trained_function_type;
-        reduced_decision_function_trainer2 () : num_bv(0) {}
-        reduced_decision_function_trainer2 (
-            const trainer_type& trainer_,
-            const long num_sb_,
-            const double eps_ = 1e-3
-        ) :
-            trainer(trainer_),
-            num_bv(num_sb_),
-            eps(eps_)
-        {
-            COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
-            // make sure requires clause is not broken
-            DLIB_ASSERT(num_bv > 0 && eps > 0,
-                        "\t reduced_decision_function_trainer2()"
-                        << "\n\t you have given invalid arguments to this function"
-                        << "\n\t num_bv: " << num_bv 
-                        << "\n\t eps:    " << eps 
-            );
-        }
-        template <
-            typename in_sample_vector_type,
-            typename in_scalar_vector_type
-            >
-        const decision_function<kernel_type> train (
-            const in_sample_vector_type& x,
-            const in_scalar_vector_type& y
-        ) const
-        {
-            // make sure requires clause is not broken
-            DLIB_ASSERT(num_bv > 0,
-                        "\t reduced_decision_function_trainer2::train(x,y)"
-                        << "\n\t You have tried to use an uninitialized version of this object"
-                        << "\n\t num_bv: " << num_bv );
-            return do_train(vector_to_matrix(x), vector_to_matrix(y));
-        }
-    private:
    // ------------------------------------------------------------------------------------
+        template <typename kernel_type>
        class objective
        {
            /*
                This object represents the objective function we will try to
-                minimize in the final stage of this reduced set method.  
+                minimize in approximate_distance_function().  
                The objective is the distance, in kernel induced feature space, between
-                the original decision function and the approximated version.
+                the original distance function and the approximated version.
            */
+            typedef typename kernel_type::scalar_type scalar_type;
+            typedef typename kernel_type::sample_type sample_type;
+            typedef typename kernel_type::mem_manager_type mem_manager_type;
        public:
            objective(
-                const decision_function<kernel_type>& dec_funct_,
+                const distance_function<kernel_type>& dist_funct_,
                matrix<scalar_type,0,1,mem_manager_type>& b_,
                matrix<sample_type,0,1,mem_manager_type>& out_vectors_
            ) :
-                dec_funct(dec_funct_),
+                dist_funct(dist_funct_),
                b(b_),
                out_vectors(out_vectors_)
            {
-                const kernel_type k(dec_funct.kernel_function);
-                // here we compute a term in the objective function that is a constant.  So
-                // do it in the constructor so we don't have to recompute it every time
-                // the objective is evaluated.
-                bias = 0;
-                for (long i = 0; i < dec_funct.alpha.size(); ++i)
-                {
-                    for (long j = 0; j < dec_funct.alpha.size(); ++j)
-                    {
-                        bias += dec_funct.alpha(i)*dec_funct.alpha(j)*
-                            k(dec_funct.basis_vectors(i), dec_funct.basis_vectors(j));
-                    }
-                }
            }
            const matrix<scalar_type, 0, 1, mem_manager_type> state_to_vector (
@@ -239,7 +180,7 @@ namespace dlib
            /*!
                ensures
                    - returns a vector that contains all the information necessary to
-                      reproduce the current state of the approximated decision function
+                      reproduce the current state of the approximated distance function
            !*/
            {
                matrix<scalar_type, 0, 1, mem_manager_type> z(b.nr() + out_vectors.size()*out_vectors(0).nr());
@@ -270,7 +211,7 @@ namespace dlib
                    - z came from the state_to_vector() function or has a compatible format
                ensures
                    - loads the vector z into the state variables of the approximate
-                      decision function (i.e. b and out_vectors)
+                      distance function (i.e. b and out_vectors)
            !*/
            {
                long i = 0;
@@ -295,20 +236,20 @@ namespace dlib
            ) const
            /*!
                ensures
-                    - loads the current approximate decision function with z
+                    - loads the current approximate distance function with z
-                    - returns the distance between the original decision function
+                    - returns the distance between the original distance function
                      and the approximate one.
            !*/
            {
                vector_to_state(z);
-                const kernel_type k(dec_funct.kernel_function);
+                const kernel_type k(dist_funct.get_kernel());
                double temp = 0;
                for (long i = 0; i < out_vectors.size(); ++i)
                {
-                    for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j)
+                    for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
                    {
-                        temp -= b(i)*dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j));
+                        temp -= b(i)*dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j));
                    }
                }
@@ -322,14 +263,12 @@ namespace dlib
                    }
                }
-                return temp + bias;
+                return temp + dist_funct.get_squared_norm();
            }
        private:
-            scalar_type bias;
+            const distance_function<kernel_type>& dist_funct;
-            const decision_function<kernel_type>& dec_funct;
            matrix<scalar_type,0,1,mem_manager_type>& b;
            matrix<sample_type,0,1,mem_manager_type>& out_vectors;
@@ -337,20 +276,24 @@ namespace dlib
    // ------------------------------------------------------------------------------------
+        template <typename kernel_type>
        class objective_derivative
        {
            /*!
                This object represents the derivative of the objective object
            !*/
+            typedef typename kernel_type::scalar_type scalar_type;
+            typedef typename kernel_type::sample_type sample_type;
+            typedef typename kernel_type::mem_manager_type mem_manager_type;
        public:
            objective_derivative(
-                const decision_function<kernel_type>& dec_funct_,
+                const distance_function<kernel_type>& dist_funct_,
                matrix<scalar_type,0,1,mem_manager_type>& b_,
                matrix<sample_type,0,1,mem_manager_type>& out_vectors_
            ) :
-                dec_funct(dec_funct_),
+                dist_funct(dist_funct_),
                b(b_),
                out_vectors(out_vectors_)
            {
@@ -364,7 +307,7 @@ namespace dlib
                    - z came from the state_to_vector() function or has a compatible format
                ensures
                    - loads the vector z into the state variables of the approximate
-                      decision function (i.e. b and out_vectors)
+                      distance function (i.e. b and out_vectors)
            !*/
            {
                long i = 0;
@@ -389,15 +332,15 @@ namespace dlib
            ) const
            /*!
                ensures
-                    - loads the current approximate decision function with z
+                    - loads the current approximate distance function with z
                    - returns the derivative of the distance between the original 
-                      decision function and the approximate one.
+                      distance function and the approximate one.
            !*/
            {
                vector_to_state(z);
                res.set_size(z.nr());
                set_all_elements(res,0);
-                const kernel_type k(dec_funct.kernel_function);
+                const kernel_type k(dist_funct.get_kernel());
                const kernel_derivative<kernel_type> K_der(k);
                // first compute the gradient for the beta weights
@@ -410,9 +353,9 @@ namespace dlib
                }
                for (long i = 0; i < out_vectors.size(); ++i)
                {
-                    for (long j = 0; j < dec_funct.basis_vectors.size(); ++j)
+                    for (long j = 0; j < dist_funct.get_basis_vectors().size(); ++j)
                    {
-                        res(i) -= dec_funct.alpha(j)*k(out_vectors(i), dec_funct.basis_vectors(j)); 
+                        res(i) -= dist_funct.get_alpha()(j)*k(out_vectors(i), dist_funct.get_basis_vectors()(j)); 
                    }
                }
@@ -428,12 +371,12 @@ namespace dlib
                    {
                        temp += b(j)*K_der(out_vectors(j), out_vectors(i));
                    }
-                    for (long j = 0; j < dec_funct.basis_vectors.nr(); ++j)
+                    for (long j = 0; j < dist_funct.get_basis_vectors().nr(); ++j)
                    {
-                        temp -= dec_funct.alpha(j)*K_der(dec_funct.basis_vectors(j), out_vectors(i) );
+                        temp -= dist_funct.get_alpha()(j)*K_der(dist_funct.get_basis_vectors()(j), out_vectors(i) );
                    }
-                    // store the gradient for out_vectors[i] into result in the proper spot
+                    // store the gradient for out_vectors(i) into result in the proper spot
                    set_subm(res,pos,0,num,1) = b(i)*temp;
                    pos += num;
                }
@@ -448,7 +391,7 @@ namespace dlib
            mutable matrix<scalar_type, 0, 1, mem_manager_type> res;
            mutable sample_type temp;
-            const decision_function<kernel_type>& dec_funct;
+            const distance_function<kernel_type>& dist_funct;
            matrix<scalar_type,0,1,mem_manager_type>& b;
            matrix<sample_type,0,1,mem_manager_type>& out_vectors;
@@ -456,6 +399,142 @@ namespace dlib
    // ------------------------------------------------------------------------------------
+    }
+    template <
+        typename K,
+        typename stop_strategy_type
+        >
+    distance_function<K> approximate_distance_function (
+        stop_strategy_type stop_strategy,
+        const distance_function<K>& target,
+        const distance_function<K>& starting_point
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(target.get_basis_vectors().size() > 0 &&
+                    starting_point.get_basis_vectors().size() > 0 &&
+                    target.get_kernel() == starting_point.get_kernel(),
+                    "\t  distance_function approximate_distance_function()"
+                    << "\n\t Invalid inputs were given to this function."
+                    << "\n\t target.get_basis_vectors().size():         " << target.get_basis_vectors().size() 
+                    << "\n\t starting_point.get_basis_vectors().size(): " << starting_point.get_basis_vectors().size() 
+                    << "\n\t target.kernel_function == starting_point.kernel_function: " << (target.get_kernel() == starting_point.get_kernel())
+        );
+        using namespace red_impl;
+        // The next few statements just find the best weights with which to approximate 
+        // the target object with the set of vectors in the starting_point object.  This
+        // is really just a simple application of some linear algebra.  For the details 
+        // see page 554 of Learning with kernels by Scholkopf and Smola where they talk 
+        // about "Optimal Expansion Coefficients."
+        const K kern(target.get_kernel());
+        typedef typename K::scalar_type scalar_type;
+        typedef typename K::sample_type sample_type;
+        typedef typename K::mem_manager_type mem_manager_type;
+        matrix<scalar_type,0,1,mem_manager_type> beta;
+        // Now we compute the first approximate distance function.  
+        beta = pinv(kernel_matrix(kern,starting_point.get_basis_vectors())) *
+            (kernel_matrix(kern,starting_point.get_basis_vectors(),target.get_basis_vectors())*target.get_alpha());
+        matrix<sample_type,0,1,mem_manager_type> out_vectors(starting_point.get_basis_vectors());
+        // Now setup to do a global optimization of all the parameters in the approximate 
+        // distance function.  
+        const objective<K> obj(target, beta, out_vectors);
+        const objective_derivative<K> obj_der(target, beta, out_vectors);
+        matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
+        // perform a full optimization of all the parameters (i.e. both beta and the basis vectors together)
+        find_min(lbfgs_search_strategy(20),
+                 stop_strategy,
+                 obj, obj_der, opt_starting_point, 0); 
+        // now make sure that the final optimized value is loaded into the beta and
+        // out_vectors matrices
+        obj.vector_to_state(opt_starting_point);
+        // Do a final reoptimization of beta just to make sure it is optimal given the new
+        // set of basis vectors.
+        beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,target.get_basis_vectors())*target.get_alpha());
+        // It is possible that some of the beta weights will be very close to zero.  Let's remove
+        // the basis vectors with these essentially zero weights.
+        const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
+        for (long i = 0; i < beta.size(); ++i)
+        {
+            // if beta(i) is zero (but leave at least one beta no matter what)
+            if (std::abs(beta(i)) < eps && beta.size() > 1)
+            {
+                beta = remove_row(beta, i);
+                out_vectors = remove_row(out_vectors, i);
+                --i;
+            }
+        }
+        return distance_function<K>(beta, kern, out_vectors);
+    }
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    template <
+        typename trainer_type 
+        >
+    class reduced_decision_function_trainer2
+    {
+    public:
+        typedef typename trainer_type::kernel_type kernel_type;
+        typedef typename trainer_type::scalar_type scalar_type;
+        typedef typename trainer_type::sample_type sample_type;
+        typedef typename trainer_type::mem_manager_type mem_manager_type;
+        typedef typename trainer_type::trained_function_type trained_function_type;
+        reduced_decision_function_trainer2 () : num_bv(0) {}
+        reduced_decision_function_trainer2 (
+            const trainer_type& trainer_,
+            const long num_sb_,
+            const double eps_ = 1e-3
+        ) :
+            trainer(trainer_),
+            num_bv(num_sb_),
+            eps(eps_)
+        {
+            COMPILE_TIME_ASSERT(is_matrix<sample_type>::value);
+            // make sure requires clause is not broken
+            DLIB_ASSERT(num_bv > 0 && eps > 0,
+                        "\t reduced_decision_function_trainer2()"
+                        << "\n\t you have given invalid arguments to this function"
+                        << "\n\t num_bv: " << num_bv 
+                        << "\n\t eps:    " << eps 
+            );
+        }
+        template <
+            typename in_sample_vector_type,
+            typename in_scalar_vector_type
+            >
+        const decision_function<kernel_type> train (
+            const in_sample_vector_type& x,
+            const in_scalar_vector_type& y
+        ) const
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(num_bv > 0,
+                        "\t reduced_decision_function_trainer2::train(x,y)"
+                        << "\n\t You have tried to use an uninitialized version of this object"
+                        << "\n\t num_bv: " << num_bv );
+            return do_train(vector_to_matrix(x), vector_to_matrix(y));
+        }
+    private:
        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
@@ -467,64 +546,21 @@ namespace dlib
        {
            // get the decision function object we are going to try and approximate
            const decision_function<kernel_type>& dec_funct = trainer.train(x,y);
+            const kernel_type kern(dec_funct.kernel_function);
            // now find a linearly independent subset of the training points of num_bv points.
-            linearly_independent_subset_finder<kernel_type> lisf(dec_funct.kernel_function, num_bv);
+            linearly_independent_subset_finder<kernel_type> lisf(kern, num_bv);
            fill_lisf(lisf,x);
-            // The next few statements just find the best weights with which to approximate 
+            distance_function<kernel_type> approx(ones_matrix<scalar_type>(lisf.size(),1), kern, vector_to_matrix(lisf));
-            // the dec_funct object with the smaller set of vectors in the lisf dictionary.  This
+            const distance_function<kernel_type> target = dec_funct;
-            // is really just a simple application of some linear algebra.  For the details 
-            // see page 554 of Learning with kernels by Scholkopf and Smola where they talk 
-            // about "Optimal Expansion Coefficients."
-            const kernel_type kern(dec_funct.kernel_function);
-            matrix<scalar_type,0,1,mem_manager_type> beta;
-            // Now we compute the fist approximate decision function.  
-            beta = lisf.get_inv_kernel_marix()*(kernel_matrix(kern,lisf,dec_funct.basis_vectors)*dec_funct.alpha);
-            matrix<sample_type,0,1,mem_manager_type> out_vectors(lisf.get_dictionary());
-            // Now setup to do a global optimization of all the parameters in the approximate 
+            approx = approximate_distance_function(objective_delta_stop_strategy(eps), target, approx);
-            // decision function.  
-            const objective obj(dec_funct, beta, out_vectors);
-            const objective_derivative obj_der(dec_funct, beta, out_vectors);
-            matrix<scalar_type,0,1,mem_manager_type> opt_starting_point(obj.state_to_vector());
-            // perform the actual optimization
-            find_min(lbfgs_search_strategy(20),
-                     objective_delta_stop_strategy(eps),
-                     obj, obj_der, opt_starting_point, 0); 
-            // now make sure that the final optimized value is loaded into the beta and
-            // out_vectors matrices
-            obj.vector_to_state(opt_starting_point);
-            // Do a final reoptimization of beta just to make sure it is optimal given the new
-            // set of basis vectors.
-            beta = pinv(kernel_matrix(kern,out_vectors))*(kernel_matrix(kern,out_vectors,dec_funct.basis_vectors)*dec_funct.alpha);
-            // It is possible that some of the beta weights will be very close to zero.  Lets remove
-            // the basis vectors with these essentially zero weights.
-            const scalar_type eps = max(abs(beta))*std::numeric_limits<scalar_type>::epsilon();
-            for (long i = 0; i < beta.size(); ++i)
-            {
-                // if beta(i) is zero
-                if (std::abs(beta(i)) < eps)
-                {
-                    beta = remove_row(beta, i);
-                    out_vectors = remove_row(out_vectors, i);
-                    --i;
-                }
-            }
-            decision_function<kernel_type> new_df(beta, 
+            decision_function<kernel_type> new_df(approx.get_alpha(), 
                                                  0,
                                                  kern, 
-                                                  out_vectors);
+                                                  approx.get_basis_vectors());
            // now we have to figure out what the new bias should be.  It might be a little
            // different since we just messed with all the weights and vectors.

--- a/dlib/svm/reduced_abstract.h
+++ b/dlib/svm/reduced_abstract.h
@@ -106,6 +106,42 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
 // ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    template <
+        typename K,
+        typename stop_strategy_type
+        >
+    distance_function<K> approximate_distance_function (
+        stop_strategy_type stop_strategy,
+        const distance_function<K>& target,
+        const distance_function<K>& starting_point
+    );
+    /*!
+        requires
+            - stop_strategy == an object that defines a stop strategy such as one of 
+              the objects from dlib/optimization/optimization_stop_strategies_abstract.h
+            - target.get_basis_vectors().size() > 0 && starting_point.get_basis_vectors().size() > 0
+              (i.e. target and starting_point have to have some basis vectors in them)
+            - target.get_kernel() == starting_point.get_kernel()
+              (i.e. both distance functions must use the same kernel)
+            - kernel_derivative<K> is defined
+              (i.e. The analytic derivative for the given kernel must be defined)
+            - K::sample_type must be a dlib::matrix object and the basis_vectors inside the
+              distance_functions must be column vectors.
+        ensures
+            - This function attempts to find a distance function object which is close
+              to the given target.  That is, it searches for an X such that target(X) is
+              minimized.  The optimization begins with the initial guess contained in 
+              starting_point and searches for an X which locally minimizes target(X).  Since
+              this problem can have many local minima the quality of the starting point
+              can significantly influence the results.   
+            - The returned distance_function will contain at most as many basis vectors
+              as the given starting_point object (near-zero-weight vectors are dropped).
+            - The optimization is carried out until the stop_strategy indicates it 
+              should stop.
+    !*/
 // ----------------------------------------------------------------------------------------
    template <
@@ -179,6 +215,9 @@ namespace dlib
            const in_scalar_vector_type& y
        ) const;
        /*!
+            requires
+                - x must be a list of objects which are each some kind of dlib::matrix 
+                  which represents column or row vectors.
            ensures
                - trains a decision_function using the trainer that was supplied to
                  this object's constructor and then finds a reduced representation