Commit 52e35c31 authored Dec 17, 2012 by Davis King
Made this object properly warm-startable
parent 34a9e4f6
Showing 1 changed file with 160 additions and 78 deletions

dlib/svm/svm_c_linear_dcd_trainer.h  +160  -78
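The change replaces the caller-managed alpha vector that train() previously took with an opaque optimizer_state object, so passing the same state to successive train() calls resumes the dual coordinate descent solver from its previous solution instead of starting from zero. Below is a minimal usage sketch, not part of the commit: it assumes only the train(x, y, state) overload added here plus standard dlib types, and the sample values are made up.

    #include <dlib/svm.h>
    #include <vector>

    int main()
    {
        using namespace dlib;

        typedef matrix<double,0,1> sample_type;
        typedef linear_kernel<sample_type> kernel_type;

        std::vector<sample_type> samples;
        std::vector<double> labels;

        // two made-up training points, one per class
        sample_type s(2);
        s = 1, 2;    samples.push_back(s); labels.push_back(+1);
        s = -1, -2;  samples.push_back(s); labels.push_back(-1);

        svm_c_linear_dcd_trainer<kernel_type> trainer;

        // The state object introduced by this commit.  Keeping it around
        // between calls is what makes the trainer warm-startable.
        svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;

        decision_function<kernel_type> df = trainer.train(samples, labels, state);

        // Append more data and call train() again with the same state: the
        // optimizer resumes from the previous alphas and weight vector
        // instead of solving the whole problem from scratch.
        s = 2, 1;    samples.push_back(s); labels.push_back(+1);
        df = trainer.train(samples, labels, state);
    }

Between calls the state keeps the alphas, the weight vector w, and the cached Q values; the optimizer_state class added in the diff below is exactly that container.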
dlib/svm/svm_c_linear_dcd_trainer.h @ 52e35c31
@@ -207,6 +207,128 @@ namespace dlib
             Cneg = C;
         }
 
+        class optimizer_state
+        {
+            friend class svm_c_linear_dcd_trainer;
+
+        public:
+            optimizer_state() : did_init(false) {}
+
+        private:
+
+            template <typename in_sample_vector_type>
+            void init (
+                const in_sample_vector_type& x,
+                bool have_bias_,
+                bool last_weight_1_
+            )
+            {
+                const long new_dims = max_index_plus_one(x);
+                long new_idx = 0;
+
+                if (did_init)
+                {
+                    DLIB_CASSERT(have_bias_ == have_bias &&
+                                 last_weight_1_ == last_weight_1, "");
+                    DLIB_CASSERT(new_dims >= dims, "");
+                    DLIB_CASSERT(x.size() >= static_cast<long>(alpha.size()), "");
+
+                    // make sure we amortize the cost of growing the alpha vector.
+                    if (alpha.capacity() < static_cast<unsigned long>(x.size()))
+                        alpha.reserve(x.size()*2);
+
+                    new_idx = alpha.size();
+
+                    // Make sure alpha has the same length as x. So pad with extra zeros if
+                    // necessary to make this happen.
+                    alpha.resize(x.size(), 0);
+
+                    if (new_dims != dims)
+                    {
+                        // The only valid way the dimensions can be different here is if
+                        // you are using a sparse vector type. This is because we might
+                        // have had training samples which just happened to not include all
+                        // the features previously. Therefore, max_index_plus_one() would
+                        // have given too low of a result. But for dense vectors it is
+                        // definitely a user error if the dimensions don't match.
+                        DLIB_CASSERT(is_matrix<sample_type>::value == false, "");
+
+                        // extend w by the right number of elements
+                        if (have_bias)
+                        {
+                            // Splice some zeros into the w vector so it will have the
+                            // right length. Here we are being careful to move the bias
+                            // weight to the end of the resulting vector.
+                            w = join_cols(join_cols(
+                                colm(w,0,dims),
+                                zeros_matrix<scalar_type>(1, new_dims-dims)),
+                                uniform_matrix<scalar_type>(1,1,w(dims)));
+                        }
+                        else
+                        {
+                            // Just concatenate the right number of zeros.
+                            w = join_cols(w, zeros_matrix<scalar_type>(1, new_dims-dims));
+                        }
+                        dims = new_dims;
+                    }
+                }
+                else
+                {
+                    did_init = true;
+                    have_bias = have_bias_;
+                    last_weight_1 = last_weight_1_;
+                    dims = new_dims;
+
+                    alpha.resize(x.size());
+                    index.reserve(x.size());
+                    Q.reserve(x.size());
+
+                    if (have_bias)
+                        w.set_size(dims+1);
+                    else
+                        w.set_size(dims);
+                    w = 0;
+                }
+
+                for (long i = new_idx; i < x.size(); ++i)
+                {
+                    Q.push_back(dlib::dot(x(i), x(i)));
+                    if (have_bias)
+                    {
+                        index.push_back(i);
+                        Q.back() += 1;
+                    }
+                    else if (Q.back() != 0)
+                    {
+                        index.push_back(i);
+                    }
+                }
+
+                if (last_weight_1)
+                    w(dims-1) = 1;
+            }
+
+            bool did_init;
+            bool have_bias;
+            bool last_weight_1;
+            std::vector<scalar_type> alpha;
+            scalar_vector_type w;
+            std::vector<scalar_type> Q;
+            std::vector<long> index;
+            long dims;
+            dlib::rand rnd;
+        };
+
         template <
             typename in_sample_vector_type,
             typename in_scalar_vector_type
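As the comment in init() says, a warm start is also allowed to grow the feature space when sparse sample types are used: a later batch of samples may mention feature indices that earlier batches never contained, in which case max_index_plus_one() increases and w is padded with zeros, with the bias weight kept at the end. A small sketch of that scenario, assuming dlib's std::map based sparse samples and sparse_linear_kernel; the indices and values are invented.

    #include <dlib/svm.h>
    #include <map>
    #include <vector>

    int main()
    {
        using namespace dlib;

        // dlib's sparse sample convention: sorted (index, value) pairs
        typedef std::map<unsigned long,double> sample_type;
        typedef sparse_linear_kernel<sample_type> kernel_type;

        std::vector<sample_type> samples;
        std::vector<double> labels;

        // the first batch only ever uses feature indices 0 and 1
        sample_type s;
        s[0] = 1.0; s[1] = 0.5;   samples.push_back(s); labels.push_back(+1);
        s.clear(); s[1] = -1.0;   samples.push_back(s); labels.push_back(-1);

        svm_c_linear_dcd_trainer<kernel_type> trainer;
        svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;
        decision_function<kernel_type> df = trainer.train(samples, labels, state);

        // the second batch mentions feature index 2 for the first time, so the
        // next call sees new_dims > dims inside init() and extends w before
        // resuming from the previous solution
        s.clear(); s[0] = 0.2; s[2] = 2.0;
        samples.push_back(s); labels.push_back(+1);
        df = trainer.train(samples, labels, state);
    }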
@@ -216,9 +338,8 @@ namespace dlib
             const in_scalar_vector_type& y
         ) const
         {
-            scalar_vector_type alpha(x.size());
-            alpha = 0;
-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            optimizer_state state;
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
         }
 
         template <
@@ -228,24 +349,10 @@ namespace dlib
         const decision_function<kernel_type> train (
             const in_sample_vector_type& x,
             const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
         ) const
         {
-            DLIB_CASSERT(static_cast<long>(x.size()) >= alpha.size(),
-                "\t decision_function svm_c_linear_dcd_trainer::train(x,y,alpha)"
-                << "\n\t invalid inputs were given to this function"
-                << "\n\t x.size(): " << x.size()
-                << "\n\t alpha.size(): " << alpha.size()
-                );
-
-            if (static_cast<long>(x.size()) > alpha.size())
-            {
-                // Make sure alpha has the same length as x. So pad with extra zeros if
-                // necessary to make this happen.
-                alpha = join_cols(alpha, zeros_matrix<scalar_type>(1, x.size()-alpha.size()));
-            }
-
-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
         }
 
     private:
@@ -259,12 +366,9 @@ namespace dlib
         const decision_function<kernel_type> do_train (
             const in_sample_vector_type& x,
             const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
        ) const
        {
-            // TODO, requires labels are all +1 or -1. But we don't have to see both
-            // types.
-
            // make sure requires clause is not broken
            DLIB_ASSERT(is_learning_problem(x,y) == true,
                "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
@@ -273,50 +377,25 @@ namespace dlib
                 << "\n\t y.size(): " << y.size()
                 << "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
                 );
 
-            const long dims = max_index_plus_one(x);
-
-            // TODO, return an opaque object instead of alpha. Also, the object
-            // needs to verify that the trainer has the same settings from one
-            // call to the next.
-
-            std::vector<long> index(x.size());
-            scalar_vector_type Q(x.size());
-
-            scalar_vector_type w;
-            if (have_bias)
-                w.set_size(dims+1);
-            else
-                w.set_size(dims);
-            w = 0;
-
-            if (last_weight_1)
-                w(dims-1) = 1;
-
-            long ii = 0;
-            for (long i = 0; i < alpha.size(); ++i)
+#if ENABLE_ASSERTS
+            for (long i = 0; i < x.size(); ++i)
             {
-                index[ii] = i;
-                Q(ii) = dlib::dot(x(i), x(i));
-                if (have_bias)
-                {
-                    Q(ii) += 1;
-                    ++ii;
-                }
-                else if (Q(ii) != 0)
-                {
-                    ++ii;
-                }
+                DLIB_ASSERT(y(i) == +1 || y(i) == -1,
+                    "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t y(" << i << "): " << y(i)
+                    );
             }
+#endif
 
-            // What we are doing here is ignoring x elements that have 0 norm. We
-            // Do this because they are impossible to classify and this also avoids
-            // a division by zero problem later on in the code.
-            const long max_possible_active = ii;
-
-            dlib::rand rnd;
-            long active_size = max_possible_active;
+            state.init(x, have_bias, last_weight_1);
+
+            std::vector<scalar_type>& alpha = state.alpha;
+            scalar_vector_type& w = state.w;
+            std::vector<long>& index = state.index;
+            const long dims = state.dims;
+
+            unsigned long active_size = index.size();
 
             scalar_type PG_max_prev = std::numeric_limits<scalar_type>::infinity();
             scalar_type PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
@@ -328,15 +407,15 @@ namespace dlib
                 scalar_type PG_min = std::numeric_limits<scalar_type>::infinity();
 
                 // randomly shuffle the indices
-                for (long i = 0; i < active_size; ++i)
+                for (unsigned long i = 0; i < active_size; ++i)
                 {
                     // pick a random index >= i
-                    const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+                    const long j = i + state.rnd.get_random_32bit_number()%(active_size-i);
                     std::swap(index[i], index[j]);
                 }
 
                 // for all the active training samples
-                for (long ii = 0; ii < active_size; ++ii)
+                for (unsigned long ii = 0; ii < active_size; ++ii)
                 {
                     const long i = index[ii];
@@ -344,7 +423,7 @@ namespace dlib
                     const scalar_type C = (y(i) > 0) ? Cpos : Cneg;
 
                     scalar_type PG = 0;
-                    if (alpha(i) == 0)
+                    if (alpha[i] == 0)
                     {
                         if (G > PG_max_prev)
                         {
@@ -358,7 +437,7 @@ namespace dlib
                         if (G < 0)
                             PG = G;
                     }
-                    else if (alpha(i) == C)
+                    else if (alpha[i] == C)
                     {
                         if (G < PG_min_prev)
                         {
@@ -385,9 +464,9 @@ namespace dlib
                     // if PG != 0
                     if (std::abs(PG) > 1e-12)
                     {
-                        const scalar_type alpha_old = alpha(i);
-                        alpha(i) = std::min(std::max(alpha(i) - G/Q(i), (scalar_type)0.0), C);
-                        const scalar_type delta = (alpha(i)-alpha_old)*y(i);
+                        const scalar_type alpha_old = alpha[i];
+                        alpha[i] = std::min(std::max(alpha[i] - G/state.Q[i], (scalar_type)0.0), C);
+                        const scalar_type delta = (alpha[i]-alpha_old)*y(i);
                         add_to(w, x(i), delta);
                         if (have_bias)
                             w(w.size()-1) -= delta;
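Aside from switching alpha from matrix indexing to std::vector indexing and reading Q out of the state, the update itself is unchanged. For reference, restated from the code above (G is the dual gradient for coordinate i, computed in the elided context), the step is the usual clipped coordinate update of the dual coordinate descent method:

    \alpha_i \leftarrow \min\bigl( \max( \alpha_i - G / Q_{ii},\; 0 ),\; C \bigr),
    \qquad
    w \leftarrow w + (\alpha_i - \alpha_i^{\mathrm{old}}) \, y_i \, x_i

The bias is handled as an implicit constant feature of -1, which is consistent with Q getting the extra +1 in init() and with the last element of w receiving -delta here rather than +delta.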
@@ -411,12 +490,12 @@ namespace dlib
                     {
                         // stop if we are within eps tolerance and the last iteration
                         // was over all the samples
-                        if (active_size == max_possible_active)
+                        if (active_size == index.size())
                             break;
 
                         // Turn of shrinking on the next iteration. We will stop if the
                         // tolerance is still <= eps when shrinking is off.
-                        active_size = max_possible_active;
+                        active_size = index.size();
                         PG_max_prev = std::numeric_limits<scalar_type>::infinity();
                         PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                     }
@@ -429,7 +508,11 @@ namespace dlib
                         if (PG_min_prev >= 0)
                             PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                     }
                 }
             }
             // end of main optimization loop
 
             // put the solution into a decision function and then return it
             decision_function<kernel_type> df;
@@ -439,10 +522,9 @@ namespace dlib
             df.b = 0;
             df.basis_vectors.set_size(1);
-            // Copy the plane normal into the output basis vector. The output vector might be a
-            // sparse vector container so we need to use this special kind of copy to handle that case.
-            // As an aside, the reason for using max_index_plus_one() and not just w.size()-1 is because
-            // doing it this way avoids an inane warning from gcc that can occur in some cases.
+            // Copy the plane normal into the output basis vector. The output vector might
+            // be a sparse vector container so we need to use this special kind of copy to
+            // handle that case.
             assign(df.basis_vectors(0), colm(w, 0, dims));
             df.alpha.set_size(1);
             df.alpha(0) = 1;