From 01d8ff8d2033a392afe1d9c4103b9f9f9f787fed Mon Sep 17 00:00:00 2001
From: Davis King <davis@dlib.net>
Date: Wed, 23 Mar 2011 02:00:53 +0000
Subject: [PATCH] Added a structural SVM solver.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%404188
---
 dlib/svm.h                                 |   2 +
 dlib/svm/structural_svm_problem.h          | 313 +++++++++++++++++++++
 dlib/svm/structural_svm_problem_abstract.h | 166 +++++++++++
 3 files changed, 481 insertions(+)
 create mode 100644 dlib/svm/structural_svm_problem.h
 create mode 100644 dlib/svm/structural_svm_problem_abstract.h

diff --git a/dlib/svm.h b/dlib/svm.h
index 90ff7879..78c9f41f 100644
--- a/dlib/svm.h
+++ b/dlib/svm.h
@@ -37,6 +37,8 @@
 #include "svm/one_vs_all_decision_function.h"
 #include "svm/one_vs_all_trainer.h"
 
+#include "svm/structural_svm_problem.h"
+
 #endif // DLIB_SVm_HEADER
 
 
diff --git a/dlib/svm/structural_svm_problem.h b/dlib/svm/structural_svm_problem.h
new file mode 100644
index 00000000..cdb4f2c2
--- /dev/null
+++ b/dlib/svm/structural_svm_problem.h
@@ -0,0 +1,313 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_STRUCTURAL_SVM_PRObLEM_H__
+#define DLIB_STRUCTURAL_SVM_PRObLEM_H__
+
+#include "structural_svm_problem_abstract.h"
+#include "../algs.h"
+#include <vector>
+#include <limits>
+#include "../optimization/optimization_oca.h"
+#include "../matrix.h"
+#include "sparse_vector.h"
+#include <iostream>
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type,
+        typename feature_vector_type_
+        >
+    class structural_svm_problem : public oca_problem<matrix_type>
+    {
+    public:
+        /*!
+            CONVENTION
+                - C == get_c()
+                - eps == get_epsilon()
+                - if (skip_cache) then
+                    - we won't use the oracle cache when we need to evaluate the separation
+                      oracle. Instead, we will directly call the user supplied separation_oracle().
+                - get_max_cache_size() == max_cache_size
+                - cur_risk == the risk computed by the last get_risk() call (0 before the first call)
+                - cur_gap == current_error_gap/get_c() from the last optimization_status() call (0 before)
+                - if (cache.size() != 0) then
+                    - cache.size() == get_num_samples()
+                    - for all i: cache[i] == the cached results of calls to separation_oracle()
+                      for the i-th sample.
+        !*/
+
+        typedef typename matrix_type::type scalar_type;
+        typedef feature_vector_type_ feature_vector_type;
+
+        structural_svm_problem (
+        ) : cur_risk(0), cur_gap(0),  // bookkeeping scalars start at 0 instead of being left indeterminate
+            eps(0.001),
+            verbose(false),
+            skip_cache(true),         // the cache is empty at first, so go straight to the real oracle
+            max_cache_size(10),
+            C(1)
+        {}
+
+        void set_epsilon (
+            scalar_type eps_
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(eps_ > 0,
+                "\t void structural_svm_problem::set_epsilon()"
+                << "\n\t eps_ must be greater than 0"
+                << "\n\t eps_: " << eps_ 
+                << "\n\t this: " << this
+                );
+
+            eps = eps_;
+        }
+
+        const scalar_type get_epsilon (
+        ) const { return eps; }
+
+        void set_max_cache_size (
+            unsigned long max_size
+        )
+        {
+            max_cache_size = max_size;
+        }
+
+        unsigned long get_max_cache_size (
+        ) const { return max_cache_size; }
+
+        void be_verbose (
+        ) 
+        {
+            verbose = true;
+        }
+
+        void be_quiet(
+        )
+        {
+            verbose = false;
+        }
+
+        scalar_type get_c (
+        ) const { return C; }
+
+        void set_c (
+            scalar_type C_
+        ) 
+        { 
+            // make sure requires clause is not broken
+            DLIB_ASSERT(C_ > 0,
+                "\t void structural_svm_problem::set_c()"
+                << "\n\t C_ must be greater than 0"
+                << "\n\t C_:    " << C_ 
+                << "\n\t this: " << this
+                );
+
+            C = C_; 
+        }
+        // ---- routines supplied by the derived class (specified in structural_svm_problem_abstract.h) ----
+        virtual long get_num_dimensions (
+        ) const = 0;
+
+        virtual long get_num_samples (
+        ) const = 0;
+
+        virtual void get_truth_joint_feature_vector (
+            long idx,
+            feature_vector_type& psi 
+        ) const = 0;
+
+        virtual void separation_oracle (
+            const long idx,
+            const matrix_type& current_solution,
+            scalar_type& loss,
+            feature_vector_type& psi
+        ) const = 0;
+
+    private:
+        // oca_problem hook: reports 0 as a lower bound on the risk.
+        virtual bool risk_has_lower_bound (
+            scalar_type& lower_bound
+        ) const 
+        { 
+            lower_bound = 0;
+            return true; 
+        }
+        // oca_problem hook: prints progress when verbose, then decides whether to stop (the returned flag).
+        virtual bool optimization_status (
+            scalar_type current_objective_value,
+            scalar_type current_error_gap,
+            unsigned long num_cutting_planes,
+            unsigned long num_iterations
+        ) const 
+        {
+            if (verbose)
+            {
+                using namespace std;
+                cout << "svm objective: " << current_objective_value << endl;
+                cout << "gap: " << current_error_gap << endl;
+                cout << "num planes: " << num_cutting_planes << endl;
+                cout << "iter: " << num_iterations << endl;
+                cout << endl;
+            }
+            // Saved for the cache acceptance test in separation_oracle_cached().
+            cur_gap = current_error_gap/get_c();
+
+            bool should_stop = false;
+            // A zero objective requests a stop outright; this also avoids dividing by zero below.
+            if (current_objective_value == 0)
+                should_stop = true;
+            else if (current_error_gap/current_objective_value < eps)
+                should_stop = true;
+
+            if (should_stop && !skip_cache)
+            {
+                // Instead of stopping we shouldn't use the cache on the next iteration.  This way
+                // we can be sure to have the best solution rather than assuming the cache is up-to-date
+                // enough.
+                should_stop = false;
+                skip_cache = true;
+            }
+            else
+            {
+                skip_cache = false;  // every other case: the next iteration may consult the cache
+            }
+
+
+            return should_stop;
+        }
+        // oca_problem hook: evaluates the risk at w and writes a matching subgradient.
+        virtual void get_risk (
+            matrix_type& w,
+            scalar_type& risk,
+            matrix_type& subgradient
+        ) const 
+        {
+            feature_vector_type ftemp;
+            const unsigned long num = get_num_samples();
+
+            // initialize psi_true if we haven't done so already.  
+            if (psi_true.size() == 0)
+            {
+                psi_true.set_size(w.size(),1);
+                psi_true = 0;
+                for (unsigned long i = 0; i < num; ++i)
+                {
+                    get_truth_joint_feature_vector(i, ftemp);
+                    subtract_from(psi_true, ftemp);
+                }
+            }
+            // psi_true was accumulated with subtract_from(); now add_to() one oracle psi per sample.
+            subgradient = psi_true;
+            scalar_type total_loss = 0;
+            for (unsigned long i = 0; i < num; ++i)
+            {
+                scalar_type loss;
+                separation_oracle_cached(i, w, loss, ftemp);
+                total_loss += loss;
+                add_to(subgradient, ftemp);
+            }
+            // Average over the samples; the risk is the mean loss plus dot(subgradient,w).
+            subgradient /= num;
+            total_loss /= num;
+            risk = total_loss + dot(subgradient,w);
+            cur_risk = risk;
+        }
+        // Produces (loss, psi) for sample idx, reusing a cached pair when it passes the test below.
+        void separation_oracle_cached (
+            const long idx,
+            const matrix_type& current_solution,
+            scalar_type& loss,
+            feature_vector_type& psi
+        ) const 
+        {
+            // Allocate one cache_record per sample on first use, but only if caching is enabled.
+            if (cache.size() == 0 && max_cache_size != 0)
+                cache.resize(get_num_samples());
+
+            if (!skip_cache && max_cache_size != 0)
+            {
+                scalar_type best_val = -std::numeric_limits<scalar_type>::infinity();
+                unsigned long best_idx = 0;
+
+                cache_record& rec = cache[idx];
+
+                // figure out which element in the cache is the best
+                for (unsigned long i = 0; i < rec.loss.size(); ++i)
+                {
+                    using sparse_vector::dot;
+                    const scalar_type temp = rec.loss[i] + dot(rec.psi[i], current_solution);
+                    if (temp > best_val)
+                    {
+                        best_val = temp;
+                        loss = rec.loss[i];
+                        best_idx = i;
+                    }
+                }
+                // Cache hit only if the best cached value exceeds cur_risk-cur_gap.
+                if (best_val > cur_risk-cur_gap)
+                {
+                    psi = rec.psi[best_idx];
+                    rec.lru_count[best_idx] += 1;
+                    return;
+                }
+            }
+
+            // Cache miss (or cache bypassed): ask the real oracle, then remember its answer.
+            separation_oracle(idx, current_solution, loss, psi);
+
+            if (cache.size() != 0)
+            {
+                if (cache[idx].loss.size() < max_cache_size)
+                {
+                    cache[idx].loss.push_back(loss);
+                    cache[idx].psi.push_back(psi);
+                    cache[idx].lru_count.push_back(cache[idx].lru_count.size());
+                }
+                else
+                {
+                    // find least recently used cache entry for idx-th sample
+                    const long i       = index_of_min(vector_to_matrix(cache[idx].lru_count));
+
+                    // save our new data in the cache
+                    cache[idx].loss[i] = loss;
+                    cache[idx].psi[i]  = psi;
+
+                    const long max_use = max(vector_to_matrix(cache[idx].lru_count));
+                    // Make sure this new cache entry has the best lru count since we have used
+                    // it most recently.
+                    cache[idx].lru_count[i] = max_use + 1;
+                }
+            }
+        }
+        // Cache for one sample; the three vectors are parallel (entry i of each belongs together).
+        struct cache_record
+        {
+            std::vector<scalar_type> loss;
+            std::vector<feature_vector_type> psi;
+            std::vector<long> lru_count;
+        };
+
+
+        mutable scalar_type cur_risk;      // risk from the last get_risk() call
+        mutable scalar_type cur_gap;       // current_error_gap/C from the last optimization_status() call
+        mutable matrix_type psi_true;      // built lazily in get_risk(); empty until then
+        scalar_type eps;
+        mutable bool verbose;
+
+        mutable std::vector<cache_record> cache;
+        mutable bool skip_cache;           // when true, oracle queries bypass the cache
+        unsigned long max_cache_size;      // max cached entries per sample; 0 disables caching
+
+        scalar_type C;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_STRUCTURAL_SVM_PRObLEM_H__
+
diff --git a/dlib/svm/structural_svm_problem_abstract.h b/dlib/svm/structural_svm_problem_abstract.h
new file mode 100644
index 00000000..3333f809
--- /dev/null
+++ b/dlib/svm/structural_svm_problem_abstract.h
@@ -0,0 +1,166 @@
+// Copyright (C) 2011  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_H__
+#ifdef DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_H__
+
+#include "../optimization/optimization_oca_abstract.h"
+#include "sparse_vector_abstract.h"
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type,
+        typename feature_vector_type_ 
+        >
+    class structural_svm_problem : public oca_problem<matrix_type> 
+    {
+    public:
+        /*!
+            INITIAL VALUE
+                - get_epsilon() == 0.001
+                - get_max_cache_size() == 10
+                - get_c() == 1
+                - This object will not be verbose
+
+            WHAT THIS OBJECT REPRESENTS
+                An oca_problem describing a structural SVM training task; users derive from it and implement the pure virtual routines below.
+        !*/
+
+        typedef typename matrix_type::type scalar_type;
+        typedef feature_vector_type_ feature_vector_type;
+
+        structural_svm_problem (
+        );
+        /*!
+            ensures
+                - this object is properly initialized
+        !*/
+
+        void set_epsilon (
+            scalar_type eps_
+        );
+        /*!
+            requires
+                - eps_ > 0
+            ensures
+                - #get_epsilon() == eps_
+        !*/
+
+        const scalar_type get_epsilon (
+        ) const;
+        /*! ensures - returns the stopping tolerance on the relative error gap
+                      (i.e. the error gap divided by the objective value). !*/
+
+        void set_max_cache_size (
+            unsigned long max_size
+        );
+        /*!
+            ensures
+                - #get_max_cache_size() == max_size
+        !*/
+
+        unsigned long get_max_cache_size (
+        ) const; 
+        /*!
+            ensures
+                - Returns the number of joint feature vectors per training sample kept in 
+                  the separation oracle cache.  This cache is used to avoid unnecessary 
+                  calls to the separation oracle.  Note that a value of 0 means that 
+                  caching is not used at all.  This is appropriate if the separation
+                  oracle is cheap to evaluate. 
+        !*/
+
+        void be_verbose (
+        );
+        /*! ensures - this object will print the objective value, error gap, number of
+                      cutting planes and iteration count to standard out while solving. !*/
+
+        void be_quiet(
+        );
+        /*! ensures - this object will not print anything to standard out.
+        !*/
+
+        scalar_type get_c (
+        ) const; 
+        /*! ensures - returns the value most recently given to set_c()
+                      (or 1 if set_c() has never been called). !*/
+
+        void set_c (
+            scalar_type C_
+        );
+        /*!
+            requires
+                - C_ > 0
+            ensures
+                - #get_c() == C_
+        !*/
+
+    // --------------------------------
+    //     User supplied routines
+    // --------------------------------
+
+        virtual long get_num_dimensions (
+        ) const = 0;
+        /*!
+            ensures
+                - returns the dimensionality of a joint feature vector
+        !*/
+
+        virtual long get_num_samples (
+        ) const = 0;
+        /*!
+            ensures
+                - returns the number of training samples in this problem. 
+        !*/
+
+        virtual void get_truth_joint_feature_vector (
+            long idx,
+            feature_vector_type& psi 
+        ) const = 0;
+        /*!
+            requires
+                - 0 <= idx < get_num_samples()
+            ensures
+                - #psi == PSI(x_idx, y_idx)
+                  (i.e. the joint feature vector for sample idx and its true label.)
+        !*/
+
+        virtual void separation_oracle (
+            const long idx,
+            const matrix_type& current_solution,
+            scalar_type& loss,
+            feature_vector_type& psi
+        ) const = 0;
+        /*!
+            requires
+                - 0 <= idx < get_num_samples()
+                - current_solution.size() == get_num_dimensions()
+            ensures
+                - runs the separation oracle on the idx-th sample.  We define this as follows: 
+                    - let X           == the idx-th input sample.
+                    - let PSI(X,y)    == the joint feature vector for input X and an arbitrary label y.
+                    - let F(X,y)      == dot(current_solution,PSI(X,y)).  
+                    - let LOSS(idx,y) == the loss incurred for predicting that the idx-th sample
+                      has a label of y.  
+
+                        Then the separation oracle finds a Y such that: 
+                            Y = argmax over all y: LOSS(idx,y) + F(X,y) 
+                            (i.e. It finds the label which maximizes the above expression.)
+
+                        Finally, we can define the outputs of this function as:
+                        - #loss == LOSS(idx,Y) 
+                        - #psi == PSI(X,Y) 
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_STRUCTURAL_SVM_PRObLEM_ABSTRACT_H__
+
+
-- 
2.18.0