Commit b7a02418 authored by Davis King

Refactored the svm_nu_trainer. Specifically, I pulled the quadratic
programming solver out and made it a separate class. The
kernel_matrix_cache has also been removed in favor of the new
symmetric_matrix_cache. Finally, the remaining bits of the
svm_nu_trainer have been moved into svm_nu_trainer.h.

Also note that invalid_svm_nu_error has been renamed to invalid_nu_error.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403994
parent dbba600f
@@ -6,6 +6,7 @@
#include "optimization/optimization.h"
#include "optimization/optimization_bobyqa.h"
#include "optimization/optimization_solve_qp_using_smo.h"
#include "optimization/optimization_solve_qp2_using_smo.h"
#include "optimization/optimization_oca.h"
#include "optimization/optimization_trust_region.h"
#include "optimization/optimization_least_squares.h"
// Copyright (C) 2007 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#ifndef DLIB_SOLVE_QP2_USING_SMo_H__
#define DLIB_SOLVE_QP2_USING_SMo_H__
#include "optimization_solve_qp2_using_smo_abstract.h"
#include <cmath>
#include <limits>
#include <sstream>
#include "../matrix.h"
#include "../algs.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class invalid_nu_error : public dlib::error
{
public:
invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename T::type maximum_nu_impl (
const T& y
)
{
typedef typename T::type scalar_type;
// make sure requires clause is not broken
DLIB_ASSERT(y.size() > 1 && is_col_vector(y),
"\ttypedef T::type maximum_nu(y)"
<< "\n\ty should be a column vector with more than one entry"
<< "\n\ty.nr(): " << y.nr()
<< "\n\ty.nc(): " << y.nc()
);
long pos_count = 0;
long neg_count = 0;
for (long r = 0; r < y.nr(); ++r)
{
if (y(r) == 1.0)
{
++pos_count;
}
else if (y(r) == -1.0)
{
++neg_count;
}
else
{
// make sure requires clause is not broken
DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
"\ttypedef T::type maximum_nu(y)"
<< "\n\ty should contain only 1 and 0 entries"
<< "\n\tr: " << r
<< "\n\ty(r): " << y(r)
);
}
}
return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
}
template <
typename T
>
typename T::type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
template <
typename T
>
typename T::value_type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
// ----------------------------------------------------------------------------------------
template <
typename matrix_type
>
class solve_qp2_using_smo
{
public:
typedef typename matrix_type::mem_manager_type mem_manager_type;
typedef typename matrix_type::type scalar_type;
typedef typename matrix_type::layout_type layout_type;
typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
template <
typename EXP1,
typename EXP2,
long NR
>
void operator() (
const matrix_exp<EXP1>& Q,
const matrix_exp<EXP2>& y,
const scalar_type nu,
matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
scalar_type eps
)
{
DLIB_ASSERT(Q.nr() == Q.nc() && y.size() == Q.nr() && y.size() > 1 && is_col_vector(y) &&
sum((y == +1) + (y == -1)) == y.size() &&
0 < nu && nu <= 1 &&
eps > 0,
"\t void solve_qp2_using_smo::operator()"
<< "\n\t invalid arguments were given to this function"
<< "\n\t Q.nr(): " << Q.nr()
<< "\n\t Q.nc(): " << Q.nc()
<< "\n\t is_col_vector(y): " << is_col_vector(y)
<< "\n\t y.size(): " << y.size()
<< "\n\t sum((y == +1) + (y == -1)): " << sum((y == +1) + (y == -1))
<< "\n\t nu: " << nu
<< "\n\t eps: " << eps
);
alpha.set_size(Q.nr(),1);
df.set_size(Q.nr());
// now initialize alpha
set_initial_alpha(y, nu, alpha);
const scalar_type tau = 1e-12;
typedef typename colm_exp<EXP1>::type col_type;
set_all_elements(df, 0);
// initialize df. Compute df = Q*alpha
for (long r = 0; r < df.nr(); ++r)
{
if (alpha(r) != 0)
{
df += alpha(r)*matrix_cast<scalar_type>(colm(Q,r));
}
}
// now perform the actual optimization of alpha
long i=0, j=0;
while (find_working_group(y,alpha,Q,df,tau,eps,i,j))
{
const scalar_type old_alpha_i = alpha(i);
const scalar_type old_alpha_j = alpha(j);
optimize_working_pair(alpha,Q,df,tau,i,j);
// update the df vector now that we have modified alpha(i) and alpha(j)
scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
col_type Q_i = colm(Q,i);
col_type Q_j = colm(Q,j);
for(long k = 0; k < df.nr(); ++k)
df(k) += Q_i(k)*delta_alpha_i + Q_j(k)*delta_alpha_j;
}
}
const column_matrix& get_gradient (
) const { return df; }
private:
// -------------------------------------------------------------------------------------
template <
typename scalar_type,
typename scalar_vector_type,
typename scalar_vector_type2
>
inline void set_initial_alpha (
const scalar_vector_type& y,
const scalar_type nu,
scalar_vector_type2& alpha
) const
{
set_all_elements(alpha,0);
const scalar_type l = y.nr();
scalar_type temp = nu*l/2;
long num = (long)std::floor(temp);
long num_total = (long)std::ceil(temp);
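// e.g. with nu == 0.5 and l == 10 (illustrative values): temp == 2.5,
// num == 2, and num_total == 3, so two alphas in this class get set to 1
// and a third gets the fractional remainder 0.5, making the class's
// alphas sum to nu*l/2.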
bool has_slack = false;
int count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
if (y(i) == 1)
{
if (count < num)
{
++count;
alpha(i) = 1;
}
else
{
has_slack = true;
if (num_total > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
}
}
}
if (count != num_total || has_slack == false)
{
std::ostringstream sout;
sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
throw invalid_nu_error(sout.str(),nu);
}
has_slack = false;
count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
if (y(i) == -1)
{
if (count < num)
{
++count;
alpha(i) = 1;
}
else
{
has_slack = true;
if (num_total > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
}
}
}
if (count != num_total || has_slack == false)
{
std::ostringstream sout;
sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
throw invalid_nu_error(sout.str(),nu);
}
}
// ------------------------------------------------------------------------------------
template <
typename scalar_vector_type,
typename scalar_type,
typename EXP,
typename U, typename V
>
inline bool find_working_group (
const V& y,
const U& alpha,
const matrix_exp<EXP>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const scalar_type eps,
long& i_out,
long& j_out
) const
{
using namespace std;
long ip = 0;
long jp = 0;
long in = 0;
long jn = 0;
typedef typename colm_exp<EXP>::type col_type;
typedef typename diag_exp<EXP>::type diag_type;
scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
scalar_type jp_val = numeric_limits<scalar_type>::infinity();
scalar_type in_val = -numeric_limits<scalar_type>::infinity();
scalar_type jn_val = numeric_limits<scalar_type>::infinity();
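// The selection below appears to follow the second order working set
// strategy used by LIBSVM (see the Chang and Lin papers cited in the
// abstract): i is the most KKT-violating index in its class and j is the
// index in the same class giving the largest predicted objective
// decrease, -b*b/a.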
// loop over the alphas and find the maximum ip and in indices.
for (long i = 0; i < alpha.nr(); ++i)
{
if (y(i) == 1)
{
if (alpha(i) < 1.0)
{
if (-df(i) > ip_val)
{
ip_val = -df(i);
ip = i;
}
}
}
else
{
if (alpha(i) > 0.0)
{
if (df(i) > in_val)
{
in_val = df(i);
in = i;
}
}
}
}
scalar_type Mp = numeric_limits<scalar_type>::infinity();
scalar_type Mn = numeric_limits<scalar_type>::infinity();
// Pick out the columns and diagonal of Q we need below. Doing
// it this way is faster if Q is actually a symmetric_matrix_cache
// object.
col_type Q_ip = colm(Q,ip);
col_type Q_in = colm(Q,in);
diag_type Q_diag = diag(Q);
// now we need to find the minimum jp and jn indices
for (long j = 0; j < alpha.nr(); ++j)
{
if (y(j) == 1)
{
if (alpha(j) > 0.0)
{
scalar_type b = ip_val + df(j);
if (-df(j) < Mp)
Mp = -df(j);
if (b > 0)
{
scalar_type a = Q_ip(ip) + Q_diag(j) - 2*Q_ip(j);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jp_val)
{
jp_val = temp;
jp = j;
}
}
}
}
else
{
if (alpha(j) < 1.0)
{
scalar_type b = in_val - df(j);
if (df(j) < Mn)
Mn = df(j);
if (b > 0)
{
scalar_type a = Q_in(in) + Q_diag(j) - 2*Q_in(j);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jn_val)
{
jn_val = temp;
jn = j;
}
}
}
}
}
// if we are at the optimal point then return false so the caller knows
// to stop optimizing
if (std::max(ip_val - Mp, in_val - Mn) < eps)
return false;
if (jp_val < jn_val)
{
i_out = ip;
j_out = jp;
}
else
{
i_out = in;
j_out = jn;
}
if (j_out >= 0 && i_out >= 0)
return true;
else
return false;
}
// ------------------------------------------------------------------------------------
template <
typename EXP,
typename T, typename U
>
inline void optimize_working_pair (
T& alpha,
const matrix_exp<EXP>& Q,
const U& df,
const scalar_type tau,
const long i,
const long j
) const
{
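// The selected pair always comes from the same class, so the constraints
// fix the sum alpha(i) + alpha(j).  We minimize the objective along that
// line analytically (delta is the unconstrained step) and then clip the
// result back into the [0,1] box below.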
scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
if (quad_coef <= 0)
quad_coef = tau;
scalar_type delta = (df(i)-df(j))/quad_coef;
scalar_type sum = alpha(i) + alpha(j);
alpha(i) -= delta;
alpha(j) += delta;
if(sum > 1)
{
if(alpha(i) > 1)
{
alpha(i) = 1;
alpha(j) = sum - 1;
}
else if(alpha(j) > 1)
{
alpha(j) = 1;
alpha(i) = sum - 1;
}
}
else
{
if(alpha(j) < 0)
{
alpha(j) = 0;
alpha(i) = sum;
}
else if(alpha(i) < 0)
{
alpha(i) = 0;
alpha(j) = sum;
}
}
}
// ------------------------------------------------------------------------------------
column_matrix df; // gradient of f(alpha)
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SOLVE_QP2_USING_SMo_H__
// Copyright (C) 2007 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#undef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
#ifdef DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
#include "../matrix/matrix_abstract.h"
#include "../algs.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class invalid_nu_error : public dlib::error
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is an exception class used to indicate that a
value of nu given to the solve_qp2_using_smo object is incompatible
with the constraints of the quadratic program.
this->nu will be set to the invalid value of nu used.
!*/
public:
invalid_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename T::type maximum_nu (
const T& y
);
/*!
requires
- T == a matrix object or an object convertible to a matrix via vector_to_matrix()
- is_col_vector(y) == true
- y.size() > 1
- sum((y == +1) + (y == -1)) == y.size()
(i.e. all elements of y must be equal to +1 or -1)
ensures
- returns the maximum valid nu that can be used with solve_qp2_using_smo and
the given y vector.
(i.e. 2.0*min(sum(y == +1), sum(y == -1))/y.size())
!*/
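// For example (a minimal sketch; the label values are illustrative and
// assume <vector> has been included):
//
//     std::vector<double> y;
//     y.push_back(+1);  y.push_back(+1);  y.push_back(-1);
//     // min(#+1s, #-1s) == 1 and y.size() == 3, so this returns 2.0*1/3
//     double max_nu = maximum_nu(y);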
// ----------------------------------------------------------------------------------------
template <
typename matrix_type
>
class solve_qp2_using_smo
{
/*!
REQUIREMENTS ON matrix_type
Must be some type of dlib::matrix.
WHAT THIS OBJECT REPRESENTS
This object is a tool for solving the following quadratic programming
problem using the sequential minimal optimization algorithm:
Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha
subject to the following constraints:
- sum(alpha) == nu*y.size()
- 0 <= min(alpha) && max(alpha) <= 1
- trans(y)*alpha == 0
Where f is convex. This means that Q should be symmetric and positive-semidefinite.
This object implements the strategy used by the LIBSVM tool and described
by the following papers:
- Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
- Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
!*/
public:
typedef typename matrix_type::mem_manager_type mem_manager_type;
typedef typename matrix_type::type scalar_type;
typedef typename matrix_type::layout_type layout_type;
typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
template <
typename EXP1,
typename EXP2,
long NR
>
void operator() (
const matrix_exp<EXP1>& Q,
const matrix_exp<EXP2>& y,
const scalar_type nu,
matrix<scalar_type,NR,1,mem_manager_type, layout_type>& alpha,
scalar_type eps
);
/*!
requires
- Q.nr() == Q.nc()
- is_col_vector(y) == true
- y.size() == Q.nr()
- y.size() > 1
- sum((y == +1) + (y == -1)) == y.size()
(i.e. all elements of y must be equal to +1 or -1)
- alpha must be capable of representing a vector of size y.size() elements
- 0 < nu <= 1
- eps > 0
ensures
- This function solves the quadratic program defined in this class's main comment.
- The solution to the quadratic program will be stored in #alpha.
- #alpha.size() == y.size()
- This function uses an implementation of the sequential minimal optimization
algorithm. It runs until the KKT violation is less than eps. So eps controls
how accurate the solution is and smaller values result in better solutions.
(a reasonable eps is usually about 1e-3)
- #get_gradient() == Q*(#alpha)
(i.e. stores the gradient of f() at #alpha in get_gradient())
throws
- invalid_nu_error
This exception is thrown if nu >= maximum_nu(y).
(some values of nu cause the constraints to become impossible to satisfy.
If this is detected then an exception is thrown).
!*/
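// A minimal usage sketch (the Q, y, nu, and eps values below are
// illustrative assumptions only):
//
//     matrix<double> Q(2,2);
//     Q = 2, 0,
//         0, 2;
//     matrix<double,0,1> y(2), alpha;
//     y = +1, -1;
//     solve_qp2_using_smo<matrix<double,0,1> > solver;
//     solver(Q, y, 0.5, alpha, 1e-3); // throws invalid_nu_error if nu is too large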
const column_matrix& get_gradient (
) const;
/*!
ensures
- returns the gradient vector at the solution of the last problem solved
by this object. If no problem has been solved then returns an empty
vector.
!*/
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP2_USING_SMO_ABSTRACT_H_
@@ -16,83 +16,11 @@
#include "kernel.h"
#include "../enable_if.h"
#include "../optimization.h"
#include "svm_nu_trainer.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class invalid_svm_nu_error : public dlib::error
{
public:
invalid_svm_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename T::type maximum_nu_impl (
const T& y
)
{
typedef typename T::type scalar_type;
// make sure requires clause is not broken
DLIB_ASSERT(y.nr() > 1 && y.nc() == 1,
"\ttypedef T::type maximum_nu(y)"
<< "\n\ty should be a column vector with more than one entry"
<< "\n\ty.nr(): " << y.nr()
<< "\n\ty.nc(): " << y.nc()
);
long pos_count = 0;
long neg_count = 0;
for (long r = 0; r < y.nr(); ++r)
{
if (y(r) == 1.0)
{
++pos_count;
}
else if (y(r) == -1.0)
{
++neg_count;
}
else
{
// make sure requires clause is not broken
DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
"\ttypedef T::type maximum_nu(y)"
<< "\n\ty should contain only 1 and 0 entries"
<< "\n\tr: " << r
<< "\n\ty(r): " << y(r)
);
}
}
return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
}
template <
typename T
>
typename T::type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
template <
typename T
>
typename T::value_type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
// ----------------------------------------------------------------------------------------
template <
@@ -135,149 +63,6 @@ namespace dlib
return is_binary_classification_problem_impl(vector_to_matrix(x), vector_to_matrix(x_labels));
}
// ----------------------------------------------------------------------------------------
template <
typename K,
typename sample_vector_type,
typename scalar_vector_type
>
class kernel_matrix_cache
{
public:
typedef float scalar_type;
//typedef typename K::scalar_type scalar_type;
typedef typename K::sample_type sample_type;
typedef typename K::mem_manager_type mem_manager_type;
const sample_vector_type& x;
const scalar_vector_type& y;
K kernel_function;
mutable matrix<scalar_type,0,0,mem_manager_type> cache;
mutable matrix<scalar_type,0,1,mem_manager_type> diag_cache;
mutable matrix<long,0,1,mem_manager_type> lookup;
mutable matrix<long,0,1,mem_manager_type> rlookup;
mutable long next;
/*!
INITIAL VALUE
- for all valid x:
- lookup(x) == -1
- rlookup(x) == -1
CONVENTION
- if (lookup(c) != -1) then
- cache(lookup(c),*) == the cached column c of the kernel matrix
- rlookup(lookup(c)) == c
- if (rlookup(x) != -1) then
- lookup(rlookup(x)) == x
- cache(x,*) == the cached column rlookup(x) of the kernel matrix
- next == the next row in the cache table to use to cache something
!*/
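// e.g. if column 5 of the kernel matrix is cached in row 2 of the cache
// table then lookup(5) == 2 and rlookup(2) == 5 (illustrative indices).
// When row next is recycled, the displaced column's lookup entry is
// reset back to -1.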
public:
kernel_matrix_cache (
const sample_vector_type& x_,
const scalar_vector_type& y_,
K kernel_function_,
long max_size_megabytes
) : x(x_), y(y_), kernel_function(kernel_function_)
{
// figure out how many rows of the kernel matrix we can have
// with the given amount of memory.
long max_size = (max_size_megabytes*1024*1024)/(x.nr()*sizeof(scalar_type));
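// i.e. each cached row holds x.nr() scalar_type values, so max_size is
// the number of whole rows that fit in the requested memory budget.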
// don't let it be 0
if (max_size == 0)
max_size = 1;
long size = std::min(max_size,x.nr());
diag_cache.set_size(x.nr(),1);
cache.set_size(size,x.nr());
lookup.set_size(x.nr(),1);
rlookup.set_size(size,1);
set_all_elements(lookup,-1);
set_all_elements(rlookup,-1);
next = 0;
for (long i = 0; i < diag_cache.nr(); ++i)
diag_cache(i) = kernel_function(x(i),x(i));
}
inline bool is_cached (
long r
) const
{
return (lookup(r) != -1);
}
const scalar_type* col(long i) const
{
if (is_cached(i) == false)
add_col_to_cache(i);
// find where this column is in the cache
long idx = lookup(i);
if (idx == next)
{
// if this column was the next to be replaced
// then make sure that doesn't happen
next = (next + 1)%cache.nr();
}
return &cache(idx,0);
}
const scalar_type* diag() const { return &diag_cache(0); }
inline scalar_type operator () (
long r,
long c
) const
{
if (lookup(c) != -1)
{
return cache(lookup(c),r);
}
else if (r == c)
{
return diag_cache(r);
}
else if (lookup(r) != -1)
{
// the kernel is symmetric so this is legit
return cache(lookup(r),c);
}
else
{
add_col_to_cache(c);
return cache(lookup(c),r);
}
}
private:
void add_col_to_cache(
long c
) const
{
// if the lookup table is pointing to cache(next,*) then clear lookup(next)
if (rlookup(next) != -1)
lookup(rlookup(next)) = -1;
// make the lookup table so that it says c is now cached at the spot indicated by next
lookup(c) = next;
rlookup(next) = c;
// compute this column in the kernel matrix and store it in the cache
for (long i = 0; i < cache.nc(); ++i)
cache(next,i) = y(c)*y(i)*kernel_function(x(c),x(i));
next = (next + 1)%cache.nr();
}
};
// ----------------------------------------------------------------------------------------
template <
@@ -488,7 +273,7 @@ namespace dlib
// do the training and testing
res += test_binary_decision_function(trainer.train(x_train,y_train),x_test,y_test);
}
- catch (invalid_svm_nu_error&)
+ catch (invalid_nu_error&)
{
// Just ignore the error in this case since we are going to
// interpret an invalid nu value the same as generating a decision
@@ -675,8 +460,8 @@ namespace dlib
/*
This function fits a sigmoid function to the output of the
- svm trained by svm_nu_trainer. The technique used is the one
- described in the papers:
+ svm trained by svm_nu_trainer or a similar trainer. The
+ technique used is the one described in the papers:
Probabilistic Outputs for Support Vector Machines and
Comparisons to Regularized Likelihood Methods by
@@ -1021,633 +806,7 @@ namespace dlib
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K
>
class svm_nu_trainer
{
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
) :
nu(0.1),
cache_size(200),
eps(0.001)
{
}
svm_nu_trainer (
const kernel_type& kernel_,
const scalar_type& nu_
) :
kernel_function(kernel_),
nu(nu_),
cache_size(200),
eps(0.001)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < nu && nu <= 1,
"\tsvm_nu_trainer::svm_nu_trainer(kernel,nu)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t nu: " << nu
);
}
void set_cache_size (
long cache_size_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(cache_size_ > 0,
"\tvoid svm_nu_trainer::set_cache_size(cache_size_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t cache_size: " << cache_size_
);
cache_size = cache_size_;
}
long get_cache_size (
) const
{
return cache_size;
}
void set_epsilon (
scalar_type eps_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(eps_ > 0,
"\tvoid svm_nu_trainer::set_epsilon(eps_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t eps: " << eps_
);
eps = eps_;
}
const scalar_type get_epsilon (
) const
{
return eps;
}
void set_kernel (
const kernel_type& k
)
{
kernel_function = k;
}
const kernel_type& get_kernel (
) const
{
return kernel_function;
}
void set_nu (
scalar_type nu_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < nu_ && nu_ <= 1,
"\tvoid svm_nu_trainer::set_nu(nu_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t nu: " << nu_
);
nu = nu_;
}
const scalar_type get_nu (
) const
{
return nu;
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
void swap (
svm_nu_trainer& item
)
{
exchange(kernel_function, item.kernel_function);
exchange(nu, item.nu);
exchange(cache_size, item.cache_size);
exchange(eps, item.eps);
}
private:
// ------------------------------------------------------------------------------------
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> do_train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
typedef typename K::scalar_type scalar_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
// make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y) == true,
"\tdecision_function svm_nu_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.nr(): " << x.nr()
<< "\n\t y.nr(): " << y.nr()
<< "\n\t x.nc(): " << x.nc()
<< "\n\t y.nc(): " << y.nc()
<< "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
);
const scalar_type tau = 1e-12;
scalar_vector_type df; // delta f(alpha)
scalar_vector_type alpha;
kernel_matrix_cache<K, in_sample_vector_type, in_scalar_vector_type> Q(x,y,kernel_function,cache_size);
typedef typename kernel_matrix_cache<K, in_sample_vector_type, in_scalar_vector_type>::scalar_type cache_type;
alpha.set_size(x.nr());
df.set_size(x.nr());
// now initialize alpha
set_initial_alpha(y, nu, alpha);
set_all_elements(df, 0);
// initialize df. Compute df = Q*alpha
for (long r = 0; r < df.nr(); ++r)
{
if (alpha(r) != 0)
{
const cache_type* Q_r = Q.col(r);
for (long c = 0; c < alpha.nr(); ++c)
{
df(c) += alpha(r)*Q_r[c];
}
}
}
// now perform the actual optimization of alpha
long i, j;
while (find_working_group(y,alpha,Q,df,tau,eps,i,j))
{
const scalar_type old_alpha_i = alpha(i);
const scalar_type old_alpha_j = alpha(j);
optimize_working_pair(y,alpha,Q,df,tau,i,j);
// update the df vector now that we have modified alpha(i) and alpha(j)
scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
const cache_type* Q_i = Q.col(i);
const cache_type* Q_j = Q.col(j);
for(long k = 0; k < df.nr(); ++k)
df(k) += Q_i[k]*delta_alpha_i + Q_j[k]*delta_alpha_j;
}
scalar_type rho, b;
calculate_rho_and_b(y,alpha,df,rho,b);
alpha = pointwise_multiply(alpha,y)/rho;
// count the number of support vectors
long sv_count = 0;
for (long i = 0; i < alpha.nr(); ++i)
{
if (alpha(i) != 0)
++sv_count;
}
scalar_vector_type sv_alpha;
sample_vector_type support_vectors;
// size these column vectors so that they have an entry for each support vector
sv_alpha.set_size(sv_count);
support_vectors.set_size(sv_count);
// load the support vectors and their alpha values into these new column matrices
long idx = 0;
for (long i = 0; i < alpha.nr(); ++i)
{
if (alpha(i) != 0)
{
sv_alpha(idx) = alpha(i);
support_vectors(idx) = x(i);
++idx;
}
}
// now return the decision function
return decision_function<K> (sv_alpha, b, kernel_function, support_vectors);
}
// ------------------------------------------------------------------------------------
template <
typename scalar_type,
typename scalar_vector_type,
typename scalar_vector_type2
>
inline void set_initial_alpha (
const scalar_vector_type& y,
const scalar_type nu,
scalar_vector_type2& alpha
) const
{
set_all_elements(alpha,0);
const scalar_type l = y.nr();
scalar_type temp = nu*l/2;
long num = (long)std::floor(temp);
long num_total = (long)std::ceil(temp);
bool has_slack = false;
int count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
if (y(i) == 1)
{
if (count < num)
{
++count;
alpha(i) = 1;
}
else
{
has_slack = true;
if (num_total > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
}
}
}
if (count != num_total || has_slack == false)
{
std::ostringstream sout;
sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
throw invalid_svm_nu_error(sout.str(),nu);
}
has_slack = false;
count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
if (y(i) == -1)
{
if (count < num)
{
++count;
alpha(i) = 1;
}
else
{
has_slack = true;
if (num_total > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
}
}
}
if (count != num_total || has_slack == false)
{
std::ostringstream sout;
sout << "Invalid nu of " << nu << ". It is required that: 0 < nu < " << 2*(scalar_type)count/y.nr();
throw invalid_svm_nu_error(sout.str(),nu);
}
}
// ------------------------------------------------------------------------------------
template <
typename sample_vector_type,
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
inline bool find_working_group (
const scalar_vector_type2& y,
const scalar_vector_type& alpha,
const kernel_matrix_cache<K,sample_vector_type, scalar_vector_type2>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const scalar_type eps,
long& i_out,
long& j_out
) const
{
using namespace std;
long ip = -1;
long jp = -1;
long in = -1;
long jn = -1;
typedef typename kernel_matrix_cache<K, sample_vector_type, scalar_vector_type2>::scalar_type cache_type;
scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
scalar_type jp_val = numeric_limits<scalar_type>::infinity();
scalar_type in_val = -numeric_limits<scalar_type>::infinity();
scalar_type jn_val = numeric_limits<scalar_type>::infinity();
// loop over the alphas and find the maximum ip and in indices.
for (long i = 0; i < alpha.nr(); ++i)
{
if (y(i) == 1)
{
if (alpha(i) < 1.0)
{
if (-df(i) > ip_val)
{
ip_val = -df(i);
ip = i;
}
}
}
else
{
if (alpha(i) > 0.0)
{
if (df(i) > in_val)
{
in_val = df(i);
in = i;
}
}
}
}
scalar_type Mp = numeric_limits<scalar_type>::infinity();
scalar_type Mn = numeric_limits<scalar_type>::infinity();
// As a speed hack, pull out pointers to the columns of the
// kernel matrix we will be using below rather than accessing
// them through the Q(r,c) syntax.
const cache_type* Q_ip = 0;
const cache_type* Q_in = 0;
const cache_type* Q_diag = Q.diag();
if (ip != -1)
Q_ip = Q.col(ip);
if (in != -1)
Q_in = Q.col(in);
// now we need to find the minimum jp and jn indices
for (long j = 0; j < alpha.nr(); ++j)
{
if (y(j) == 1)
{
if (alpha(j) > 0.0)
{
scalar_type b = ip_val + df(j);
if (-df(j) < Mp)
Mp = -df(j);
if (b > 0)
{
scalar_type a = Q_ip[ip] + Q_diag[j] - 2*Q_ip[j];
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jp_val)
{
jp_val = temp;
jp = j;
}
}
}
}
else
{
if (alpha(j) < 1.0)
{
scalar_type b = in_val - df(j);
if (df(j) < Mn)
Mn = df(j);
if (b > 0)
{
scalar_type a = Q_in[in] + Q_diag[j] - 2*Q_in[j];
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jn_val)
{
jn_val = temp;
jn = j;
}
}
}
}
}
// if we are at the optimal point then return false so the caller knows
// to stop optimizing
if (std::max(ip_val - Mp, in_val - Mn) < eps)
return false;
if (jp_val < jn_val)
{
i_out = ip;
j_out = jp;
}
else
{
i_out = in;
j_out = jn;
}
if (j_out >= 0 && i_out >= 0)
return true;
else
return false;
}
// ------------------------------------------------------------------------------------
template <
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
void calculate_rho_and_b(
const scalar_vector_type2& y,
const scalar_vector_type& alpha,
const scalar_vector_type& df,
scalar_type& rho,
scalar_type& b
) const
{
using namespace std;
long num_p_free = 0;
long num_n_free = 0;
scalar_type sum_p_free = 0;
scalar_type sum_n_free = 0;
scalar_type upper_bound_p = -numeric_limits<scalar_type>::infinity();
scalar_type upper_bound_n = -numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_p = numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_n = numeric_limits<scalar_type>::infinity();
for(long i = 0; i < alpha.nr(); ++i)
{
if(y(i) == 1)
{
if(alpha(i) == 1)
{
if (df(i) > upper_bound_p)
upper_bound_p = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_p)
lower_bound_p = df(i);
}
else
{
++num_p_free;
sum_p_free += df(i);
}
}
else
{
if(alpha(i) == 1)
{
if (df(i) > upper_bound_n)
upper_bound_n = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_n)
lower_bound_n = df(i);
}
else
{
++num_n_free;
sum_n_free += df(i);
}
}
}
scalar_type r1,r2;
if(num_p_free > 0)
r1 = sum_p_free/num_p_free;
else
r1 = (upper_bound_p+lower_bound_p)/2;
if(num_n_free > 0)
r2 = sum_n_free/num_n_free;
else
r2 = (upper_bound_n+lower_bound_n)/2;
rho = (r1+r2)/2;
b = (r1-r2)/2/rho;
}
// ------------------------------------------------------------------------------------
template <
typename sample_vector_type,
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
inline void optimize_working_pair (
const scalar_vector_type2& ,
scalar_vector_type& alpha,
const kernel_matrix_cache<K, sample_vector_type, scalar_vector_type2>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const long i,
const long j
) const
{
scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
if (quad_coef <= 0)
quad_coef = tau;
scalar_type delta = (df(i)-df(j))/quad_coef;
scalar_type sum = alpha(i) + alpha(j);
alpha(i) -= delta;
alpha(j) += delta;
if(sum > 1)
{
if(alpha(i) > 1)
{
alpha(i) = 1;
alpha(j) = sum - 1;
}
else if(alpha(j) > 1)
{
alpha(j) = 1;
alpha(i) = sum - 1;
}
}
else
{
if(alpha(j) < 0)
{
alpha(j) = 0;
alpha(i) = sum;
}
else if(alpha(i) < 0)
{
alpha(i) = 0;
alpha(j) = sum;
}
}
}
// ------------------------------------------------------------------------------------
kernel_type kernel_function;
scalar_type nu;
long cache_size;
scalar_type eps;
}; // end of class svm_nu_trainer
// ----------------------------------------------------------------------------------------
template <typename K>
void swap (
svm_nu_trainer<K>& a,
svm_nu_trainer<K>& b
) { a.swap(b); }
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SVm_
@@ -11,52 +11,13 @@
#include "../serialize.h"
#include "function_abstract.h"
#include "kernel_abstract.h"
#include "svm_nu_trainer_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class invalid_svm_nu_error : public dlib::error
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is an exception class used to indicate that a
value of nu used for svm training is incompatible with a
particular data set.
this->nu will be set to the invalid value of nu used.
!*/
public:
invalid_svm_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename T::type maximum_nu (
const T& y
);
/*!
requires
- T == a matrix object or an object convertible to a matrix via
vector_to_matrix()
- y.nc() == 1
- y.nr() > 1
- for all valid i:
- y(i) == -1 or +1
ensures
- returns the maximum valid nu that can be used with the svm_nu_trainer and
the training set labels from the given y vector.
(i.e. 2.0*min(number of +1 examples in y, number of -1 examples in y)/y.nr())
!*/
// ----------------------------------------------------------------------------------------
template <
@@ -85,189 +46,6 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K
>
class svm_nu_trainer
{
/*!
REQUIREMENTS ON K
is a kernel function object as defined in dlib/svm/kernel_abstract.h
WHAT THIS OBJECT REPRESENTS
This object implements a trainer for a nu support vector machine for
solving binary classification problems.
The implementation of the nu-svm training algorithm used by this object is based
on the following excellent papers:
- Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
- Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
!*/
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
);
/*!
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_nu() == 0.1
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
svm_nu_trainer (
const kernel_type& kernel,
const scalar_type& nu
);
/*!
requires
- 0 < nu <= 1
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_kernel() == kernel
- #get_nu() == nu
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
void set_cache_size (
long cache_size
);
/*!
requires
- cache_size > 0
ensures
- #get_cache_size() == cache_size
!*/
const long get_cache_size (
) const;
/*!
ensures
- returns the number of megabytes of cache this object will use
when it performs training via the this->train() function.
(bigger values of this may make training go faster but won't affect
the result. However, too big a value will cause you to run out of
memory, obviously.)
!*/
void set_epsilon (
scalar_type eps
);
/*!
requires
- eps > 0
ensures
- #get_epsilon() == eps
!*/
const scalar_type get_epsilon (
) const;
/*!
ensures
- returns the error epsilon that determines when training should stop.
Generally a good value for this is 0.001. Smaller values may result
in a more accurate solution but take longer to execute.
!*/
void set_kernel (
const kernel_type& k
);
/*!
ensures
- #get_kernel() == k
!*/
const kernel_type& get_kernel (
) const;
/*!
ensures
- returns a copy of the kernel function in use by this object
!*/
void set_nu (
scalar_type nu
);
/*!
requires
- 0 < nu <= 1
ensures
- #get_nu() == nu
!*/
const scalar_type get_nu (
) const;
/*!
ensures
- returns the nu svm parameter. This is a value between 0 and
1. It is the parameter that determines the trade off between
trying to fit the training data exactly or allowing more errors
but hopefully improving the generalization ability of the
resulting classifier. Smaller values encourage exact fitting
while larger values of nu may encourage better generalization.
For more information you should consult the papers referenced
above.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const;
/*!
requires
- is_binary_classification_problem(x,y) == true
- x == a matrix or something convertible to a matrix via vector_to_matrix().
Also, x should contain sample_type objects.
- y == a matrix or something convertible to a matrix via vector_to_matrix().
Also, y should contain scalar_type objects.
ensures
- trains a nu support vector classifier given the training samples in x and
labels in y. Training is done when the error is less than get_epsilon().
- returns a decision function F with the following properties:
- if (new_x is a sample predicted to have a +1 label) then
- F(new_x) >= 0
- else
- F(new_x) < 0
throws
- invalid_svm_nu_error
This exception is thrown if get_nu() >= maximum_nu(y)
- std::bad_alloc
!*/
void swap (
svm_nu_trainer& item
);
/*!
ensures
- swaps *this and item
!*/
};
template <typename K>
void swap (
svm_nu_trainer<K>& a,
svm_nu_trainer<K>& b
) { a.swap(b); }
/*!
provides a global swap
!*/
// ----------------------------------------------------------------------------------------
template <
@@ -288,9 +66,8 @@ namespace dlib
- is_binary_classification_problem(x,y) == true
- trainer_type == some kind of batch trainer object (e.g. svm_nu_trainer)
ensures
- - trains a nu support vector classifier given the training samples in x and
- labels in y.
- - returns a probabilistic_decision_function that represents the trained svm.
+ - trains a classifier given the training samples in x and labels in y.
+ - returns a probabilistic_decision_function that represents the trained classifier.
- The parameters of the probability model are estimated by performing k-fold
cross validation.
- The number of folds used is given by the folds argument.
// Copyright (C) 2007 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#ifndef DLIB_SVm_NU_TRAINER_H__
#define DLIB_SVm_NU_TRAINER_H__
//#include "local/make_label_kernel_matrix.h"
#include "svm_nu_trainer_abstract.h"
#include <cmath>
#include <limits>
#include <sstream>
#include "../matrix.h"
#include "../algs.h"
#include "../serialize.h"
#include "function.h"
#include "kernel.h"
#include "../optimization/optimization_solve_qp2_using_smo.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename K
>
class svm_nu_trainer
{
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
) :
nu(0.1),
cache_size(200),
eps(0.001)
{
}
svm_nu_trainer (
const kernel_type& kernel_,
const scalar_type& nu_
) :
kernel_function(kernel_),
nu(nu_),
cache_size(200),
eps(0.001)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < nu && nu <= 1,
"\tsvm_nu_trainer::svm_nu_trainer(kernel,nu)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t nu: " << nu
);
}
void set_cache_size (
long cache_size_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(cache_size_ > 0,
"\tvoid svm_nu_trainer::set_cache_size(cache_size_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t cache_size: " << cache_size_
);
cache_size = cache_size_;
}
long get_cache_size (
) const
{
return cache_size;
}
void set_epsilon (
scalar_type eps_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(eps_ > 0,
"\tvoid svm_nu_trainer::set_epsilon(eps_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t eps: " << eps_
);
eps = eps_;
}
const scalar_type get_epsilon (
) const
{
return eps;
}
void set_kernel (
const kernel_type& k
)
{
kernel_function = k;
}
const kernel_type& get_kernel (
) const
{
return kernel_function;
}
void set_nu (
scalar_type nu_
)
{
// make sure requires clause is not broken
DLIB_ASSERT(0 < nu_ && nu_ <= 1,
"\tvoid svm_nu_trainer::set_nu(nu_)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t nu: " << nu_
);
nu = nu_;
}
const scalar_type get_nu (
) const
{
return nu;
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
void swap (
svm_nu_trainer& item
)
{
exchange(kernel_function, item.kernel_function);
exchange(nu, item.nu);
exchange(cache_size, item.cache_size);
exchange(eps, item.eps);
}
private:
// ------------------------------------------------------------------------------------
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> do_train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
typedef typename K::scalar_type scalar_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
// make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y) == true,
"\tdecision_function svm_nu_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.nr(): " << x.nr()
<< "\n\t y.nr(): " << y.nr()
<< "\n\t x.nc(): " << x.nc()
<< "\n\t y.nc(): " << y.nc()
<< "\n\t is_binary_classification_problem(x,y): " << is_binary_classification_problem(x,y)
);
scalar_vector_type alpha;
solve_qp2_using_smo<scalar_vector_type> solver;
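// The QP's Q matrix is Q(r,c) == y(r)*y(c)*kernel_function(x(r),x(c)).
// Wrapping it in a symmetric_matrix_cache should keep at most cache_size
// megabytes of it in memory at once, with columns computed as the solver
// requests them.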
solver(symmetric_matrix_cache<float>((diagm(y)*kernel_matrix(kernel_function,x)*diagm(y)), cache_size),
//solver(symmetric_matrix_cache<float>(make_label_kernel_matrix(kernel_matrix(kernel_function,x),y), cache_size),
y,
nu,
alpha,
eps);
scalar_type rho, b;
calculate_rho_and_b(y,alpha,solver.get_gradient(),rho,b);
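// Rescale the raw QP solution into decision function coefficients below:
// each alpha gets signed by its label and divided by rho (the scaling
// described in the Chang and Lin paper referenced in the abstract).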
alpha = pointwise_multiply(alpha,y)/rho;
// count the number of support vectors
const long sv_count = sum(alpha != 0);
scalar_vector_type sv_alpha;
sample_vector_type support_vectors;
// size these column vectors so that they have an entry for each support vector
sv_alpha.set_size(sv_count);
support_vectors.set_size(sv_count);
// load the support vectors and their alpha values into these new column matrices
long idx = 0;
for (long i = 0; i < alpha.nr(); ++i)
{
if (alpha(i) != 0)
{
sv_alpha(idx) = alpha(i);
support_vectors(idx) = x(i);
++idx;
}
}
// now return the decision function
return decision_function<K> (sv_alpha, b, kernel_function, support_vectors);
}
// ------------------------------------------------------------------------------------
template <
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
void calculate_rho_and_b(
const scalar_vector_type2& y,
const scalar_vector_type& alpha,
const scalar_vector_type& df,
scalar_type& rho,
scalar_type& b
) const
{
using namespace std;
long num_p_free = 0;
long num_n_free = 0;
scalar_type sum_p_free = 0;
scalar_type sum_n_free = 0;
scalar_type upper_bound_p = -numeric_limits<scalar_type>::infinity();
scalar_type upper_bound_n = -numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_p = numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_n = numeric_limits<scalar_type>::infinity();
for(long i = 0; i < alpha.nr(); ++i)
{
if(y(i) == 1)
{
if(alpha(i) == 1)
{
if (df(i) > upper_bound_p)
upper_bound_p = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_p)
lower_bound_p = df(i);
}
else
{
++num_p_free;
sum_p_free += df(i);
}
}
else
{
if(alpha(i) == 1)
{
if (df(i) > upper_bound_n)
upper_bound_n = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_n)
lower_bound_n = df(i);
}
else
{
++num_n_free;
sum_n_free += df(i);
}
}
}
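// r1 and r2 estimate the gradient value on the decision boundary for the
// +1 and -1 classes: the average over free (0 < alpha < 1) support
// vectors when any exist, otherwise the midpoint of the bounds found
// above.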
scalar_type r1,r2;
if(num_p_free > 0)
r1 = sum_p_free/num_p_free;
else
r1 = (upper_bound_p+lower_bound_p)/2;
if(num_n_free > 0)
r2 = sum_n_free/num_n_free;
else
r2 = (upper_bound_n+lower_bound_n)/2;
rho = (r1+r2)/2;
b = (r1-r2)/2/rho;
}
// ------------------------------------------------------------------------------------
kernel_type kernel_function;
scalar_type nu;
long cache_size;
scalar_type eps;
}; // end of class svm_nu_trainer
// ----------------------------------------------------------------------------------------
template <typename K>
void swap (
svm_nu_trainer<K>& a,
svm_nu_trainer<K>& b
) { a.swap(b); }
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SVm_NU_TRAINER_H__
// Copyright (C) 2007 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#undef DLIB_SVm_NU_TRAINER_ABSTRACT_
#ifdef DLIB_SVm_NU_TRAINER_ABSTRACT_
#include <cmath>
#include <limits>
#include <sstream>
#include "../matrix/matrix_abstract.h"
#include "../algs.h"
#include "../serialize.h"
#include "function_abstract.h"
#include "kernel_abstract.h"
#include "../optimization/optimization_solve_qp2_using_smo_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename K
>
class svm_nu_trainer
{
/*!
REQUIREMENTS ON K
is a kernel function object as defined in dlib/svm/kernel_abstract.h
WHAT THIS OBJECT REPRESENTS
This object implements a trainer for a nu support vector machine for
solving binary classification problems.
The implementation of the nu-svm training algorithm used by this object is based
on the following excellent papers:
- Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
- Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
!*/
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
);
/*!
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_nu() == 0.1
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
svm_nu_trainer (
const kernel_type& kernel,
const scalar_type& nu
);
/*!
requires
- 0 < nu <= 1
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_kernel() == kernel
- #get_nu() == nu
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
void set_cache_size (
long cache_size
);
/*!
requires
- cache_size > 0
ensures
- #get_cache_size() == cache_size
!*/
const long get_cache_size (
) const;
/*!
ensures
- returns the number of megabytes of cache this object will use
when it performs training via the this->train() function.
(bigger values of this may make training go faster but won't affect
the result. However, too big a value will cause you to run out of
memory, obviously.)
!*/
void set_epsilon (
scalar_type eps
);
/*!
requires
- eps > 0
ensures
- #get_epsilon() == eps
!*/
const scalar_type get_epsilon (
) const;
/*!
ensures
- returns the error epsilon that determines when training should stop.
Generally a good value for this is 0.001. Smaller values may result
in a more accurate solution but take longer to execute.
!*/
void set_kernel (
const kernel_type& k
);
/*!
ensures
- #get_kernel() == k
!*/
const kernel_type& get_kernel (
) const;
/*!
ensures
- returns a copy of the kernel function in use by this object
!*/
void set_nu (
scalar_type nu
);
/*!
requires
- 0 < nu <= 1
ensures
- #get_nu() == nu
!*/
const scalar_type get_nu (
) const;
/*!
ensures
- returns the nu svm parameter. This is a value between 0 and
1. It is the parameter that determines the trade off between
trying to fit the training data exactly or allowing more errors
but hopefully improving the generalization ability of the
resulting classifier. Smaller values encourage exact fitting
while larger values of nu may encourage better generalization.
For more information you should consult the papers referenced
above.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const;
/*!
requires
- is_binary_classification_problem(x,y) == true
- x == a matrix or something convertible to a matrix via vector_to_matrix().
Also, x should contain sample_type objects.
- y == a matrix or something convertible to a matrix via vector_to_matrix().
Also, y should contain scalar_type objects.
ensures
- trains a nu support vector classifier given the training samples in x and
labels in y. Training is done when the error is less than get_epsilon().
- returns a decision function F with the following properties:
- if (new_x is a sample predicted to have a +1 label) then
- F(new_x) >= 0
- else
- F(new_x) < 0
throws
- invalid_nu_error
This exception is thrown if get_nu() >= maximum_nu(y)
- std::bad_alloc
!*/
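// A minimal training sketch (the kernel choice, the gamma value of 0.1,
// and the sample/label setup are illustrative assumptions only):
//
//     typedef matrix<double,2,1> sample_type;
//     typedef radial_basis_kernel<sample_type> kernel_type;
//     std::vector<sample_type> samples;
//     std::vector<double> labels;
//     // ... fill samples with points and labels with +1/-1 values ...
//     svm_nu_trainer<kernel_type> trainer;
//     trainer.set_kernel(kernel_type(0.1));
//     trainer.set_nu(0.1);
//     decision_function<kernel_type> df = trainer.train(samples, labels);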
void swap (
svm_nu_trainer& item
);
/*!
ensures
- swaps *this and item
!*/
};
template <typename K>
void swap (
svm_nu_trainer<K>& a,
svm_nu_trainer<K>& b
) { a.swap(b); }
/*!
provides a global swap
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_SVm_NU_TRAINER_ABSTRACT_
@@ -58,7 +58,7 @@ namespace dlib
// on very large datasets. Every bit of freed memory helps out.
j = job<trainer_type>();
}
- catch (invalid_svm_nu_error&)
+ catch (invalid_nu_error&)
{
// If this is a svm_nu_trainer then we might get this exception if the nu is
// invalid. In this case just return a cross validation score of 0.