Commit 3c7c8fee authored by Davis King

Refactored a bunch of the svm training code into a much cleaner form.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%402380
parent b51cc5c2
@@ -19,12 +19,21 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
class invalid_svm_nu_error : public dlib::error
{
public:
invalid_svm_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename T::type maximum_nu (
typename T::type maximum_nu_impl (
const T& y
)
{
@@ -63,10 +72,67 @@ namespace dlib
return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
}
template <
typename T
>
typename T::type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
template <
typename T
>
typename T::value_type maximum_nu (
const T& y
)
{
return maximum_nu_impl(vector_to_matrix(y));
}
// ----------------------------------------------------------------------------------------
template <
typename K
typename T,
typename U
>
bool is_binary_classification_problem_impl (
const T& x,
const U& x_labels
)
{
if (x.nc() != 1 || x_labels.nc() != 1) return false;
if (x.nr() != x_labels.nr()) return false;
if (x.nr() <= 1) return false;
for (long r = 0; r < x_labels.nr(); ++r)
{
if (x_labels(r) != -1 && x_labels(r) != 1)
return false;
}
return true;
}
template <
typename T,
typename U
>
bool is_binary_classification_problem (
const T& x,
const U& x_labels
)
{
return is_binary_classification_problem_impl(vector_to_matrix(x), vector_to_matrix(x_labels));
}
// ----------------------------------------------------------------------------------------
template <
typename K,
typename sample_vector_type,
typename scalar_vector_type
>
class kernel_matrix_cache
{
@@ -74,8 +140,8 @@ namespace dlib
typedef typename K::sample_type sample_type;
typedef typename K::mem_manager_type mem_manager_type;
const matrix<sample_type,0,1,mem_manager_type>& x;
const matrix<scalar_type,0,1,mem_manager_type>& y;
const sample_vector_type& x;
const scalar_vector_type& y;
K kernel_function;
mutable matrix<scalar_type,0,0,mem_manager_type> cache;
@@ -104,8 +170,8 @@ namespace dlib
public:
kernel_matrix_cache (
const matrix<sample_type,0,1,mem_manager_type>& x_,
const matrix<scalar_type,0,1,mem_manager_type>& y_,
const sample_vector_type& x_,
const scalar_vector_type& y_,
K kernel_function_,
long max_size_megabytes
) : x(x_), y(y_), kernel_function(kernel_function_)
@@ -180,397 +246,712 @@ namespace dlib
// ----------------------------------------------------------------------------------------
template <
typename scalar_type,
typename scalar_vector_type
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
inline void set_initial_alpha (
const scalar_vector_type& y,
const scalar_type nu,
scalar_vector_type& alpha
const matrix<typename trainer_type::scalar_type, 1, 2, typename trainer_type::mem_manager_type>
cross_validate_trainer_impl (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
)
{
set_all_elements(alpha,0);
const scalar_type l = y.nr();
scalar_type temp = nu*l/2;
long num = (long)std::floor(temp);
long num_total = (long)std::ceil(temp);
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef matrix<sample_type,0,1,mem_manager_type> sample_vector_type;
typedef matrix<scalar_type,0,1,mem_manager_type> scalar_vector_type;
int count = 0;
for (int i = 0; i < alpha.nr(); ++i)
// make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y) == true &&
1 < folds && folds <= x.nr(),
"\tmatrix cross_validate_trainer()"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.nr(): " << x.nr()
<< "\n\t y.nr(): " << y.nr()
<< "\n\t x.nc(): " << x.nc()
<< "\n\t y.nc(): " << y.nc()
<< "\n\t folds: " << folds
<< "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
);
// count the number of positive and negative examples
long num_pos = 0;
long num_neg = 0;
for (long r = 0; r < y.nr(); ++r)
{
if (y(i) == 1)
if (y(r) == +1.0)
++num_pos;
else
++num_neg;
}
// figure out how many positive and negative examples we will have in each fold
const long num_pos_test_samples = num_pos/folds;
const long num_pos_train_samples = num_pos - num_pos_test_samples;
const long num_neg_test_samples = num_neg/folds;
const long num_neg_train_samples = num_neg - num_neg_test_samples;
long num_pos_correct = 0;
long num_neg_correct = 0;
typename trainer_type::trained_function_type d;
sample_vector_type x_test, x_train;
scalar_vector_type y_test, y_train;
x_test.set_size (num_pos_test_samples + num_neg_test_samples);
y_test.set_size (num_pos_test_samples + num_neg_test_samples);
x_train.set_size(num_pos_train_samples + num_neg_train_samples);
y_train.set_size(num_pos_train_samples + num_neg_train_samples);
long pos_idx = 0;
long neg_idx = 0;
for (long i = 0; i < folds; ++i)
{
if (count < num)
long cur = 0;
// load up our positive test samples
while (cur < num_pos_test_samples)
{
++count;
alpha(i) = 1;
if (y(pos_idx) == +1.0)
{
x_test(cur) = x(pos_idx);
y_test(cur) = +1.0;
++cur;
}
else
pos_idx = (pos_idx+1)%x.nr();
}
// load up our negative test samples
while (cur < x_test.nr())
{
if (temp > num)
if (y(neg_idx) == -1.0)
{
++count;
alpha(i) = temp - std::floor(temp);
x_test(cur) = x(neg_idx);
y_test(cur) = -1.0;
++cur;
}
break;
neg_idx = (neg_idx+1)%x.nr();
}
// load the training data from the data following whatever we loaded
// as the testing data
long train_pos_idx = pos_idx;
long train_neg_idx = neg_idx;
cur = 0;
// load up our positive train samples
while (cur < num_pos_train_samples)
{
if (y(train_pos_idx) == +1.0)
{
x_train(cur) = x(train_pos_idx);
y_train(cur) = +1.0;
++cur;
}
train_pos_idx = (train_pos_idx+1)%x.nr();
}
if (count != num_total)
// load up our negative train samples
while (cur < x_train.nr())
{
std::ostringstream sout;
sout << "invalid nu of " << nu << ". Must be between 0 and " << (scalar_type)count/y.nr();
throw error(sout.str());
if (y(train_neg_idx) == -1.0)
{
x_train(cur) = x(train_neg_idx);
y_train(cur) = -1.0;
++cur;
}
train_neg_idx = (train_neg_idx+1)%x.nr();
}
count = 0;
for (int i = 0; i < alpha.nr(); ++i)
// do the training
d = trainer.train(x_train,y_train);
// now test this fold
for (long i = 0; i < x_test.nr(); ++i)
{
if (y(i) == -1)
// if this is a positive example
if (y_test(i) == +1.0)
{
if (count < num)
if (d(x_test(i)) >= 0)
++num_pos_correct;
}
else if (y_test(i) == -1.0)
{
++count;
alpha(i) = 1;
if (d(x_test(i)) < 0)
++num_neg_correct;
}
else
{
if (temp > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
throw dlib::error("invalid input labels to the cross_validate_trainer() function");
}
}
} // for (long i = 0; i < folds; ++i)
matrix<scalar_type, 1, 2, mem_manager_type> res;
res(0) = (scalar_type)num_pos_correct/(scalar_type)(num_pos_test_samples*folds);
res(1) = (scalar_type)num_neg_correct/(scalar_type)(num_neg_test_samples*folds);
return res;
}
if (count != num_total)
template <
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const matrix<typename trainer_type::scalar_type, 1, 2, typename trainer_type::mem_manager_type>
cross_validate_trainer (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
)
{
std::ostringstream sout;
sout << "invalid nu of " << nu << ". Must be between 0 and " << (scalar_type)count/y.nr();
throw error(sout.str());
}
return cross_validate_trainer_impl(trainer,
vector_to_matrix(x),
vector_to_matrix(y),
folds);
}
// ----------------------------------------------------------------------------------------
template <
typename K,
typename scalar_vector_type,
typename scalar_type
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
inline bool find_working_group (
const scalar_vector_type& y,
const scalar_vector_type& alpha,
const kernel_matrix_cache<K>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const scalar_type eps,
long& i_out,
long& j_out
const probabilistic_decision_function<typename trainer_type::kernel_type> train_probabilistic_decision_function_impl (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
)
{
using namespace std;
long ip = -1;
long jp = -1;
long in = -1;
long jn = -1;
typedef typename trainer_type::sample_type sample_type;
typedef typename trainer_type::scalar_type scalar_type;
typedef typename trainer_type::mem_manager_type mem_manager_type;
typedef typename trainer_type::kernel_type K;
scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
scalar_type jp_val = numeric_limits<scalar_type>::infinity();
scalar_type in_val = -numeric_limits<scalar_type>::infinity();
scalar_type jn_val = numeric_limits<scalar_type>::infinity();
// loop over the alphas and find the maximum ip and in indices.
for (long i = 0; i < alpha.nr(); ++i)
{
if (y(i) == 1)
{
if (alpha(i) < 1.0)
{
if (-df(i) > ip_val)
/*
This function fits a sigmoid function to the output of the
svm trained by the given trainer. The technique used is the one
described in the paper:
Probabilistic Outputs for Support Vector Machines and
Comparisons to Regularized Likelihood Methods by
John C. Platt. March 26, 1999
*/
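// Concretely, the model fit below is p(y == +1 | f) = 1/(1 + exp(A*f + B)),
// where f is the raw output of the learned decision function. A and B are
// found by minimizing the cross-entropy error of this sigmoid on the held
// out folds, using the stabilized Newton iteration given further down.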
// make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y) == true &&
1 < folds && folds <= x.nr(),
"\tprobabilistic_decision_function train_probabilistic_decision_function()"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.nr(): " << x.nr()
<< "\n\t y.nr(): " << y.nr()
<< "\n\t x.nc(): " << x.nc()
<< "\n\t y.nc(): " << y.nc()
<< "\n\t folds: " << folds
<< "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
);
// count the number of positive and negative examples
long num_pos = 0;
long num_neg = 0;
for (long r = 0; r < y.nr(); ++r)
{
ip_val = -df(i);
ip = i;
}
}
}
if (y(r) == +1.0)
++num_pos;
else
{
if (alpha(i) > 0.0)
{
if (df(i) > in_val)
{
in_val = df(i);
in = i;
}
}
}
++num_neg;
}
scalar_type Mp = numeric_limits<scalar_type>::infinity();
scalar_type Mn = numeric_limits<scalar_type>::infinity();
scalar_type bp = -numeric_limits<scalar_type>::infinity();
scalar_type bn = -numeric_limits<scalar_type>::infinity();
// now we need to find the minimum jp and jn indices
for (long j = 0; j < alpha.nr(); ++j)
{
if (y(j) == 1)
{
if (alpha(j) > 0.0)
// figure out how many positive and negative examples we will have in each fold
const long num_pos_test_samples = num_pos/folds;
const long num_pos_train_samples = num_pos - num_pos_test_samples;
const long num_neg_test_samples = num_neg/folds;
const long num_neg_train_samples = num_neg - num_neg_test_samples;
decision_function<K> d;
typename decision_function<K>::sample_vector_type x_test, x_train;
typename decision_function<K>::scalar_vector_type y_test, y_train;
x_test.set_size (num_pos_test_samples + num_neg_test_samples);
y_test.set_size (num_pos_test_samples + num_neg_test_samples);
x_train.set_size(num_pos_train_samples + num_neg_train_samples);
y_train.set_size(num_pos_train_samples + num_neg_train_samples);
typedef std_allocator<scalar_type, mem_manager_type> alloc_scalar_type_vector;
typedef std::vector<scalar_type, alloc_scalar_type_vector > dvector;
typedef std_allocator<int, mem_manager_type> alloc_int_vector;
typedef std::vector<int, alloc_int_vector > ivector;
dvector out;
ivector target;
long pos_idx = 0;
long neg_idx = 0;
for (long i = 0; i < folds; ++i)
{
scalar_type b = ip_val + df(j);
if (-df(j) < Mp)
Mp = -df(j);
long cur = 0;
if (b > 0 && (Q.is_cached(j) || b > bp || jp == -1 ))
// load up our positive test samples
while (cur < num_pos_test_samples)
{
bp = b;
scalar_type a = Q(ip,ip) + Q(j,j) - 2*Q(j,ip);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jp_val)
if (y(pos_idx) == +1.0)
{
jp_val = temp;
jp = j;
}
}
x_test(cur) = x(pos_idx);
y_test(cur) = +1.0;
++cur;
}
pos_idx = (pos_idx+1)%x.nr();
}
else
{
if (alpha(j) < 1.0)
{
scalar_type b = in_val - df(j);
if (df(j) < Mn)
Mn = df(j);
if (b > 0 && (Q.is_cached(j) || b > bn || jn == -1 ))
// load up our negative test samples
while (cur < x_test.nr())
{
bn = b;
scalar_type a = Q(in,in) + Q(j,j) - 2*Q(j,in);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jn_val)
if (y(neg_idx) == -1.0)
{
jn_val = temp;
jn = j;
}
}
}
x_test(cur) = x(neg_idx);
y_test(cur) = -1.0;
++cur;
}
neg_idx = (neg_idx+1)%x.nr();
}
// if we are at the optimal point then return false so the caller knows
// to stop optimizing
if (std::max(ip_val - Mp, in_val - Mn) < eps)
return false;
// load the training data from the data following whatever we loaded
// as the testing data
long train_pos_idx = pos_idx;
long train_neg_idx = neg_idx;
cur = 0;
if (jp_val < jn_val)
// load up our positive train samples
while (cur < num_pos_train_samples)
{
i_out = ip;
j_out = jp;
}
else
if (y(train_pos_idx) == +1.0)
{
i_out = in;
j_out = jn;
x_train(cur) = x(train_pos_idx);
y_train(cur) = +1.0;
++cur;
}
if (j_out >= 0 && i_out >= 0)
return true;
else
return false;
train_pos_idx = (train_pos_idx+1)%x.nr();
}
// ----------------------------------------------------------------------------------------
template <
typename scalar_vector_type,
typename scalar_type
>
void calculate_rho_and_b(
const scalar_vector_type& y,
const scalar_vector_type& alpha,
const scalar_vector_type& df,
scalar_type& rho,
scalar_type& b
)
// load up our negative train samples
while (cur < x_train.nr())
{
using namespace std;
long num_p_free = 0;
long num_n_free = 0;
scalar_type sum_p_free = 0;
scalar_type sum_n_free = 0;
if (y(train_neg_idx) == -1.0)
{
x_train(cur) = x(train_neg_idx);
y_train(cur) = -1.0;
++cur;
}
train_neg_idx = (train_neg_idx+1)%x.nr();
}
scalar_type upper_bound_p = -numeric_limits<scalar_type>::infinity();
scalar_type upper_bound_n = -numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_p = numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_n = numeric_limits<scalar_type>::infinity();
// do the training
d = trainer.train (x_train,y_train);
for(long i = 0; i < alpha.nr(); ++i)
{
if(y(i) == 1)
// now test this fold
for (long i = 0; i < x_test.nr(); ++i)
{
if(alpha(i) == 1)
out.push_back(d(x_test(i)));
// if this was a positive example
if (y_test(i) == +1.0)
{
if (df(i) > upper_bound_p)
upper_bound_p = df(i);
target.push_back(1);
}
else if(alpha(i) == 0)
else if (y_test(i) == -1.0)
{
if (df(i) < lower_bound_p)
lower_bound_p = df(i);
target.push_back(0);
}
else
{
++num_p_free;
sum_p_free += df(i);
throw dlib::error("invalid input labels to the train_probabilistic_decision_function() function");
}
}
else
} // for (long i = 0; i < folds; ++i)
// Now find the parameters of the sigmoid. Do so using the method from the
// above referenced paper.
scalar_type prior0 = num_pos_test_samples*folds;
scalar_type prior1 = num_neg_test_samples*folds;
scalar_type A = 0;
scalar_type B = std::log((prior0+1)/(prior1+1));
const scalar_type hiTarget = (prior1+1)/(prior1+2);
const scalar_type loTarget = 1.0/(prior0+2);
scalar_type lambda = 1e-3;
scalar_type olderr = std::numeric_limits<scalar_type>::max();
dvector pp(out.size(),(prior1+1)/(prior1+prior0+2));
const scalar_type min_log = -200.0;
scalar_type t = 0;
int count = 0;
for (int it = 0; it < 100; ++it)
{
if(alpha(i) == 1)
scalar_type a = 0;
scalar_type b = 0;
scalar_type c = 0;
scalar_type d = 0;
scalar_type e = 0;
// First, compute Hessian & gradient of error function with
// respect to A & B
for (unsigned long i = 0; i < out.size(); ++i)
{
if (df(i) > upper_bound_n)
upper_bound_n = df(i);
if (target[i])
t = hiTarget;
else
t = loTarget;
const scalar_type d1 = pp[i] - t;
const scalar_type d2 = pp[i]*(1-pp[i]);
a += out[i]*out[i]*d2;
b += d2;
c += out[i]*d2;
d += out[i]*d1;
e += d1;
}
else if(alpha(i) == 0)
// If gradient is really tiny, then stop.
if (std::abs(d) < 1e-9 && std::abs(e) < 1e-9)
break;
scalar_type oldA = A;
scalar_type oldB = B;
scalar_type err = 0;
// Loop until goodness of fit increases
while (true)
{
if (df(i) < lower_bound_n)
lower_bound_n = df(i);
}
else
scalar_type det = (a+lambda)*(b+lambda)-c*c;
// if determinant of Hessian is really close to zero then increase stabilizer.
if (std::abs(det) <= std::numeric_limits<scalar_type>::epsilon())
{
++num_n_free;
sum_n_free += df(i);
}
}
lambda *= 10;
continue;
}
scalar_type r1,r2;
if(num_p_free > 0)
r1 = sum_p_free/num_p_free;
else
r1 = (upper_bound_p+lower_bound_p)/2;
A = oldA + ((b+lambda)*d-c*e)/det;
B = oldB + ((a+lambda)*e-c*d)/det;
if(num_n_free > 0)
r2 = sum_n_free/num_n_free;
// Now, compute the goodness of fit
err = 0;
for (unsigned long i = 0; i < out.size(); ++i)
{
if (target[i])
t = hiTarget;
else
r2 = (upper_bound_n+lower_bound_n)/2;
rho = (r1+r2)/2;
b = (r1-r2)/2/rho;
t = loTarget;
scalar_type p = 1.0/(1.0+std::exp(out[i]*A+B));
pp[i] = p;
// At this step, make sure log(0) returns min_log
err -= t*std::max(std::log(p),min_log) + (1-t)*std::max(std::log(1-p),min_log);
}
// ----------------------------------------------------------------------------------------
template <
typename K,
typename scalar_vector_type,
typename scalar_type
if (err < olderr*(1+1e-7))
{
lambda *= 0.1;
break;
}
// error did not decrease: increase stabilizer by factor of 10
// & try again
lambda *= 10;
if (lambda >= 1e6) // something is broken. Give up
break;
}
scalar_type diff = err-olderr;
scalar_type scale = 0.5*(err+olderr+1.0);
if (diff > -1e-3*scale && diff < 1e-7*scale)
++count;
else
count = 0;
olderr = err;
if (count == 3)
break;
}
return probabilistic_decision_function<K>( A, B, trainer.train(x,y) );
}
template <
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
inline void optimize_working_pair (
const scalar_vector_type& y,
scalar_vector_type& alpha,
const kernel_matrix_cache<K>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const long i,
const long j
const probabilistic_decision_function<typename trainer_type::kernel_type> train_probabilistic_decision_function (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
)
{
scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
if (quad_coef <= 0)
quad_coef = tau;
scalar_type delta = (df(i)-df(j))/quad_coef;
scalar_type sum = alpha(i) + alpha(j);
alpha(i) -= delta;
alpha(j) += delta;
return train_probabilistic_decision_function_impl(trainer,
vector_to_matrix(x),
vector_to_matrix(y),
folds);
}
if(sum > 1)
// ----------------------------------------------------------------------------------------
template <
typename T,
typename U
>
typename enable_if<is_matrix<T>,void>::type randomize_samples (
T& t,
U& u
)
{
if(alpha(i) > 1)
rand::kernel_1a r;
long n = t.nr()-1;
while (n > 0)
{
alpha(i) = 1;
alpha(j) = sum - 1;
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t(idx), t(n));
exchange(u(idx), u(n));
--n;
}
else if(alpha(j) > 1)
}
// ----------------------------------------------------------------------------------------
template <
typename T,
typename U
>
typename disable_if<is_matrix<T>,void>::type randomize_samples (
T& t,
U& u
)
{
alpha(j) = 1;
alpha(i) = sum - 1;
rand::kernel_1a r;
long n = t.size()-1;
while (n > 0)
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t[idx], t[n]);
exchange(u[idx], u[n]);
--n;
}
}
else
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename enable_if<is_matrix<T>,void>::type randomize_samples (
T& t
)
{
if(alpha(j) < 0)
rand::kernel_1a r;
long n = t.nr()-1;
while (n > 0)
{
alpha(j) = 0;
alpha(i) = sum;
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t(idx), t(n));
--n;
}
else if(alpha(i) < 0)
{
alpha(i) = 0;
alpha(j) = sum;
}
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename disable_if<is_matrix<T>,void>::type randomize_samples (
T& t
)
{
rand::kernel_1a r;
long n = t.size()-1;
while (n > 0)
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t[idx], t[n]);
--n;
}
}
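// For example (illustrative, not part of this commit): shuffling a
// sample/label pair in unison before cross validation, so fold contents
// don't depend on the original ordering of the data:
//    randomize_samples(x, y);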
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K
>
const decision_function<K> svm_nu_train (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
class svm_nu_trainer
{
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
) :
nu(0.1),
cache_size(200),
eps(0.001)
{
}
svm_nu_trainer (
const kernel_type& kernel_,
const scalar_type& nu_
) :
kernel_function(kernel_),
nu(nu_),
cache_size(200),
eps(0.001)
{
}
void set_cache_size (
long cache_size_
)
{
typedef typename K::scalar_type scalar_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
cache_size = cache_size_;
}
// make sure requires clause is not broken
#ifdef ENABLE_ASSERTS
for (long r = 0; r < y.nr(); ++r)
const long get_cache_size (
) const
{
DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tr: " << r
<< "\n\ty(r): " << y(r)
);
return cache_size;
}
void set_epsilon (
scalar_type eps_
)
{
eps = eps_;
}
#endif
DLIB_ASSERT(x.nr() > 1 && y.nr() == x.nr(),
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tx.nr(): " << x.nr()
<< "\n\ty.nr(): " << y.nr()
<< "\n\tx.nc(): " << x.nc()
<< "\n\ty.nc(): " << y.nc()
);
const scalar_type get_epsilon (
) const
{
return eps;
}
void set_kernel (
const kernel_type& k
)
{
kernel_function = k;
}
const kernel_type& get_kernel (
) const
{
return kernel_function;
}
void set_nu (
scalar_type nu_
)
{
nu = nu_;
}
const scalar_type get_nu (
) const
{
return nu;
}
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
return do_train(vector_to_matrix(x), vector_to_matrix(y));
}
void swap (
svm_nu_trainer& item
)
{
exchange(kernel_function, item.kernel_function);
exchange(nu, item.nu);
exchange(cache_size, item.cache_size);
exchange(eps, item.eps);
}
private:
// ------------------------------------------------------------------------------------
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> do_train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const
{
typedef typename K::scalar_type scalar_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
DLIB_ASSERT(eps > 0 &&
cache_size > 0 &&
0 < nu && nu < maximum_nu(y),
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\teps: " << eps
<< "\n\tcache_size: " << cache_size
<< "\n\tnu: " << nu
<< "\n\tmaximum_nu(y): " << maximum_nu(y)
// make sure requires clause is not broken
DLIB_ASSERT(is_binary_classification_problem(x,y) == true,
"\tdecision_function svm_nu_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.nr(): " << x.nr()
<< "\n\t y.nr(): " << y.nr()
<< "\n\t x.nc(): " << x.nc()
<< "\n\t y.nc(): " << y.nc()
<< "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
);
@@ -578,7 +959,7 @@ namespace dlib
scalar_vector_type df; // delta f(alpha)
scalar_vector_type alpha;
kernel_matrix_cache<K> Q(x,y,kernel_function,cache_size);
kernel_matrix_cache<K, in_sample_vector_type, in_scalar_vector_type> Q(x,y,kernel_function,cache_size);
alpha.set_size(x.nr());
df.set_size(x.nr());
@@ -650,587 +1031,361 @@ namespace dlib
return decision_function<K> (sv_alpha, b, kernel_function, support_vectors);
}
// ----------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <
typename K
typename scalar_type,
typename scalar_vector_type,
typename scalar_vector_type2
>
const matrix<typename K::scalar_type, 1, 2, typename K::mem_manager_type> svm_nu_cross_validate (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long folds,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
)
inline void set_initial_alpha (
const scalar_vector_type& y,
const scalar_type nu,
scalar_vector_type2& alpha
) const
{
typedef typename K::scalar_type scalar_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
set_all_elements(alpha,0);
const scalar_type l = y.nr();
scalar_type temp = nu*l/2;
long num = (long)std::floor(temp);
long num_total = (long)std::ceil(temp);
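// In the nu-SVM formulation the alphas of each class must sum to nu*l/2.
// So below we set floor(nu*l/2) alphas of each class to 1 and give any
// fractional remainder to one additional alpha.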
// make sure requires clause is not broken
#ifdef ENABLE_ASSERTS
for (long r = 0; r < y.nr(); ++r)
int count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
"\tdecision_function svm_nu_cross_validate()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tr: " << r
<< "\n\ty(r): " << y(r)
);
}
#endif
DLIB_ASSERT(x.nr() > 1 && y.nr() == x.nr(),
"\tdecision_function svm_nu_cross_validate()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tx.nr(): " << x.nr()
<< "\n\ty.nr(): " << y.nr()
<< "\n\tx.nc(): " << x.nc()
<< "\n\ty.nc(): " << y.nc()
);
DLIB_ASSERT(eps > 0 &&
folds > 1 && folds <= x.nr() &&
cache_size > 0 &&
0 < nu && nu < maximum_nu(y),
"\tdecision_function svm_nu_cross_validate()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\teps: " << eps
<< "\n\tfolds: " << folds
<< "\n\tcache_size: " << cache_size
<< "\n\tnu: " << nu
<< "\n\tmaximum_nu(y): " << maximum_nu(y)
);
// count the number of positive and negative examples
long num_pos = 0;
long num_neg = 0;
for (long r = 0; r < y.nr(); ++r)
{
if (y(r) == +1.0)
++num_pos;
else
++num_neg;
}
// figure out how many positive and negative examples we will have in each fold
const long num_pos_test_samples = num_pos/folds;
const long num_pos_train_samples = num_pos - num_pos_test_samples;
const long num_neg_test_samples = num_neg/folds;
const long num_neg_train_samples = num_neg - num_neg_test_samples;
long num_pos_correct = 0;
long num_neg_correct = 0;
decision_function<K> d;
typename decision_function<K>::sample_vector_type x_test, x_train;
typename decision_function<K>::scalar_vector_type y_test, y_train;
x_test.set_size (num_pos_test_samples + num_neg_test_samples);
y_test.set_size (num_pos_test_samples + num_neg_test_samples);
x_train.set_size(num_pos_train_samples + num_neg_train_samples);
y_train.set_size(num_pos_train_samples + num_neg_train_samples);
long pos_idx = 0;
long neg_idx = 0;
for (long i = 0; i < folds; ++i)
{
long cur = 0;
// load up our positive test samples
while (cur < num_pos_test_samples)
if (y(i) == 1)
{
if (y(pos_idx) == +1.0)
if (count < num)
{
x_test(cur) = x(pos_idx);
y_test(cur) = +1.0;
++cur;
}
pos_idx = (pos_idx+1)%x.nr();
++count;
alpha(i) = 1;
}
// load up our negative test samples
while (cur < x_test.nr())
else
{
if (y(neg_idx) == -1.0)
if (temp > num)
{
x_test(cur) = x(neg_idx);
y_test(cur) = -1.0;
++cur;
++count;
alpha(i) = temp - std::floor(temp);
}
neg_idx = (neg_idx+1)%x.nr();
break;
}
// load the training data from the data following whatever we loaded
// as the testing data
long train_pos_idx = pos_idx;
long train_neg_idx = neg_idx;
cur = 0;
// load up our positive train samples
while (cur < num_pos_train_samples)
{
if (y(train_pos_idx) == +1.0)
{
x_train(cur) = x(train_pos_idx);
y_train(cur) = +1.0;
++cur;
}
train_pos_idx = (train_pos_idx+1)%x.nr();
}
// load up our negative train samples
while (cur < x_train.nr())
{
if (y(train_neg_idx) == -1.0)
if (count != num_total)
{
x_train(cur) = x(train_neg_idx);
y_train(cur) = -1.0;
++cur;
}
train_neg_idx = (train_neg_idx+1)%x.nr();
std::ostringstream sout;
sout << "invalid nu of " << nu << ". Must be between 0 and " << (scalar_type)count/y.nr();
throw invalid_svm_nu_error(sout.str(),nu);
}
// do the training
d = svm_nu_train (x_train,y_train,kernel_function,nu,cache_size,eps);
// now test this fold
for (long i = 0; i < x_test.nr(); ++i)
count = 0;
for (int i = 0; i < alpha.nr(); ++i)
{
// if this is a positive example
if (y_test(i) == +1.0)
if (y(i) == -1)
{
if (d(x_test(i)) >= 0)
++num_pos_correct;
}
else if (y_test(i) == -1.0)
if (count < num)
{
if (d(x_test(i)) < 0)
++num_neg_correct;
++count;
alpha(i) = 1;
}
else
{
throw dlib::error("invalid input labels to the svm_nu_cross_validate() function");
if (temp > num)
{
++count;
alpha(i) = temp - std::floor(temp);
}
break;
}
}
}
} // for (long i = 0; i < folds; ++i)
matrix<typename K::scalar_type, 1, 2, typename K::mem_manager_type> res;
res(0) = (scalar_type)num_pos_correct/(scalar_type)(num_pos_test_samples*folds);
res(1) = (scalar_type)num_neg_correct/(scalar_type)(num_neg_test_samples*folds);
return res;
if (count != num_total)
{
std::ostringstream sout;
sout << "invalid nu of " << nu << ". Must be between 0 and " << (scalar_type)count/y.nr();
throw invalid_svm_nu_error(sout.str(),nu);
}
}
// ----------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <
typename K
typename sample_vector_type,
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
const probabilistic_decision_function<K> svm_nu_train_prob (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long folds,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
)
{
typedef typename K::scalar_type scalar_type;
typedef typename K::mem_manager_type mem_manager_type;
typedef typename decision_function<K>::sample_vector_type sample_vector_type;
typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;
/*
This function fits a sigmoid function to the output of the
svm trained by svm_nu_train(). The technique used is the one
described in the paper:
Probabilistic Outputs for Support Vector Machines and
Comparisons to Regularized Likelihood Methods by
John C. Platt. March 26, 1999
*/
// make sure requires clause is not broken
#ifdef ENABLE_ASSERTS
for (long r = 0; r < y.nr(); ++r)
inline bool find_working_group (
const scalar_vector_type2& y,
const scalar_vector_type& alpha,
const kernel_matrix_cache<K,sample_vector_type, scalar_vector_type2>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const scalar_type eps,
long& i_out,
long& j_out
) const
{
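// Working pair selection: first find the maximal violating pair (ip, in),
// then refine the second index using the second order gain -b*b/a, in the
// spirit of the working set selection used by LIBSVM (see the papers
// referenced in the documentation for this class).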
DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tr: " << r
<< "\n\ty(r): " << y(r)
);
}
#endif
DLIB_ASSERT(x.nr() > 1 && y.nr() == x.nr(),
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\tx.nr(): " << x.nr()
<< "\n\ty.nr(): " << y.nr()
<< "\n\tx.nc(): " << x.nc()
<< "\n\ty.nc(): " << y.nc()
);
using namespace std;
long ip = -1;
long jp = -1;
long in = -1;
long jn = -1;
DLIB_ASSERT(eps > 0 &&
folds > 1 && folds <= x.nr() &&
cache_size > 0 &&
0 < nu && nu < maximum_nu(y),
"\tdecision_function svm_nu_train()"
<< "\n\tinvalid inputs were given to this function"
<< "\n\teps: " << eps
<< "\n\tfolds: " << folds
<< "\n\tcache_size: " << cache_size
<< "\n\tnu: " << nu
<< "\n\tmaximum_nu(y): " << maximum_nu(y)
);
scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
scalar_type jp_val = numeric_limits<scalar_type>::infinity();
scalar_type in_val = -numeric_limits<scalar_type>::infinity();
scalar_type jn_val = numeric_limits<scalar_type>::infinity();
// count the number of positive and negative examples
long num_pos = 0;
long num_neg = 0;
for (long r = 0; r < y.nr(); ++r)
// loop over the alphas and find the maximum ip and in indices.
for (long i = 0; i < alpha.nr(); ++i)
{
if (y(r) == +1.0)
++num_pos;
else
++num_neg;
}
// figure out how many positive and negative examples we will have in each fold
const long num_pos_test_samples = num_pos/folds;
const long num_pos_train_samples = num_pos - num_pos_test_samples;
const long num_neg_test_samples = num_neg/folds;
const long num_neg_train_samples = num_neg - num_neg_test_samples;
decision_function<K> d;
typename decision_function<K>::sample_vector_type x_test, x_train;
typename decision_function<K>::scalar_vector_type y_test, y_train;
x_test.set_size (num_pos_test_samples + num_neg_test_samples);
y_test.set_size (num_pos_test_samples + num_neg_test_samples);
x_train.set_size(num_pos_train_samples + num_neg_train_samples);
y_train.set_size(num_pos_train_samples + num_neg_train_samples);
typedef std_allocator<scalar_type, mem_manager_type> alloc_scalar_type_vector;
typedef std::vector<scalar_type, alloc_scalar_type_vector > dvector;
typedef std_allocator<int, mem_manager_type> alloc_int_vector;
typedef std::vector<int, alloc_int_vector > ivector;
dvector out;
ivector target;
long pos_idx = 0;
long neg_idx = 0;
for (long i = 0; i < folds; ++i)
if (y(i) == 1)
{
long cur = 0;
// load up our positive test samples
while (cur < num_pos_test_samples)
if (alpha(i) < 1.0)
{
if (y(pos_idx) == +1.0)
if (-df(i) > ip_val)
{
x_test(cur) = x(pos_idx);
y_test(cur) = +1.0;
++cur;
ip_val = -df(i);
ip = i;
}
pos_idx = (pos_idx+1)%x.nr();
}
// load up our negative test samples
while (cur < x_test.nr())
}
else
{
if (y(neg_idx) == -1.0)
if (alpha(i) > 0.0)
{
x_test(cur) = x(neg_idx);
y_test(cur) = -1.0;
++cur;
if (df(i) > in_val)
{
in_val = df(i);
in = i;
}
neg_idx = (neg_idx+1)%x.nr();
}
// load the training data from the data following whatever we loaded
// as the testing data
long train_pos_idx = pos_idx;
long train_neg_idx = neg_idx;
cur = 0;
// load up our positive train samples
while (cur < num_pos_train_samples)
{
if (y(train_pos_idx) == +1.0)
{
x_train(cur) = x(train_pos_idx);
y_train(cur) = +1.0;
++cur;
}
train_pos_idx = (train_pos_idx+1)%x.nr();
}
// load up our negative train samples
while (cur < x_train.nr())
scalar_type Mp = numeric_limits<scalar_type>::infinity();
scalar_type Mn = numeric_limits<scalar_type>::infinity();
scalar_type bp = -numeric_limits<scalar_type>::infinity();
scalar_type bn = -numeric_limits<scalar_type>::infinity();
// now we need to find the minimum jp and jn indices
for (long j = 0; j < alpha.nr(); ++j)
{
if (y(train_neg_idx) == -1.0)
if (y(j) == 1)
{
x_train(cur) = x(train_neg_idx);
y_train(cur) = -1.0;
++cur;
}
train_neg_idx = (train_neg_idx+1)%x.nr();
}
// do the training
d = svm_nu_train (x_train,y_train,kernel_function,nu,cache_size,eps);
if (alpha(j) > 0.0)
{
scalar_type b = ip_val + df(j);
if (-df(j) < Mp)
Mp = -df(j);
// now test this fold
for (long i = 0; i < x_test.nr(); ++i)
if (b > 0 && (Q.is_cached(j) || b > bp || jp == -1 ))
{
out.push_back(d(x_test(i)));
// if this was a positive example
if (y_test(i) == +1.0)
bp = b;
scalar_type a = Q(ip,ip) + Q(j,j) - 2*Q(j,ip);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jp_val)
{
target.push_back(1);
jp_val = temp;
jp = j;
}
else if (y_test(i) == -1.0)
{
target.push_back(0);
}
else
{
throw dlib::error("invalid input labels to the svm_nu_train_prob() function");
}
}
} // for (long i = 0; i < folds; ++i)
// Now find the parameters of the sigmoid. Do so using the method from the
// above referenced paper.
scalar_type prior0 = num_pos_test_samples*folds;
scalar_type prior1 = num_neg_test_samples*folds;
scalar_type A = 0;
scalar_type B = std::log((prior0+1)/(prior1+1));
const scalar_type hiTarget = (prior1+1)/(prior1+2);
const scalar_type loTarget = 1.0/(prior0+2);
scalar_type lambda = 1e-3;
scalar_type olderr = std::numeric_limits<scalar_type>::max();
dvector pp(out.size(),(prior1+1)/(prior1+prior0+2));
const scalar_type min_log = -200.0;
scalar_type t = 0;
int count = 0;
for (int it = 0; it < 100; ++it)
else
{
scalar_type a = 0;
scalar_type b = 0;
scalar_type c = 0;
scalar_type d = 0;
scalar_type e = 0;
// First, compute Hessian & gradient of error function with
// respect to A & B
for (unsigned long i = 0; i < out.size(); ++i)
if (alpha(j) < 1.0)
{
if (target[i])
t = hiTarget;
else
t = loTarget;
const scalar_type d1 = pp[i] - t;
const scalar_type d2 = pp[i]*(1-pp[i]);
a += out[i]*out[i]*d2;
b += d2;
c += out[i]*d2;
d += out[i]*d1;
e += d1;
}
// If gradient is really tiny, then stop.
if (std::abs(d) < 1e-9 && std::abs(e) < 1e-9)
break;
scalar_type oldA = A;
scalar_type oldB = B;
scalar_type err = 0;
scalar_type b = in_val - df(j);
if (df(j) < Mn)
Mn = df(j);
// Loop until goodness of fit increases
while (true)
if (b > 0 && (Q.is_cached(j) || b > bn || jn == -1 ))
{
scalar_type det = (a+lambda)*(b+lambda)-c*c;
// if determinant of Hessian is really close to zero then increase stabilizer.
if (std::abs(det) <= std::numeric_limits<scalar_type>::epsilon())
bn = b;
scalar_type a = Q(in,in) + Q(j,j) - 2*Q(j,in);
if (a <= 0)
a = tau;
scalar_type temp = -b*b/a;
if (temp < jn_val)
{
lambda *= 10;
continue;
jn_val = temp;
jn = j;
}
}
}
}
}
A = oldA + ((b+lambda)*d-c*e)/det;
B = oldB + ((a+lambda)*e-c*d)/det;
// if we are at the optimal point then return false so the caller knows
// to stop optimizing
if (std::max(ip_val - Mp, in_val - Mn) < eps)
return false;
// Now, compute the goodness of fit
err = 0;
for (unsigned long i = 0; i < out.size(); ++i)
if (jp_val < jn_val)
{
if (target[i])
t = hiTarget;
else
t = loTarget;
scalar_type p = 1.0/(1.0+std::exp(out[i]*A+B));
pp[i] = p;
// At this step, make sure log(0) returns min_log
err -= t*std::max(std::log(p),min_log) + (1-t)*std::max(std::log(1-p),min_log);
i_out = ip;
j_out = jp;
}
if (err < olderr*(1+1e-7))
else
{
lambda *= 0.1;
break;
}
// error did not decrease: increase stabilizer by factor of 10
// & try again
lambda *= 10;
if (lambda >= 1e6) // something is broken. Give up
break;
i_out = in;
j_out = jn;
}
scalar_type diff = err-olderr;
scalar_type scale = 0.5*(err+olderr+1.0);
if (diff > -1e-3*scale && diff < 1e-7*scale)
++count;
if (j_out >= 0 && i_out >= 0)
return true;
else
count = 0;
olderr = err;
if (count == 3)
break;
}
return probabilistic_decision_function<K>(
A, B,
svm_nu_train (x,y,kernel_function,nu,cache_size,eps) );
return false;
}
// ----------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <
typename T,
typename U
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
typename enable_if<is_matrix<T>,void>::type randomize_samples (
T& t,
U& u
)
{
rand::kernel_1a r;
long n = t.nr()-1;
while (n > 0)
void calculate_rho_and_b(
const scalar_vector_type2& y,
const scalar_vector_type& alpha,
const scalar_vector_type& df,
scalar_type& rho,
scalar_type& b
) const
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
using namespace std;
long num_p_free = 0;
long num_n_free = 0;
scalar_type sum_p_free = 0;
scalar_type sum_n_free = 0;
// swap our randomly selected index into the n position
exchange(t(idx), t(n));
exchange(u(idx), u(n));
scalar_type upper_bound_p = -numeric_limits<scalar_type>::infinity();
scalar_type upper_bound_n = -numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_p = numeric_limits<scalar_type>::infinity();
scalar_type lower_bound_n = numeric_limits<scalar_type>::infinity();
--n;
for(long i = 0; i < alpha.nr(); ++i)
{
if(y(i) == 1)
{
if(alpha(i) == 1)
{
if (df(i) > upper_bound_p)
upper_bound_p = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_p)
lower_bound_p = df(i);
}
// ----------------------------------------------------------------------------------------
template <
typename T,
typename U
>
typename disable_if<is_matrix<T>,void>::type randomize_samples (
T& t,
U& u
)
else
{
rand::kernel_1a r;
long n = t.size()-1;
while (n > 0)
++num_p_free;
sum_p_free += df(i);
}
}
else
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
if(alpha(i) == 1)
{
if (df(i) > upper_bound_n)
upper_bound_n = df(i);
}
else if(alpha(i) == 0)
{
if (df(i) < lower_bound_n)
lower_bound_n = df(i);
}
else
{
++num_n_free;
sum_n_free += df(i);
}
}
}
// make idx be less than n
idx %= n;
scalar_type r1,r2;
if(num_p_free > 0)
r1 = sum_p_free/num_p_free;
else
r1 = (upper_bound_p+lower_bound_p)/2;
// swap our randomly selected index into the n position
exchange(t[idx], t[n]);
exchange(u[idx], u[n]);
if(num_n_free > 0)
r2 = sum_n_free/num_n_free;
else
r2 = (upper_bound_n+lower_bound_n)/2;
--n;
}
rho = (r1+r2)/2;
b = (r1-r2)/2/rho;
}
// ----------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <
typename T
typename sample_vector_type,
typename scalar_vector_type,
typename scalar_vector_type2,
typename scalar_type
>
typename enable_if<is_matrix<T>,void>::type randomize_samples (
T& t
)
inline void optimize_working_pair (
const scalar_vector_type2& y,
scalar_vector_type& alpha,
const kernel_matrix_cache<K, sample_vector_type, scalar_vector_type2>& Q,
const scalar_vector_type& df,
const scalar_type tau,
const long i,
const long j
) const
{
rand::kernel_1a r;
scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
if (quad_coef <= 0)
quad_coef = tau;
scalar_type delta = (df(i)-df(j))/quad_coef;
scalar_type sum = alpha(i) + alpha(j);
alpha(i) -= delta;
alpha(j) += delta;
long n = t.nr()-1;
while (n > 0)
if(sum > 1)
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t(idx), t(n));
--n;
if(alpha(i) > 1)
{
alpha(i) = 1;
alpha(j) = sum - 1;
}
else if(alpha(j) > 1)
{
alpha(j) = 1;
alpha(i) = sum - 1;
}
// ----------------------------------------------------------------------------------------
template <
typename T
>
typename disable_if<is_matrix<T>,void>::type randomize_samples (
T& t
)
}
else
{
rand::kernel_1a r;
long n = t.size()-1;
while (n > 0)
if(alpha(j) < 0)
{
// put a random integer into idx
unsigned long idx = r.get_random_32bit_number();
// make idx be less than n
idx %= n;
// swap our randomly selected index into the n position
exchange(t[idx], t[n]);
--n;
alpha(j) = 0;
alpha(i) = sum;
}
else if(alpha(i) < 0)
{
alpha(i) = 0;
alpha(j) = sum;
}
}
}
// ------------------------------------------------------------------------------------
kernel_type kernel_function;
scalar_type nu;
long cache_size;
scalar_type eps;
}; // end of class svm_nu_trainer
// ----------------------------------------------------------------------------------------
@@ -17,8 +17,24 @@ namespace dlib
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// Functions that perform SVM training
// ----------------------------------------------------------------------------------------
class invalid_svm_nu_error : public dlib::error
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is an exception class used to indicate that a
value of nu used for svm training is incompatible with a
particular data set.
this->nu will be set to the invalid value of nu used.
!*/
public:
invalid_svm_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
const double nu;
};
// ----------------------------------------------------------------------------------------
template <
@@ -29,102 +45,242 @@ namespace dlib
);
/*!
requires
- T == a matrix object
- T == a matrix object or an object convertible to a matrix via
vector_to_matrix()
- y.nc() == 1
- y.nr() > 1
- for all valid i:
- y(i) == -1 or +1
ensures
- returns the maximum valid nu that can be used with svm_nu_train().
- returns the maximum valid nu that can be used with the svm_nu_trainer and
the training set labels from the given y vector.
(i.e. 2.0*min(number of +1 examples in y, number of -1 examples in y)/y.nr())
!*/
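As a quick illustration, a minimal sketch of calling maximum_nu (using the
dlib matrix comma initializer; not part of this commit):

    dlib::matrix<double,0,1> y(4);
    y = +1, -1, +1, -1;
    // 2.0*min(2,2)/4 == 1.0
    double max_nu = maximum_nu(y);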
template <
typename K
// ----------------------------------------------------------------------------------------
template <
typename T,
typename U
>
const decision_function<K> svm_nu_train (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
bool is_binary_classification_problem (
const T& x,
const U& x_labels
);
/*!
requires
- eps > 0
- T == a matrix or something convertible to a matrix via vector_to_matrix()
- U == a matrix or something convertible to a matrix via vector_to_matrix()
ensures
- returns true if all of the following are true and false otherwise:
- x.nc() == 1 (i.e. x is a column vector)
- y.nc() == 1 (i.e. y is a column vector)
- x.nr() == y.nr()
- x_labels.nc() == 1 (i.e. x_labels is a column vector)
- x.nr() == x_labels.nr()
- x.nr() > 1
- cache_size > 0
- for all valid i:
- y(i) == -1 or +1
- y(i) is the class that should be assigned to training example x(i)
- 0 < nu < maximum_nu(y)
- kernel_function == a kernel function object type as defined at the
top of dlib/svm/kernel_abstract.h
- x_labels(i) == -1 or +1
!*/
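A hedged usage sketch (the sample and label containers are illustrative;
anything convertible via vector_to_matrix() works):

    std::vector<dlib::matrix<double,2,1> > x;
    std::vector<double> x_labels;
    // ... fill x with samples and x_labels with matching +1/-1 labels ...
    if (!is_binary_classification_problem(x, x_labels))
        throw dlib::error("x and x_labels do not form a valid training set");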
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename K
>
class svm_nu_trainer
{
/*!
REQUIREMENTS ON K
is a kernel function object as defined in dlib/svm/kernel_abstract.h
WHAT THIS OBJECT REPRESENTS
This object implements a trainer for a nu support vector machine for
solving binary classification problems.
The implementation of the nu-svm training algorithm used by this object is based
on the following excellent papers:
- Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
- Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
!*/
public:
typedef K kernel_type;
typedef typename kernel_type::scalar_type scalar_type;
typedef typename kernel_type::sample_type sample_type;
typedef typename kernel_type::mem_manager_type mem_manager_type;
typedef decision_function<kernel_type> trained_function_type;
svm_nu_trainer (
);
/*!
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_kernel() == kernel_type()
- #get_nu() == 0.1
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
svm_nu_trainer (
const kernel_type& kernel,
const scalar_type& nu
);
/*!
requires
- 0 < nu <= 1
ensures
- This object is properly initialized and ready to be used
to train a support vector machine.
- #get_kernel() == kernel
- #get_nu() == nu
- #get_cache_size() == 200
- #get_epsilon() == 0.001
!*/
void set_cache_size (
long cache_size
);
/*!
requires
- cache_size > 0
ensures
- #get_cache_size() == cache_size
!*/
const long get_cache_size (
) const;
/*!
ensures
- returns the number of megabytes of cache this object will use
when it performs training via the train() function.
(bigger values of this may make training go faster but don't affect
the result. However, too big a value will cause you to run out of
memory.)
!*/
void set_epsilon (
scalar_type eps
);
/*!
requires
- eps > 0
ensures
- #get_epsilon() == eps
!*/
const scalar_type get_epsilon (
) const;
/*!
ensures
- returns the error epsilon that determines when training should stop.
Generally a good value for this is 0.001. Smaller values may result
in a more accurate solution but take longer to execute.
!*/
void set_kernel (
const kernel_type& k
);
/*!
ensures
- #get_kernel() == k
!*/
const kernel_type& get_kernel (
) const;
/*!
ensures
- returns a copy of the kernel function in use by this object
!*/
void set_nu (
scalar_type nu
);
/*!
requires
- 0 < nu <= 1
ensures
- #get_nu() == nu
!*/
const scalar_type get_nu (
) const;
/*!
ensures
- returns the nu svm parameter. This is a value between 0 and
1. It is the parameter that determines the trade off between
trying to fit the training data exactly or allowing more errors
but hopefully improving the generalization ability of the
resulting classifier. For more information you should consult
the papers referenced above.
!*/
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y
) const;
/*!
requires
- is_binary_classification_problem(x,y) == true
ensures
- trains a nu support vector classifier given the training samples in x and
labels in y. Training is done when the error is less than eps.
- caches approximately at most cache_size megabytes of the kernel matrix.
(bigger values of this may make training go faster but don't affect the
result. However, too big a value will cause you to run out of memory.)
labels in y. Training is done when the error is less than get_epsilon().
- returns a decision function F with the following properties:
- if (new_x is a sample predicted to have a +1 label) then
- F(new_x) >= 0
- else
- F(new_x) < 0
throws
- invalid_svm_nu_error
This exception is thrown if get_nu() > maximum_nu(y)
- std::bad_alloc
!*/
/*
The implementation of the nu-svm training algorithm used by this library is based
on the following excellent papers:
- Chang and Lin, Training {nu}-Support Vector Classifiers: Theory and Algorithms
- Chih-Chung Chang and Chih-Jen Lin, LIBSVM : a library for support vector
machines, 2001. Software available at http://www.csie.ntu.edu.tw/~cjlin/libsvm
*/
void swap (
svm_nu_trainer& item
);
/*!
ensures
- swaps *this and item
!*/
};
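For example, one plausible way to use this trainer (the kernel type, gamma,
and nu values here are illustrative assumptions, not part of this commit):

    typedef dlib::matrix<double,2,1> sample_type;
    typedef dlib::radial_basis_kernel<sample_type> kernel_type;

    std::vector<sample_type> x;   // training samples
    std::vector<double> y;        // +1/-1 labels, one per sample
    // ... fill x and y ...

    dlib::svm_nu_trainer<kernel_type> trainer;
    trainer.set_kernel(kernel_type(0.1));
    trainer.set_nu(0.05);

    dlib::decision_function<kernel_type> df = trainer.train(x, y);
    // df(some_sample) >= 0 predicts the +1 class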
// ----------------------------------------------------------------------------------------
template <
typename K
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const probabilistic_decision_function<K> svm_nu_train_prob (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long folds,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
);
const probabilistic_decision_function<typename trainer_type::kernel_type>
train_probabilistic_decision_function (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
);
/*!
requires
- eps > 0
- 1 < folds <= x.nr()
- x.nc() == 1 (i.e. x is a column vector)
- y.nc() == 1 (i.e. y is a column vector)
- x.nr() == y.nr()
- x.nr() > 1
- cache_size > 0
- for all valid i:
- y(i) == -1 or +1
- y(i) is the class that should be assigned to training example x(i)
- 0 < nu < maximum_nu(y)
- kernel_function == a kernel function object type as defined at the
top of dlib/svm/kernel_abstract.h
- is_binary_classification_problem(x,y) == true
- trainer_type == some kind of trainer object (e.g. svm_nu_trainer)
ensures
- trains a nu support vector classifier given the training samples in x and
labels in y. Training is done when the error is less than eps.
- caches approximately at most cache_size megabytes of the kernel matrix.
(bigger values of this may make training go faster but don't affect the
result. However, too big a value will cause you to run out of memory.)
- returns a probabilistic_decision_function that represents the trained
svm.
labels in y.
- returns a probabilistic_decision_function that represents the trained svm.
- The parameters of the probability model are estimated by performing k-fold
cross validation.
- The number of folds used is given by the folds argument.
throws
- any exceptions thrown by trainer.train()
- std::bad_alloc
!*/
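Continuing the sketch from the svm_nu_trainer example above (the folds
value is illustrative):

    dlib::probabilistic_decision_function<kernel_type> pfunct =
        train_probabilistic_decision_function(trainer, x, y, 3);
    // pfunct(some_sample) is an estimate of the probability that the
    // sample belongs to the +1 class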
// ----------------------------------------------------------------------------------------
@@ -134,46 +290,36 @@ namespace dlib
// ----------------------------------------------------------------------------------------
template <
typename K
typename trainer_type,
typename in_sample_vector_type,
typename in_scalar_vector_type
>
const matrix<typename K::scalar_type, 1, 2, typename K::mem_manager_type> svm_nu_cross_validate (
const typename decision_function<K>::sample_vector_type& x,
const typename decision_function<K>::scalar_vector_type& y,
const K& kernel_function,
const typename K::scalar_type nu,
const long folds,
const long cache_size = 200,
const typename K::scalar_type eps = 0.001
const matrix<typename trainer_type::scalar_type, 1, 2, typename trainer_type::mem_manager_type>
cross_validate_trainer (
const trainer_type& trainer,
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
const long folds
);
/*!
requires
- eps > 0
- is_binary_classification_problem(x,y) == true
- 1 < folds <= x.nr()
- x.nc() == 1 (i.e. x is a column vector)
- y.nc() == 1 (i.e. y is a column vector)
- x.nr() == y.nr()
- x.nr() > 1
- cache_size > 0
- for all valid i:
- y(i) == -1 or +1
- y(i) is the class that should be assigned to training example x(i)
- 0 < nu < maximum_nu(y)
- kernel_function == a kernel function object type as defined at the
top of dlib/svm/kernel_abstract.h
- trainer_type == some kind of trainer object (e.g. svm_nu_trainer)
ensures
- performs k-fold cross validation by training a nu-svm using the svm_nu_train()
function. Each fold is tested using the learned decision_function and the
average accuracy from all folds is returned. The accuracy is returned in
a row vector, let us call it R. Both quantities in R are numbers between
0 and 1 which represent the fraction of examples correctly classified. R(0)
is the fraction of +1 examples correctly classified and R(1) is the fraction
of -1 examples correctly classified.
- performs k-fold cross validation by using the given trainer to solve the
given binary classification problem for the given number of folds.
Each fold is tested using the output of the trainer and the average
classification accuracy from all folds is returned.
- The accuracy is returned in a row vector, let us call it R. Both
quantities in R are numbers between 0 and 1 which represent the fraction
of examples correctly classified. R(0) is the fraction of +1 examples
correctly classified and R(1) is the fraction of -1 examples correctly
classified.
- The number of folds used is given by the folds argument.
- in each fold: trains a nu support vector classifier given the training samples
in x and labels in y. Training is done when the error is less than eps.
- caches approximately at most cache_size megabytes of the kernel matrix.
(bigger values of this may make training go faster but don't affect the
result. However, too big a value will cause you to run out of memory.)
throws
- any exceptions thrown by trainer.train()
- std::bad_alloc
!*/
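A usage sketch, again reusing the trainer and data from the examples above:

    dlib::matrix<double, 1, 2> accuracy = cross_validate_trainer(trainer, x, y, 4);
    // accuracy(0): fraction of +1 test examples classified correctly
    // accuracy(1): fraction of -1 test examples classified correctly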
// ----------------------------------------------------------------------------------------