Added Sammon's algorithm.

46c00173 · Davis King · bf2edbec · 46c00173 · 46c00173 · 46c00173
Commit 46c00173 authored Aug 27, 2012 by Davis King
6 changed files
--- a/dlib/statistics.h
+++ b/dlib/statistics.h
@@ -7,6 +7,7 @@
 #include "statistics/dpca.h"
 #include "statistics/random_subset_selector.h"
 #include "statistics/image_feature_sampling.h"
+#include "statistics/sammon.h"

 #endif // DLIB_STATISTICs_H_ 


--- a/dlib/statistics/sammon.h
+++ b/dlib/statistics/sammon.h
+// Copyright (C) 2012  Emanuele Cesena (emanuele.cesena@gmail.com), Davis E. King
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SAMMoN_H__
+#define DLIB_SAMMoN_H__
+
+#include "sammon_abstract.h"
+#include "../matrix.h"
+#include "../algs.h"
+#include "dpca.h"
+#include <vector>
+
+namespace dlib
+{
+
+    class sammon_projection
+    {
+
+    public:
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename matrix_type>
+        std::vector<matrix<double,0,1> > operator() (
+            const std::vector<matrix_type>& data,       
+            long num_dims                      
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(num_dims > 0,
+                "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t num_dims:    " << num_dims
+                );
+            std::vector<matrix<double,0,1> > result;    // projections
+            if (data.size() == 0)
+            {
+                return result;
+            }
+
+#ifdef ENABLE_ASSERTS
+            DLIB_ASSERT(0 < num_dims && num_dims <= data[0].size(),
+                "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t data.size():    " << data.size()
+                << "\n\t num_dims:       " << num_dims
+                << "\n\t data[0].size(): " << data[0].size() 
+                );
+            for (unsigned long i = 0; i < data.size(); ++i)
+            {
+                DLIB_ASSERT(is_col_vector(data[i]) && data[i].size() == data[0].size(),
+                        "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                        << "\n\t Invalid inputs were given to this function."
+                        << "\n\t data["<<i<<"].size():    " << data[i].size()
+                        << "\n\t data[0].size(): " << data[0].size() 
+                        << "\n\t is_col_vector(data["<<i<<"]): " << is_col_vector(data[i])
+                );
+            }
+#endif
+
+            double err;                                 // error (discarded)
+            do_sammon_projection(data, num_dims, result, err);
+            return result;
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        template <typename matrix_type>
+        void operator() (
+            const std::vector<matrix_type>& data,       
+            long num_dims,                     
+            std::vector<matrix<double,0,1> >& result,   
+            double &err,                                
+            unsigned long num_iters = 1000,             
+            const double err_delta = 1.0e-9            
+        )
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(num_dims > 0 && num_iters > 0 && err_delta > 0.0,
+                "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t data.size(): " << data.size()
+                << "\n\t num_dims:    " << num_dims
+                << "\n\t num_iters:   " << num_iters
+                << "\n\t err_delta:   " << err_delta
+                );
+            if (data.size() == 0)
+            {
+                result.clear();
+                err = 0;
+                return;
+            }
+
+#ifdef ENABLE_ASSERTS
+            DLIB_ASSERT(0 < num_dims && num_dims <= data[0].size(),
+                "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t data.size():    " << data.size()
+                << "\n\t num_dims:       " << num_dims
+                << "\n\t data[0].size(): " << data[0].size() 
+                );
+            for (unsigned long i = 0; i < data.size(); ++i)
+            {
+                DLIB_ASSERT(is_col_vector(data[i]) && data[i].size() == data[0].size(),
+                        "\t std::vector<matrix<double,0,1> > sammon_projection::operator()"
+                        << "\n\t Invalid inputs were given to this function."
+                        << "\n\t data["<<i<<"].size():    " << data[i].size()
+                        << "\n\t data[0].size(): " << data[0].size() 
+                        << "\n\t is_col_vector(data["<<i<<"]): " << is_col_vector(data[i])
+                );
+            }
+#endif
+
+            do_sammon_projection(data, num_dims, result, err, num_iters, err_delta);
+        }
+
+        // ----------------------------------------------------------------------------------------
+        // ----------------------------------------------------------------------------------------
+
+    private:
+
+        void compute_relative_distances(
+            matrix<double,0,1>& dist,                   // relative distances (output)
+            matrix<double,0,0>& data,                   // input data (matrix whose columns are the input vectors)
+            double eps_ratio = 1.0e-7                   // to compute the minimum distance eps
+        )
+        /*!
+            requires
+                - dist.nc() == comb( data.nc(), 2 ), preallocated
+                - eps_ratio > 0
+            ensures
+                - dist[k] == lenght(data[i] - data[j]) for k = j(j-1)/2 + i
+        !*/
+        {
+            const long N = data.nc();                   // num of points
+            double eps;                                 // minimum distance, forced to avoid vectors collision
+                                                        // computed at runtime as eps_ration * mean(vectors distances)
+            for (int k = 0, i = 1; i < N; ++i)
+                for (int j = 0; j < i; ++j)
+                    dist(k++) = length(colm(data, i) - colm(data, j));
+
+            eps = eps_ratio * mean(dist);
+            dist = lowerbound(dist, eps);
+        }
+
+        // ----------------------------------------------------------------------------------------
+
+        template <typename matrix_type>
+        void do_sammon_projection(
+            const std::vector<matrix_type>& data,       // input data
+            unsigned long num_dims,                     // dimension of the reduced space
+            std::vector<matrix<double,0,1> >& result,   // projections (output)
+            double &err,                                // error (output)
+            unsigned long num_iters = 1000,             // max num of iterations: stop condition
+            const double err_delta = 1.0e-9             // delta error: stop condition
+        )
+        /*!
+            requires
+                - matrix_type should be a kind of dlib::matrix<double,N,1>
+                - num_dims > 0
+                - num_iters > 0
+                - err_delta > 0
+            ensures
+                - result == a set of matrix<double,num_dims,1> objects that represent
+                  the Sammon's projections of data vectors.
+                - err == the estimated error done in the projection, with the extra
+                  property that err(at previous iteration) - err < err_delta
+        !*/
+        {
+            // other params
+            const double mf = 0.3;                      // magic factor
+
+            matrix<double> mdata;                // input data as matrix
+            matrix<double> projs;                // projected vectors, i.e. output data as matrix
+
+            // std::vector<matrix> -> matrix
+            mdata.set_size(data[0].size(), data.size());
+            for (unsigned int i = 0; i < data.size(); i++)
+                set_colm(mdata, i) = data[i];
+
+            const long N = mdata.nc();           // num of points
+            const long d = num_dims;             // size of the reduced space
+            const long nd = N * (N - 1) / 2;     // num of pairs of points = size of the distances vectors
+
+            matrix<double, 0, 1> dsij, inv_dsij; // d*_ij: pair-wise distances in the input space (and inverses)
+            dsij.set_size(nd, 1);
+            inv_dsij.set_size(nd, 1);
+            double ic; // 1.0 / sum of dsij
+
+            matrix<double, 0, 1> dij;            // d_ij: pair-wise distances in the reduced space
+            dij.set_size(nd, 1);
+
+            matrix<double, 0, 0> dE, dE2, dtemp; // matrices representing error partial derivatives
+            dE.set_size(d, N);
+            dE2.set_size(d, N);
+            dtemp.set_size(d, N);
+
+            matrix<double, 0, 1> inv_dij, alpha; // utility vectors used to compute the partial derivatives
+            inv_dij.set_size(N, 1);              // inv_dij is 1.0/dij, but we only need it column-wise
+            alpha.set_size(N, 1);                // (slightly wasting a bit of computation)
+            // alpha = 1.0/dij - 1.0/dsij, again column-wise
+
+
+            // initialize projs with PCA
+            discriminant_pca<matrix<double> > dpca;
+            for (int i = 0; i < mdata.nc(); ++i)
+            {
+                dpca.add_to_total_variance(colm(mdata, i));
+            }
+            matrix<double> mat = dpca.dpca_matrix_of_size(num_dims);
+            projs = mat * mdata;
+
+            // compute dsij, inv_dsij and ic
+            compute_relative_distances(dsij, mdata);
+            inv_dsij = 1.0 / dsij;
+            ic = 1.0 / sum(dsij);
+
+            // compute dij and err
+            compute_relative_distances(dij, projs);
+            err = ic * sum(pointwise_multiply(squared(dij - dsij), inv_dsij));
+
+            // start iterating
+            while (num_iters--)
+            {
+                // compute dE, dE2 progressively column by column
+                for (int p = 0; p < N; ++p)
+                {
+                    // compute
+                    // - alpha_p, the column vector with 1/d_pj - 1/d*_pj
+                    // - dtemp, the matrix with the p-th column repeated all along
+                    //TODO: optimize constructions
+                    for (int i = 0; i < N; ++i)
+                    {
+                        int pos = (i < p) ? p * (p - 1) / 2 + i : i * (i - 1) / 2 + p;
+                        inv_dij(i) = (i == p) ? 0.0 : 1.0 / dij(pos);
+                        alpha(i) = (i == p) ? 0.0 : inv_dij(i) - inv_dsij(pos);
+                        set_colm(dtemp, i) = colm(projs, p);
+                    }
+
+                    dtemp -= projs;
+                    set_colm(dE, p) = dtemp * alpha;
+
+                    double sum_alpha = sum(alpha);
+                    set_colm(dE2, p) = abs( sum_alpha + squared(dtemp) * cubed(inv_dij) );
+                }
+
+
+                // compute the update projections
+                projs += pointwise_multiply(dE, mf * reciprocal(dE2));
+
+                // compute new dij and error
+                compute_relative_distances(dij, projs);
+                double err_new = ic * sum( pointwise_multiply(squared(dij - dsij), inv_dsij) );
+                if (err - err_new < err_delta)
+                    break;
+                err = err_new;
+            }
+
+            // matrix -> std::vector<matrix>
+            result.clear();
+            for (int i = 0; i < projs.nc(); ++i)
+                result.push_back(colm(projs, i));
+        }
+
+    };
+
+} // namespace dlib
+
+#endif // DLIB_SAMMoN_H__
+
--- a/dlib/statistics/sammon_abstract.h
+++ b/dlib/statistics/sammon_abstract.h
+// Copyright (C) 2012  Emanuele Cesena (emanuele.cesena@gmail.com), Davis E. King
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_SAMMoN_ABSTRACT_H__
+#ifdef DLIB_SAMMoN_ABSTRACT_H__
+
+#include "../matrix/matrix_abstract.h"
+#include <vector>
+
+namespace dlib
+{
+
+    class sammon_projection
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is a function object that computes the Sammon projection of a set
+                of N points in a L-dimensional vector space onto a d-dimensional space
+                (d < L), according to the paper:
+                    A Nonlinear Mapping for Data Structure Analysis (1969) by J.W. Sammon
+
+                The current implementation is a vectorized version of the original algorithm.
+        !*/
+
+    public:
+
+        sammon_projection(
+        );
+        /*!
+            ensures
+                - this object is properly initialized 
+        !*/
+
+        template <typename matrix_type>
+        std::vector<matrix<double,0,1> > operator() (
+            const std::vector<matrix_type>& data,       
+            long num_dims                      
+        );
+        /*!
+            requires
+                - num_dims > 0
+                - matrix_type should be a kind of dlib::matrix of doubles capable
+                  of representing column vectors.
+                - for all valid i:
+                    - is_col_vector(data[i]) == true
+                    - data[0].size() == data[i].size()
+                      (i.e. all the vectors in data must have the same dimensionality)
+                - if (data.size() != 0) then
+                    - 0 < num_dims <= data[0].size()
+                      (i.e. you can't project into a higher dimension than the input data,
+                      only to a lower dimension.)
+            ensures
+                - This routine computes Sammon's dimensionality reduction method based on the
+                  given input data.  It will attempt to project the contents of data into a
+                  num_dims dimensional space that preserves relative distances between the
+                  input data points.
+                - This function returns a std::vector, OUT, such that:
+                    - OUT == a set of column vectors that represent the Sammon's projection of 
+                      the input data vectors. 
+                    - OUT.size() == data.size()
+                    - for all valid i:
+                        - OUT[i].size() == num_dims
+                        - OUT[i] == the Sammon projection of the input vector data[i]
+        !*/
+
+        template <typename matrix_type>
+        void operator() (
+            const std::vector<matrix_type>& data,       
+            long num_dims,                     
+            std::vector<matrix<double,0,1> >& result,   
+            double &err,                                
+            unsigned long num_iters = 1000,             
+            const double err_delta = 1.0e-9            
+        );
+        /*!
+            requires
+                - num_iters > 0
+                - err_delta > 0
+                - num_dims > 0
+                - matrix_type should be a kind of dlib::matrix of doubles capable
+                  of representing column vectors.
+                - for all valid i:
+                    - is_col_vector(data[i]) == true
+                    - data[0].size() == data[i].size()
+                      (i.e. all the vectors in data must have the same dimensionality)
+                - if (data.size() != 0) then
+                    - 0 < num_dims <= data[0].size()
+                      (i.e. you can't project into a higher dimension than the input data,
+                      only to a lower dimension.)
+            ensures
+                - This routine computes Sammon's dimensionality reduction method based on the
+                  given input data.  It will attempt to project the contents of data into a
+                  num_dims dimensional space that preserves relative distances between the
+                  input data points.
+                - #err == the final error value at the end of the algorithm.  The goal of Sammon's
+                  algorithm is to find a lower dimensional projection of the input data that
+                  preserves the relative distances between points.  The value in #err is a measure
+                  of the total error at the end of the algorithm.  So smaller values indicate
+                  a better projection was found than if a large value is returned via #err.
+                - Sammon's algorithm will run until either num_iters iterations has executed
+                  or the change in error from one iteration to the next is less than err_delta.
+                - Upon completion, the output of Sammon's projection is stored into #result, in
+                  particular, we will have:
+                    - #result == a set of column vectors that represent the Sammon's projection of 
+                      the input data vectors. 
+                    - #result.size() == data.size()
+                    - for all valid i:
+                        - #result[i].size() == num_dims
+                        - #result[i] == the Sammon projection of the input vector data[i]
+        !*/
+
+    };
+
+} 
+
+#endif // DLIB_SAMMoN_ABSTRACT_H__
+
+
--- a/dlib/test/CMakeLists.txt
+++ b/dlib/test/CMakeLists.txt
@@ -91,6 +91,7 @@ set (tests
   read_write_mutex.cpp
   reference_counter.cpp
   rls.cpp
+   sammon.cpp
   scan_image.cpp
   sequence.cpp
   sequence_labeler.cpp

--- a/dlib/test/makefile
+++ b/dlib/test/makefile
@@ -106,6 +106,7 @@ SRC += rand.cpp
 SRC += read_write_mutex.cpp
 SRC += reference_counter.cpp
 SRC += rls.cpp
+SRC += sammon.cpp
 SRC += scan_image.cpp
 SRC += sequence.cpp
 SRC += sequence_labeler.cpp

--- a/dlib/test/sammon.cpp
+++ b/dlib/test/sammon.cpp
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+
+
+#include <sstream>
+#include <string>
+#include <cstdlib>
+#include <ctime>
+#include <cmath>
+#include <dlib/statistics.h>
+
+#include "tester.h"
+
+namespace  
+{
+
+    using namespace test;
+    using namespace dlib;
+    using namespace std;
+
+    logger dlog("test.sammon");
+
+
+    std::vector<matrix<double,4,1> > make_test_data4(
+    )
+    {
+        std::vector<matrix<double,4,1> > data;
+
+        matrix<double,4,1> m;
+
+        m = 0,0,0, 0; data.push_back(m);
+        m = 1,0,0, 0; data.push_back(m);
+        m = 0,1,0, 0; data.push_back(m);
+        m = 0,0,1, 0; data.push_back(m);
+
+        return data;
+    }
+
+    std::vector<matrix<double,3,1> > make_test_data3(
+    )
+    {
+        std::vector<matrix<double,3,1> > data;
+
+        matrix<double,3,1> m;
+
+        m = 0,0,0; data.push_back(m);
+        m = 1,0,0; data.push_back(m);
+        m = 0,1,0; data.push_back(m);
+        m = 0,0,1; data.push_back(m);
+
+        return data;
+    }
+
+    std::vector<matrix<double> > make_test_data3d(
+    )
+    {
+        std::vector<matrix<double> > data;
+
+        matrix<double,3,1> m;
+
+        m = 0,0,0; data.push_back(m);
+        m = 1,0,0; data.push_back(m);
+        m = 0,1,0; data.push_back(m);
+        m = 0,0,1; data.push_back(m);
+
+        return data;
+    }
+
+
+    void runtest()
+    {
+        sammon_projection s;
+        std::vector<matrix<double, 0, 1> >  projs = s(make_test_data3(),2);
+        running_stats<double> rs1, rs2;
+
+        rs1.add(length(projs[0] - projs[1]));
+        rs1.add(length(projs[0] - projs[2]));
+        rs1.add(length(projs[0] - projs[3]));
+
+        rs2.add(length(projs[1] - projs[2]));
+        rs2.add(length(projs[2] - projs[3]));
+        rs2.add(length(projs[3] - projs[1]));
+
+        DLIB_TEST(rs1.stddev()/rs1.mean() < 1e-4);
+        DLIB_TEST(rs2.stddev()/rs2.mean() < 1e-4);
+
+
+
+        projs = s(make_test_data4(),2);
+        rs1.clear();
+        rs2.clear();
+
+        rs1.add(length(projs[0] - projs[1]));
+        rs1.add(length(projs[0] - projs[2]));
+        rs1.add(length(projs[0] - projs[3]));
+
+        rs2.add(length(projs[1] - projs[2]));
+        rs2.add(length(projs[2] - projs[3]));
+        rs2.add(length(projs[3] - projs[1]));
+
+        DLIB_TEST(rs1.stddev()/rs1.mean() < 1e-4);
+        DLIB_TEST(rs2.stddev()/rs2.mean() < 1e-4);
+
+        projs = s(make_test_data3d(),2);
+        rs1.clear();
+        rs2.clear();
+
+        rs1.add(length(projs[0] - projs[1]));
+        rs1.add(length(projs[0] - projs[2]));
+        rs1.add(length(projs[0] - projs[3]));
+
+        rs2.add(length(projs[1] - projs[2]));
+        rs2.add(length(projs[2] - projs[3]));
+        rs2.add(length(projs[3] - projs[1]));
+
+        DLIB_TEST(rs1.stddev()/rs1.mean() < 1e-4);
+        DLIB_TEST(rs2.stddev()/rs2.mean() < 1e-4);
+    }
+
+    void runtest2()
+    {
+        sammon_projection s;
+        std::vector<matrix<double, 0, 1> >  projs, temp;
+
+        DLIB_TEST(s(projs,3).size() == 0);
+
+        matrix<double,2,1> m;
+        m = 1,2;
+        projs.push_back(m);
+        temp = s(projs,2);
+        DLIB_TEST(temp.size() == 1);
+        DLIB_TEST(temp[0].size() == 2);
+
+        projs.push_back(m);
+        temp = s(projs,1);
+        DLIB_TEST(temp.size() == 2);
+        DLIB_TEST(temp[0].size() == 1);
+        DLIB_TEST(temp[1].size() == 1);
+    }
+
+    void runtest3(int num_dims)
+    {
+        sammon_projection s;
+        std::vector<matrix<double, 0, 1> >  projs;
+        matrix<double,3,1> m;
+        m = 1, 1, 1;
+        projs.push_back(m);
+
+        m = 1, 2, 1;
+        projs.push_back(m);
+
+        m = 1, 3, 1;
+        projs.push_back(m);
+
+        projs = s(projs,num_dims);
+
+        const double d1a = length(projs[0] - projs[1]);
+        const double d1b = length(projs[1] - projs[2]);
+        const double d2  = length(projs[0] - projs[2]);
+
+        DLIB_TEST(std::abs(d1a-d1b)/d1a < 1e-8);
+        DLIB_TEST(std::abs(d2/d1a-2) < 1e-8);
+    }
+
+    void runtest4(int num_dims)
+    {
+        sammon_projection s;
+        std::vector<matrix<double, 0, 1> >  projs;
+        matrix<double,3,1> m;
+        m = 1, 1, 1;
+        projs.push_back(m);
+
+        m = 1, 2, 1;
+        projs.push_back(m);
+
+
+        projs = s(projs,num_dims);
+
+        DLIB_TEST(length(projs[0] - projs[1]) > 1e-5); 
+    }
+
+    class sammon_tester : public tester
+    {
+    public:
+        sammon_tester (
+        ) :
+            tester ("test_sammon",
+                    "Runs tests on the sammon_projection component.")
+        {}
+
+        void perform_test (
+        )
+        {
+            print_spinner();
+            runtest();
+            print_spinner();
+            runtest2();
+            print_spinner();
+            runtest3(2);
+            print_spinner();
+            runtest4(2);
+            runtest3(1);
+            print_spinner();
+            runtest4(1);
+        }
+    } a;
+
+}
+
+
+