Added the find_approximate_k_nearest_neighbors() function.

--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403653

Added the find_approximate_k_nearest_neighbors() function.
--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403653
979f8bf5 · Davis King · 1cd2e5e9 · 979f8bf5 · 979f8bf5 · 979f8bf5
Commit 979f8bf5 authored May 29, 2010 by Davis King
3 changed files
--- a/dlib/manifold_regularization/graph_creation.h
+++ b/dlib/manifold_regularization/graph_creation.h
@@ -114,6 +114,155 @@ namespace dlib
        out.assign(edges.begin(), edges.begin() + edges.size()*percent);
    }

+// ----------------------------------------------------------------------------------------
+
+    namespace impl2
+    {
+        /*
+            A directed edge record.  It plays the same role as sample_pair but keeps
+            (index1, index2) exactly in the order given to the constructor, so the
+            edge a->b and the edge b->a are two different helper objects.
+        */
+        struct helper
+        {
+            helper(
+                unsigned long idx1,
+                unsigned long idx2,
+                float dist
+            ) :
+                index1(idx1), index2(idx2), distance(dist)
+            {}
+
+            unsigned long index1;   // first index, as passed to the constructor
+            unsigned long index2;   // second index, as passed to the constructor
+            float distance;         // distance value, as passed to the constructor
+        };
+
+        // Strict lexicographic "less than" on (index1, index2).  The distance member
+        // does not take part in this comparison.
+        inline bool order_by_index (
+            const helper& a,
+            const helper& b
+        )
+        {
+            if (a.index1 != b.index1)
+                return a.index1 < b.index1;
+
+            return a.index2 < b.index2;
+        }
+
+        // "Less than" by distance, with ties on distance broken by order_by_index().
+        inline bool total_order_by_distance (
+            const helper& a,
+            const helper& b
+        )
+        {
+            if (a.distance < b.distance)
+                return true;
+
+            if (a.distance == b.distance)
+                return order_by_index(a,b);
+
+            return false;
+        }
+
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    /*
+        Builds an approximate k-nearest-neighbors graph by random edge sampling.
+
+        Pairs of distinct sample indices are drawn at random, each pair is measured
+        with dist_funct, and for every node at most k of its shortest sampled edges
+        are kept.  The surviving edges are sorted, consecutive equal edges are
+        dropped, and the result replaces the contents of out.
+
+        samples      - indexable container providing size() and operator[]
+        dist_funct   - called as dist_funct(samples[i], samples[j]); the result is
+                       stored as a float
+        k            - maximum number of edges kept per node
+        num          - number of random index pairs (with i != j) to sample
+        random_seed  - converted with cast_to_string() and used to seed the random
+                       number generator
+        out          - receives the resulting edges; previous contents are discarded
+
+        If samples.size() < 2, num == 0 or k == 0 (the conditions rejected by the
+        DLIB_ASSERT below) out is left empty.
+    */
+    template <
+        typename vector_type,
+        typename distance_function_type,
+        typename alloc,
+        typename T
+        >
+    void find_approximate_k_nearest_neighbors (
+        const vector_type& samples,
+        const distance_function_type& dist_funct,
+        const unsigned long k,
+        unsigned long num,
+        const T& random_seed,
+        std::vector<sample_pair, alloc>& out
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(samples.size() > 1 &&
+                    num > 0 && k > 0,
+            "\t void find_approximate_k_nearest_neighbors()"
+            << "\n\t Invalid inputs were given to this function."
+            << "\n\t samples.size(): " << samples.size()
+            << "\n\t k:              " << k  
+            << "\n\t num:            " << num 
+            );
+
+        // Defensive guard for builds in which the check above is not active.  Without
+        // it an empty samples container would cause a modulo by zero below, a single
+        // sample would make the sampling loop spin forever (idx1 always equals idx2),
+        // and num == 0 would lead to reading temp[0] from an empty vector.
+        if (samples.size() < 2 || num == 0 || k == 0)
+        {
+            out.clear();
+            return;
+        }
+
+        // we add each edge twice in the sampling loop (once per direction).  So
+        // multiply num by 2 to account for that.
+        num *= 2;
+
+        std::vector<impl2::helper> edges;
+        edges.reserve(num);
+        std::vector<sample_pair, alloc> temp;
+        temp.reserve(num);
+
+        dlib::rand::kernel_1a rnd;
+        rnd.set_seed(cast_to_string(random_seed));
+
+        // randomly sample a bunch of edges
+        while (edges.size() < num)
+        {
+            const unsigned long idx1 = rnd.get_random_32bit_number()%samples.size();
+            const unsigned long idx2 = rnd.get_random_32bit_number()%samples.size();
+            if (idx1 != idx2)
+            {
+                // The distance is evaluated once, in the order (idx1, idx2), and the
+                // same value is reused for the reversed edge.
+                // NOTE(review): this treats dist_funct as symmetric - confirm that is
+                // acceptable for all intended distance functions.
+                const float dist = dist_funct(samples[idx1], samples[idx2]);
+                edges.push_back(impl2::helper(idx1, idx2, dist));
+                edges.push_back(impl2::helper(idx2, idx1, dist));
+            }
+        }
+
+        // group the directed edges by their source node (index1)
+        std::sort(edges.begin(), edges.end(), &impl2::order_by_index);
+
+        // now copy edges into temp when they aren't duplicates and also only move in
+        // the k shortest for each index.
+        std::vector<impl2::helper>::iterator itr = edges.begin();
+        while (itr != edges.end())
+        {
+            // first find the bounding range [beg,itr) of all the edges connected to
+            // node itr->index1
+            std::vector<impl2::helper>::iterator beg = itr;
+            while (itr != edges.end() && itr->index1 == beg->index1)
+                ++itr;
+
+            // If the node has more than k edges then sort them by distance so that
+            // we will end up with the k best.  Otherwise the range stays sorted by
+            // index2.  Either way repeated (index1,index2) entries that carry the
+            // same distance sit next to each other, which the prev_index2 check
+            // below relies on.
+            if (static_cast<unsigned long>(itr - beg) > k)
+            {
+                std::sort(beg, itr, &impl2::total_order_by_distance);
+            }
+
+            // take the k best unique edges from the range [beg,itr).  The range is
+            // never empty so the first edge is always taken.
+            temp.push_back(sample_pair(beg->index1, beg->index2, beg->distance));
+            unsigned long prev_index2 = beg->index2;
+            ++beg;
+            unsigned long count = 1;
+            for (; beg != itr && count < k; ++beg)
+            {
+                if (beg->index2 != prev_index2)
+                {
+                    temp.push_back(sample_pair(beg->index1, beg->index2, beg->distance));
+                    ++count;
+                }
+                prev_index2 = beg->index2;
+            }
+        }
+
+
+        // now sort temp so that equal edges become adjacent and can be skipped in
+        // the final loop below
+        std::sort(temp.begin(), temp.end(), &order_by_index);
+
+
+        // now put edges into out while avoiding duplicates.  The i == 0 test keeps
+        // the loop valid even if temp is empty.
+        out.clear();
+        out.reserve(temp.size());
+        for (unsigned long i = 0; i < temp.size(); ++i)
+        {
+            if (i == 0 || temp[i] != temp[i-1])
+            {
+                out.push_back(temp[i]);
+            }
+        }
+    }
+
 // ----------------------------------------------------------------------------------------

    template <

--- a/dlib/manifold_regularization/graph_creation_abstract.h
+++ b/dlib/manifold_regularization/graph_creation_abstract.h
@@ -50,6 +50,47 @@ namespace dlib
              function.
    !*/

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type,
+        typename distance_function_type,
+        typename alloc,
+        typename T
+        >
+    void find_approximate_k_nearest_neighbors (
+        const vector_type& samples,
+        const distance_function_type& dist_funct,
+        const unsigned long k,
+        const unsigned long num,
+        const T& random_seed,
+        std::vector<sample_pair, alloc>& out
+    );
+    /*!
+        requires
+            - samples.size() > 1
+            - k > 0
+            - num > 0
+            - random_seed must be convertible to a string by dlib::cast_to_string()
+            - dist_funct(samples[i], samples[j]) must be a valid expression that evaluates
+              to a floating point number (the value is stored as a float)
+        ensures
+            - This function computes an approximate form of k nearest neighbors.  As num grows 
+              larger the output of this function converges to the output of the 
+              find_k_nearest_neighbors() function defined below.
+            - Specifically, this function randomly samples pairs of integers (i,j), where
+              i and j are each between 0 and samples.size()-1 inclusive.  Pairs with
+              i == j are discarded and do not count towards num.  For each of the
+              remaining pairs a sample_pair is created as follows:    
+                sample_pair(i, j, dist_funct(samples[i], samples[j]))
+              num such sample_pair objects are generated and then exact k-nearest-neighbors
+              is performed amongst these sample_pairs (i.e. each sample keeps at most k of
+              its shortest sampled edges) and the results are stored into #out.
+            - Any previous contents of out are discarded.
+            - contains_duplicate_pairs(#out) == false
+            - for all valid i:
+                - #out[i].distance() == dist_funct(samples[#out[i].index1()], samples[#out[i].index2()])
+            - random_seed is used to seed the random number generator used by this 
+              function.
+    !*/
+
 // ----------------------------------------------------------------------------------------

    template <

--- a/dlib/test/linear_manifold_regularizer.cpp
+++ b/dlib/test/linear_manifold_regularizer.cpp
@@ -34,10 +34,13 @@ namespace
                0                     // the number of command line arguments for this test
            )
        {
+            seed = 1;
        }

        dlib::rand::float_1a rnd;

+        unsigned long seed;
+
        typedef matrix<double, 0, 1> sample_type;
        typedef radial_basis_kernel<sample_type> kernel_type;

@@ -213,6 +216,33 @@ namespace

        }

+        // Runs find_approximate_k_nearest_neighbors() with k == 1 on five 2D points
+        // (the origin plus the four points (+-1,+-1)) and checks that exactly the
+        // four edges joining sample 0 to samples 1..4 come back.
+        void test_knn1_approx()
+        {
+            // sample 0 is the origin, samples 1..4 are the surrounding corner points
+            static const double coords[5][2] = { {0,0}, {1,1}, {1,-1}, {-1,1}, {-1,-1} };
+
+            std::vector<matrix<double,2,1> > samples;
+            for (unsigned long i = 0; i < 5; ++i)
+            {
+                matrix<double,2,1> point;
+                point = coords[i][0], coords[i][1];
+                samples.push_back(point);
+            }
+
+            // The graph is tiny and the number of sampled pairs (10000) is large, so
+            // the approximate search is expected to give the exact knn solution.
+            std::vector<sample_pair> edges;
+            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
+            DLIB_TEST(edges.size() == 4);
+
+            // put the edges into a fixed order before comparing them one by one
+            std::sort(edges.begin(), edges.end(), &order_by_index);
+
+            // NOTE(review): the third sample_pair argument is 0 here, so these checks
+            // presumably compare the indices only - confirm against sample_pair's operator==.
+            DLIB_TEST(edges[0] == sample_pair(0,1,0));
+            DLIB_TEST(edges[1] == sample_pair(0,2,0));
+            DLIB_TEST(edges[2] == sample_pair(0,3,0));
+            DLIB_TEST(edges[3] == sample_pair(0,4,0));
+
+        }
+
        void test_knn2()
        {
            std::vector<matrix<double,2,1> > samples;
@@ -237,6 +267,32 @@ namespace

        }

+        // Runs find_approximate_k_nearest_neighbors() with k == 2 on the four 2D points
+        // (+-1,+-1) and checks that exactly the edges 0-1, 0-2, 1-3 and 2-3 come back.
+        void test_knn2_approx()
+        {
+            static const double coords[4][2] = { {1,1}, {1,-1}, {-1,1}, {-1,-1} };
+
+            std::vector<matrix<double,2,1> > samples;
+            for (unsigned long i = 0; i < 4; ++i)
+            {
+                matrix<double,2,1> point;
+                point = coords[i][0], coords[i][1];
+                samples.push_back(point);
+            }
+
+            // The graph is tiny and the number of sampled pairs (10000) is large, so
+            // the approximate search is expected to give the exact knn solution.
+            std::vector<sample_pair> edges;
+            find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed,  edges);
+            DLIB_TEST(edges.size() == 4);
+
+            // put the edges into a fixed order before comparing them one by one
+            std::sort(edges.begin(), edges.end(), &order_by_index);
+
+            // NOTE(review): the third sample_pair argument is 0 here, so these checks
+            // presumably compare the indices only - confirm against sample_pair's operator==.
+            DLIB_TEST(edges[0] == sample_pair(0,1,0));
+            DLIB_TEST(edges[1] == sample_pair(0,2,0));
+            DLIB_TEST(edges[2] == sample_pair(1,3,0));
+            DLIB_TEST(edges[3] == sample_pair(2,3,0));
+
+        }
+
        void perform_test (
        )
        {
@@ -244,6 +300,9 @@ namespace
            {
                do_the_test();

+                ++seed;
+                test_knn1_approx();
+                test_knn2_approx();
            }
            test_knn1();
            test_knn2();