Commit 979f8bf5 authored by Davis King

Added the find_approximate_k_nearest_neighbors() function.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403653
parent 1cd2e5e9
......@@ -114,6 +114,155 @@ namespace dlib
out.assign(edges.begin(), edges.begin() + edges.size()*percent);
}
// ----------------------------------------------------------------------------------------
namespace impl2
{
    struct helper
    {
        /*
            A directed counterpart of sample_pair: the edge goes from index1
            to index2, and (index1,index2) is not treated as the same edge as
            (index2,index1).
        */

        unsigned long index1;
        unsigned long index2;
        float distance;

        helper(
            unsigned long idx1,
            unsigned long idx2,
            float dist
        ) : index1(idx1), index2(idx2), distance(dist)
        {
        }
    };

    // Lexicographic "less than" on the (index1, index2) pair.
    inline bool order_by_index (
        const helper& a,
        const helper& b
    )
    {
        if (a.index1 != b.index1)
            return a.index1 < b.index1;

        return a.index2 < b.index2;
    }

    // "Less than" by distance first; edges with equal distance fall back to
    // order_by_index() so that identical edges end up next to each other.
    inline bool total_order_by_distance (
        const helper& a,
        const helper& b
    )
    {
        if (a.distance < b.distance)
            return true;

        if (a.distance == b.distance)
            return order_by_index(a, b);

        return false;
    }
}
// ----------------------------------------------------------------------------------------
/*
    Approximate k-nearest-neighbors by random edge sampling.

    num random pairs (i,j) with i != j are drawn from samples, the distance of
    each pair is evaluated once with dist_funct, and the pair is recorded for
    both endpoints.  For every node at most the k shortest distinct sampled
    edges are kept, and the union of those edges, without duplicates, is
    written to out.  Any previous contents of out are discarded.

    The random number generator is seeded with cast_to_string(random_seed).

    If the preconditions (samples.size() > 1, num > 0, k > 0) are violated and
    DLIB_ASSERT does not stop execution, out is cleared and the function
    returns without sampling anything.
*/
template <
    typename vector_type,
    typename distance_function_type,
    typename alloc,
    typename T
    >
void find_approximate_k_nearest_neighbors (
    const vector_type& samples,
    const distance_function_type& dist_funct,
    const unsigned long k,
    unsigned long num,
    const T& random_seed,
    std::vector<sample_pair, alloc>& out
)
{
    // make sure requires clause is not broken
    DLIB_ASSERT(samples.size() > 1 &&
                num > 0 && k > 0,
        "\t void find_approximate_k_nearest_neighbors()"
        << "\n\t Invalid inputs were given to this function."
        << "\n\t samples.size(): " << samples.size()
        << "\n\t k: " << k
        << "\n\t num: " << num
        );

    // The assert above is not guaranteed to be active in every build.  Without
    // this guard an empty samples would cause a modulo by zero below, a single
    // sample would make the sampling loop spin forever (idx1 always equals
    // idx2), and num == 0 would lead to reading temp[0] from an empty vector.
    if (samples.size() <= 1 || num == 0 || k == 0)
    {
        out.clear();
        return;
    }

    // we add each edge twice in the following loop.  So multiply num by 2 to account for that.
    num *= 2;

    std::vector<impl2::helper> edges;
    edges.reserve(num);
    std::vector<sample_pair, alloc> temp;
    temp.reserve(num);

    dlib::rand::kernel_1a rnd;
    rnd.set_seed(cast_to_string(random_seed));

    // randomly sample a bunch of edges.  Pairs with idx1 == idx2 are rejected and
    // redrawn, so the loop ends only once num directed edges have been collected.
    while (edges.size() < num)
    {
        const unsigned long idx1 = rnd.get_random_32bit_number()%samples.size();
        const unsigned long idx2 = rnd.get_random_32bit_number()%samples.size();
        if (idx1 != idx2)
        {
            // evaluate the distance once and store it for both directions
            const float dist = dist_funct(samples[idx1], samples[idx2]);
            edges.push_back(impl2::helper(idx1, idx2, dist));
            edges.push_back(impl2::helper(idx2, idx1, dist));
        }
    }

    // group the edges by their source node (and by target node within a group)
    std::sort(edges.begin(), edges.end(), &impl2::order_by_index);

    std::vector<impl2::helper>::iterator beg, itr;

    // now copy edges into temp when they aren't duplicates and also only move in the k shortest for
    // each index.
    itr = edges.begin();
    while (itr != edges.end())
    {
        // first find the bounding range for all the edges connected to node itr->index1
        beg = itr;
        while (itr != edges.end() && itr->index1 == beg->index1)
            ++itr;

        // If the node has more than k edges then sort them by distance so that
        // we will end up with the k best.  The index tie-break in
        // total_order_by_distance() places edges with equal distance and equal
        // indices next to each other, which the prev_index2 check below uses to
        // skip repeated edges.
        if (static_cast<unsigned long>(itr - beg) > k)
        {
            std::sort(beg, itr, &impl2::total_order_by_distance);
        }

        // take the k best unique edges from the range [beg,itr)
        temp.push_back(sample_pair(beg->index1, beg->index2, beg->distance));
        unsigned long prev_index2 = beg->index2;
        ++beg;
        unsigned long count = 1;
        for (; beg != itr && count < k; ++beg)
        {
            if (beg->index2 != prev_index2)
            {
                temp.push_back(sample_pair(beg->index1, beg->index2, beg->distance));
                ++count;
            }
            prev_index2 = beg->index2;
        }
    }

    // now sort temp so that we can avoid duplicates in the final loop below
    std::sort(temp.begin(), temp.end(), &order_by_index);

    // now put edges into out while avoiding duplicates.  temp holds at least one
    // element here because the sampling loop produced at least two edges.
    out.clear();
    out.reserve(temp.size());
    out.push_back(temp[0]);
    for (unsigned long i = 1; i < temp.size(); ++i)
    {
        if (temp[i] != temp[i-1])
        {
            out.push_back(temp[i]);
        }
    }
}
// ----------------------------------------------------------------------------------------
template <
......
......@@ -50,6 +50,47 @@ namespace dlib
function.
!*/
// ----------------------------------------------------------------------------------------
template <
typename vector_type,
typename distance_function_type,
typename alloc,
typename T
>
void find_approximate_k_nearest_neighbors (
const vector_type& samples,
const distance_function_type& dist_funct,
const unsigned long k,
const unsigned long num,
const T& random_seed,
std::vector<sample_pair, alloc>& out
);
/*!
    requires
        - samples.size() > 1
        - k > 0
        - num > 0
        - random_seed must be convertible to a string by dlib::cast_to_string()
        - dist_funct(samples[i], samples[j]) must be a valid expression that evaluates
          to a floating point number
    ensures
        - This function computes an approximate form of k nearest neighbors.  As num grows
          larger the output of this function converges to the output of the
          find_k_nearest_neighbors() function defined below.
        - Specifically, this function randomly samples the space of pairs of integers between
          0 and samples.size()-1 inclusive.  Pairs with i == j are discarded and another
          pair is drawn in their place.  For each of the remaining pairs, (i,j), a
          sample_pair is created as follows:
            sample_pair(i, j, dist_funct(samples[i], samples[j]))
          num such sample_pair objects are generated and then exact k-nearest-neighbors
          is performed amongst these sample_pairs and the results are stored into #out.
        - Each sampled pair (i,j) is considered as a candidate neighbor for both node i
          and node j, and at most k distinct sampled edges are retained per node.
        - Any previous contents of out are discarded.
        - contains_duplicate_pairs(#out) == false
        - for all valid i:
            - #out[i].distance() == dist_funct(samples[#out[i].index1()], samples[#out[i].index2()])
        - random_seed is used to seed the random number generator used by this
          function.
!*/
// ----------------------------------------------------------------------------------------
template <
......
......@@ -34,10 +34,13 @@ namespace
0 // the number of command line arguments for this test
)
{
seed = 1;
}
dlib::rand::float_1a rnd;
unsigned long seed;
typedef matrix<double, 0, 1> sample_type;
typedef radial_basis_kernel<sample_type> kernel_type;
......@@ -213,6 +216,33 @@ namespace
}
// Checks find_approximate_k_nearest_neighbors() with k == 1 on five 2D points:
// the origin (index 0) and the four points (+-1,+-1) (indices 1 through 4).
void test_knn1_approx()
{
std::vector<matrix<double,2,1> > samples;
matrix<double,2,1> test;
test = 0,0; samples.push_back(test);
test = 1,1; samples.push_back(test);
test = 1,-1; samples.push_back(test);
test = -1,1; samples.push_back(test);
test = -1,-1; samples.push_back(test);
std::vector<sample_pair> edges;
// The point set is tiny and 10000 random pairs are sampled, so the approximate
// search is expected to return the exact knn solution.
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
// Expect exactly four edges, each joining the origin to one of the other points.
DLIB_TEST(edges.size() == 4);
// Sort by index so the edges can be compared position by position.
std::sort(edges.begin(), edges.end(), &order_by_index);
// NOTE(review): the expected pairs are built with a distance of 0, which is not
// the actual distance between these points -- presumably sample_pair equality
// compares only the two indices; confirm against sample_pair.
DLIB_TEST(edges[0] == sample_pair(0,1,0));
DLIB_TEST(edges[1] == sample_pair(0,2,0));
DLIB_TEST(edges[2] == sample_pair(0,3,0));
DLIB_TEST(edges[3] == sample_pair(0,4,0));
}
void test_knn2()
{
std::vector<matrix<double,2,1> > samples;
......@@ -237,6 +267,32 @@ namespace
}
// Checks find_approximate_k_nearest_neighbors() with k == 2 on the four points
// (+-1,+-1), i.e. the corners of a square (indices 0 through 3).
void test_knn2_approx()
{
std::vector<matrix<double,2,1> > samples;
matrix<double,2,1> test;
test = 1,1; samples.push_back(test);
test = 1,-1; samples.push_back(test);
test = -1,1; samples.push_back(test);
test = -1,-1; samples.push_back(test);
std::vector<sample_pair> edges;
// The point set is tiny and 10000 random pairs are sampled, so the approximate
// search is expected to return the exact knn solution.
find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed, edges);
// Expect exactly the four sides of the square.  The two diagonals, (0,3) and
// (1,2), join opposite corners and must not appear.
DLIB_TEST(edges.size() == 4);
// Sort by index so the edges can be compared position by position.
std::sort(edges.begin(), edges.end(), &order_by_index);
// NOTE(review): the expected pairs are built with a distance of 0, which is not
// the actual distance between these points -- presumably sample_pair equality
// compares only the two indices; confirm against sample_pair.
DLIB_TEST(edges[0] == sample_pair(0,1,0));
DLIB_TEST(edges[1] == sample_pair(0,2,0));
DLIB_TEST(edges[2] == sample_pair(1,3,0));
DLIB_TEST(edges[3] == sample_pair(2,3,0));
}
void perform_test (
)
{
......@@ -244,6 +300,9 @@ namespace
{
do_the_test();
++seed;
test_knn1_approx();
test_knn2_approx();
}
test_knn1();
test_knn2();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment