Commit 986273f2 authored by Davis King's avatar Davis King

Added find_clusters_using_angular_kmeans()

parent 2c8b159e
...@@ -477,6 +477,138 @@ namespace dlib ...@@ -477,6 +477,138 @@ namespace dlib
} }
} }
}
// ----------------------------------------------------------------------------------------
template <
    typename array_type,
    typename sample_type,
    typename alloc
    >
void find_clusters_using_angular_kmeans (
    const array_type& samples,
    std::vector<sample_type, alloc>& centers,
    unsigned long max_iter = 1000
)
{
    // K-means variant that groups samples by direction.  A sample is given to the
    // center with which it has the largest normalized dot product, and every center
    // is stored as a unit vector.  On entry centers holds the initial guesses and on
    // exit it holds the resulting unit length centers.  At most max_iter
    // assign/update rounds are executed.

    // Check the requires clause.
    DLIB_ASSERT(samples.size() > 0 && centers.size() > 0,
        "\tvoid find_clusters_using_angular_kmeans()"
        << "\n\tYou passed invalid arguments to this function"
        << "\n\t samples.size(): " << samples.size()
        << "\n\t centers.size(): " << centers.size()
        );

#ifdef ENABLE_ASSERTS
    {
        // Every sample has to be a vector with the same dimensions as samples[0].
        const long nr = samples[0].nr();
        const long nc = samples[0].nc();
        for (unsigned long i = 0; i < samples.size(); ++i)
        {
            DLIB_ASSERT(is_vector(samples[i]) && samples[i].nr() == nr && samples[i].nc() == nc,
                "\tvoid find_clusters_using_angular_kmeans()"
                << "\n\t You passed invalid arguments to this function"
                << "\n\t is_vector(samples[i]): " << is_vector(samples[i])
                << "\n\t samples[i].nr(): " << samples[i].nr()
                << "\n\t nr: " << nr
                << "\n\t samples[i].nc(): " << samples[i].nc()
                << "\n\t nc: " << nc
                << "\n\t i: " << i
                );
        }
    }
#endif

    typedef typename sample_type::type scalar_type;

    // An all zero vector shaped like a center.  The accumulators are reset to this
    // at the start of each update step.
    sample_type blank_center(centers[0]);
    set_all_elements(blank_center, 0);

    // Seed handed to gaussian_randm(); bumped after each use so that successive
    // replacement centers are requested with different seeds.
    unsigned long rand_seed = 0;

    // owner[s] is the index of the center sample s currently belongs to.  The
    // initial value samples.size() marks "not assigned yet".
    std::vector<unsigned long> owner(samples.size(), samples.size());

    // Length of each sample.  A zero length is stored as 1 so the division in the
    // assignment step needs no special case.  Such samples carry no direction, so
    // which cluster they land in is of no consequence.
    std::vector<double> sample_norms;
    for (unsigned long s = 0; s < samples.size(); ++s)
    {
        const double norm = length(samples[s]);
        sample_norms.push_back(norm == 0 ? 1 : norm);
    }

    // Scale the starting centers to unit length.  A center with no direction is
    // swapped for randomly drawn ones until a non-zero one turns up.
    for (unsigned long c = 0; c < centers.size(); ++c)
    {
        sample_type& center = centers[c];
        double norm = length(center);
        for (; norm == 0; norm = length(center))
        {
            center = matrix_cast<scalar_type>(gaussian_randm(center.nr(), center.nc(), rand_seed++));
        }
        center /= norm;
    }

    bool centers_changed = true;
    for (unsigned long round = 0; centers_changed && round < max_iter; ++round)
    {
        centers_changed = false;

        // Assignment step: hand each sample to the center it points toward the most.
        for (unsigned long s = 0; s < samples.size(); ++s)
        {
            // The score is the negated, length normalized dot product, so the
            // smallest score corresponds to the best aligned center.
            unsigned long winner = 0;
            scalar_type lowest_score = std::numeric_limits<scalar_type>::max();
            for (unsigned long c = 0; c < centers.size(); ++c)
            {
                const scalar_type score = -dot(centers[c],samples[s])/sample_norms[s];
                if (score < lowest_score)
                {
                    winner = c;
                    lowest_score = score;
                }
            }

            if (owner[s] != winner)
            {
                owner[s] = winner;
                centers_changed = true;
            }
        }

        // Update step: each center becomes the sum of the samples it owns...
        centers.assign(centers.size(), blank_center);
        for (unsigned long s = 0; s < samples.size(); ++s)
        {
            centers[owner[s]] += samples[s];
        }

        // ...scaled back to unit length.  A center whose sum has zero length is
        // replaced by a random one, which also forces another round.
        for (unsigned long c = 0; c < centers.size(); ++c)
        {
            sample_type& center = centers[c];
            double norm = length(center);
            for (; norm == 0; norm = length(center))
            {
                center = matrix_cast<scalar_type>(gaussian_randm(center.nr(), center.nc(), rand_seed++));
                centers_changed = true;
            }
            center /= norm;
        }
    }
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -283,7 +283,7 @@ namespace dlib ...@@ -283,7 +283,7 @@ namespace dlib
- centers.size() > 0 - centers.size() > 0
- array_type == something with an interface compatible with std::vector - array_type == something with an interface compatible with std::vector
and it must contain row or column vectors capable of being stored in and it must contain row or column vectors capable of being stored in
sample_type objects sample_type objects.
- sample_type == a dlib::matrix capable of representing vectors - sample_type == a dlib::matrix capable of representing vectors
ensures ensures
- performs regular old linear kmeans clustering on the samples. The clustering - performs regular old linear kmeans clustering on the samples. The clustering
...@@ -293,6 +293,46 @@ namespace dlib ...@@ -293,6 +293,46 @@ namespace dlib
terminates. terminates.
!*/ !*/
// ----------------------------------------------------------------------------------------
template <
typename array_type,
typename sample_type,
typename alloc
>
void find_clusters_using_angular_kmeans (
const array_type& samples,
std::vector<sample_type, alloc>& centers,
unsigned long max_iter = 1000
);
/*!
requires
- samples.size() > 0
- samples == a bunch of row or column vectors and they all must be of the
same length.
- centers.size() > 0
- array_type == something with an interface compatible with std::vector
and it must contain row or column vectors capable of being stored in
sample_type objects.
- sample_type == a dlib::matrix capable of representing vectors
ensures
- performs linear kmeans clustering on the samples, except instead of using
Euclidean distance to compare samples to the centers it uses the angle
between a sample and a center (with respect to the origin). So we try to
cluster samples together if they have small angles with respect to each
other. The clustering begins with the initial set of centers given as an
argument to this function. When it finishes #centers will contain the
resulting centers.
- for all valid i:
- length(#centers[i]) == 1
(i.e. the output centers are scaled to be unit vectors since their
magnitude is irrelevant. Moreover, this makes it so you can use
functions like nearest_center() with #centers to find the cluster
assignments.)
- #centers.size() == centers.size()
(i.e. the number of centers is never changed by this function.)
- Any center that has a length of 0, either because it was given that way or
because the samples assigned to it sum to the zero vector (e.g. no samples
were assigned to it), is replaced by a randomly generated vector before it
is scaled to unit length.
- Samples with a length of 0 are allowed. They have no direction, so the
cluster they end up assigned to carries no meaning.
- No more than max_iter iterations will be performed before this function
terminates. In particular, if max_iter == 0 then no clustering is performed
and #centers just contains the input centers scaled to unit length.
!*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
template < template <
......
...@@ -44,32 +44,63 @@ namespace ...@@ -44,32 +44,63 @@ namespace
randomize_samples(samples); randomize_samples(samples);
std::vector<sample_type> centers; {
pick_initial_centers(seed_centers.size(), centers, samples, linear_kernel<sample_type>()); std::vector<sample_type> centers;
pick_initial_centers(seed_centers.size(), centers, samples, linear_kernel<sample_type>());
find_clusters_using_kmeans(samples, centers); find_clusters_using_kmeans(samples, centers);
DLIB_TEST(centers.size() == seed_centers.size()); DLIB_TEST(centers.size() == seed_centers.size());
std::vector<int> hits(centers.size(),0); std::vector<int> hits(centers.size(),0);
for (unsigned long i = 0; i < samples.size(); ++i) for (unsigned long i = 0; i < samples.size(); ++i)
{
unsigned long best_idx = 0;
double best_dist = 1e100;
for (unsigned long j = 0; j < centers.size(); ++j)
{ {
if (length(samples[i] - centers[j]) < best_dist) unsigned long best_idx = 0;
double best_dist = 1e100;
for (unsigned long j = 0; j < centers.size(); ++j)
{ {
best_dist = length(samples[i] - centers[j]); if (length(samples[i] - centers[j]) < best_dist)
best_idx = j; {
best_dist = length(samples[i] - centers[j]);
best_idx = j;
}
} }
hits[best_idx]++;
} }
hits[best_idx]++;
}
for (unsigned long i = 0; i < hits.size(); ++i) for (unsigned long i = 0; i < hits.size(); ++i)
{
DLIB_TEST(hits[i] == 250);
}
}
{ {
DLIB_TEST(hits[i] == 250); std::vector<sample_type> centers;
pick_initial_centers(seed_centers.size(), centers, samples, linear_kernel<sample_type>());
find_clusters_using_angular_kmeans(samples, centers);
DLIB_TEST(centers.size() == seed_centers.size());
std::vector<int> hits(centers.size(),0);
for (unsigned long i = 0; i < samples.size(); ++i)
{
unsigned long best_idx = 0;
double best_dist = 1e100;
for (unsigned long j = 0; j < centers.size(); ++j)
{
if (length(samples[i] - centers[j]) < best_dist)
{
best_dist = length(samples[i] - centers[j]);
best_idx = j;
}
}
hits[best_idx]++;
}
for (unsigned long i = 0; i < hits.size(); ++i)
{
DLIB_TEST(hits[i] == 250);
}
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment