Commit f2371195 authored by Davis King's avatar Davis King

Added newman_cluster(), chinese_whispers(), and modularity() routines.

parent d598fcf2
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CLuSTERING_
#define DLIB_CLuSTERING_
#include "clustering/modularity_clustering.h"
#include "clustering/chinese_whispers.h"
#include "svm/kkmeans.h"
#endif // DLIB_CLuSTERING_
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CHINESE_WHISPErS_H__
#define DLIB_CHINESE_WHISPErS_H__
#include "chinese_whispers_abstract.h"
#include <algorithm>
#include <limits>
#include <map>
#include <utility>
#include <vector>
#include "../rand.h"
#include "../manifold_regularization/graph_creation.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
unsigned long chinese_whispers (
const std::vector<ordered_sample_pair>& edges,
std::vector<unsigned long>& labels,
const unsigned long num_iterations,
dlib::rand& rnd
)
{
// make sure requires clause is not broken
DLIB_ASSERT(is_ordered_by_index(edges),
"\t unsigned long chinese_whispers()"
<< "\n\t Invalid inputs were given to this function"
);
std::vector<std::pair<unsigned long, unsigned long> > neighbors;
find_neighbor_ranges(edges, neighbors);
// Initialize the labels, each node gets a different label.
labels.resize(neighbors.size());
for (unsigned long i = 0; i < labels.size(); ++i)
labels[i] = i;
for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter)
{
// Pick a random node.
const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size();
// Count how many times each label happens amongst our neighbors.
std::map<unsigned long, double> labels_to_counts;
const unsigned long end = neighbors[idx].second;
for (unsigned long i = neighbors[idx].first; i != end; ++i)
{
labels_to_counts[labels[edges[i].index2()]] += edges[i].distance();
}
// find the most common label
std::map<unsigned long, double>::iterator i;
double best_score = -std::numeric_limits<double>::infinity();
unsigned long best_label = labels[idx];
for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i)
{
if (i->second > best_score)
{
best_score = i->second;
best_label = i->first;
}
}
labels[idx] = best_label;
}
// Remap the labels into a contiguous range. First we find the
// mapping.
std::map<unsigned long,unsigned long> label_remap;
for (unsigned long i = 0; i < labels.size(); ++i)
{
const unsigned long next_id = label_remap.size();
if (label_remap.count(labels[i]) == 0)
label_remap[labels[i]] = next_id;
}
// now apply the mapping to all the labels.
for (unsigned long i = 0; i < labels.size(); ++i)
{
labels[i] = label_remap[labels[i]];
}
return label_remap.size();
}
// ----------------------------------------------------------------------------------------
// Convenience overload of chinese_whispers() for edge lists made of
// sample_pair objects.
//
// The edges are converted into ordered_sample_pair objects with
// convert_unordered_to_ordered(), sorted with order_by_index, and then handed
// to the ordered_sample_pair overload together with labels, num_iterations
// and rnd.  The return value is the number of clusters it reports.
//
// Declared inline since it is defined in a header; otherwise including this
// header from several translation units breaks the link.
inline unsigned long chinese_whispers (
    const std::vector<sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations,
    dlib::rand& rnd
)
{
    std::vector<ordered_sample_pair> oedges;
    convert_unordered_to_ordered(edges, oedges);

    // The ordered_sample_pair overload requires is_ordered_by_index(oedges).
    std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);

    return chinese_whispers(oedges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
// Overload of chinese_whispers() for sample_pair edge lists that supplies its
// own random number generator.
//
// A default constructed dlib::rand is used, and num_iterations defaults to
// 100.  Everything else is forwarded to the four argument sample_pair
// overload, whose return value (the number of clusters) is returned.
//
// Declared inline since it is defined in a header; otherwise including this
// header from several translation units breaks the link.
inline unsigned long chinese_whispers (
    const std::vector<sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations = 100
)
{
    dlib::rand rnd;
    return chinese_whispers(edges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
// Overload of chinese_whispers() for ordered_sample_pair edge lists that
// supplies its own random number generator.
//
// A default constructed dlib::rand is used, and num_iterations defaults to
// 100.  Everything else is forwarded to the four argument ordered_sample_pair
// overload, so edges must satisfy is_ordered_by_index(edges).  Returns the
// number of clusters that overload reports.
//
// Declared inline since it is defined in a header; otherwise including this
// header from several translation units breaks the link.
inline unsigned long chinese_whispers (
    const std::vector<ordered_sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations = 100
)
{
    dlib::rand rnd;
    return chinese_whispers(edges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CHINESE_WHISPErS_H__
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
#include <vector>
#include "../rand.h"
#include "../manifold_regularization/ordered_sample_pair_abstract.h"
#include "../manifold_regularization/sample_pair_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
unsigned long chinese_whispers (
const std::vector<ordered_sample_pair>& edges,
std::vector<unsigned long>& labels,
const unsigned long num_iterations,
dlib::rand& rnd
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- This function implements the graph clustering algorithm described in the
paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
Application to Natural Language Processing Problems by Chris Biemann.
- Interprets edges as a directed graph. That is, it contains the edges on the
said graph and the ordered_sample_pair::distance() values define the edge
weights (larger values indicating a stronger edge connection between the
nodes).
- returns the number of clusters found.
- #labels.size() == max_index_plus_one(edges)
- for all valid i:
- #labels[i] == the cluster ID of the node with index i in the graph.
- 0 <= #labels[i] < the number of clusters found
(i.e. cluster IDs are assigned contiguously and start at 0)
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
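- The algorithm performs num_iterations*N label updates before terminating,
where N == #labels.size() is the number of nodes. Each update relabels one
randomly selected node, so this amounts to num_iterations passes over the
graph on average rather than exactly num_iterations visits to every node.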
!*/
// ----------------------------------------------------------------------------------------
unsigned long chinese_whispers (
const std::vector<sample_pair>& edges,
std::vector<unsigned long>& labels,
const unsigned long num_iterations,
dlib::rand& rnd
);
/*!
ensures
- This function is identical to the above chinese_whispers() routine except
that it operates on a vector of sample_pair objects instead of
ordered_sample_pairs. Therefore, this is simply a convenience routine. In
particular, it is implemented by transforming the given edges into
ordered_sample_pairs and then calling the chinese_whispers() routine defined
above.
!*/
// ----------------------------------------------------------------------------------------
unsigned long chinese_whispers (
const std::vector<ordered_sample_pair>& edges,
std::vector<unsigned long>& labels,
const unsigned long num_iterations = 100
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- performs: return chinese_whispers(edges, labels, num_iterations, rnd)
where rnd is a default initialized dlib::rand object.
!*/
// ----------------------------------------------------------------------------------------
unsigned long chinese_whispers (
const std::vector<sample_pair>& edges,
std::vector<unsigned long>& labels,
const unsigned long num_iterations = 100
);
/*!
ensures
- performs: return chinese_whispers(edges, labels, num_iterations, rnd)
where rnd is a default initialized dlib::rand object.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_H__
This diff is collapsed.
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
#include <vector>
#include "../manifold_regularization/ordered_sample_pair_abstract.h"
#include "../manifold_regularization/sample_pair_abstract.h"
namespace dlib
{
// -----------------------------------------------------------------------------------------
double modularity (
const std::vector<sample_pair>& edges,
const std::vector<unsigned long>& labels
);
/*!
requires
- labels.size() == max_index_plus_one(edges)
ensures
- Interprets edges as an undirected graph. That is, it contains the edges on
the said graph and the sample_pair::distance() values define the edge weights
(larger values indicating a stronger edge connection between the nodes).
- This function returns the modularity value obtained when the given input
graph is broken into subgraphs according to the contents of labels. In
particular, we say that two nodes with indices i and j are in the same
subgraph or community if and only if labels[i] == labels[j].
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
- See the paper Modularity and community structure in networks by M. E. J. Newman
for a detailed definition.
!*/
// ----------------------------------------------------------------------------------------
double modularity (
const std::vector<ordered_sample_pair>& edges,
const std::vector<unsigned long>& labels
);
/*!
requires
- labels.size() == max_index_plus_one(edges)
ensures
- Interprets edges as a directed graph. That is, it contains the edges on the
said graph and the ordered_sample_pair::distance() values define the edge
weights (larger values indicating a stronger edge connection between the
nodes). Note that, generally, modularity is only really defined for
undirected graphs. Therefore, the "directed graph" given to this function
should have symmetric edges between all nodes. The reason this function is
provided at all is because sometimes a vector of ordered_sample_pair objects
is a useful representation of an undirected graph.
- This function returns the modularity value obtained when the given input
graph is broken into subgraphs according to the contents of labels. In
particular, we say that two nodes with indices i and j are in the same
subgraph or community if and only if labels[i] == labels[j].
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
- See the paper Modularity and community structure in networks by M. E. J. Newman
for a detailed definition.
!*/
// ----------------------------------------------------------------------------------------
unsigned long newman_cluster (
const std::vector<ordered_sample_pair>& edges,
std::vector<unsigned long>& labels,
const double eps = 1e-4,
const unsigned long max_iterations = 2000
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- This function performs the clustering algorithm described in the paper
Modularity and community structure in networks by M. E. J. Newman.
- This function interprets edges as a graph and attempts to find the labeling
that maximizes modularity(edges, #labels).
- returns the number of clusters found.
- #labels.size() == max_index_plus_one(edges)
- for all valid i:
- #labels[i] == the cluster ID of the node with index i in the graph.
- 0 <= #labels[i] < the number of clusters found
(i.e. cluster IDs are assigned contiguously and start at 0)
- The main computation of the algorithm is involved in finding an eigenvector
of a certain matrix. To do this, we use the power iteration. In particular,
each time we try to find an eigenvector we will let the power iteration loop
at most max_iterations times or until it reaches an accuracy of eps,
whichever comes first.
!*/
// ----------------------------------------------------------------------------------------
unsigned long newman_cluster (
const std::vector<sample_pair>& edges,
std::vector<unsigned long>& labels,
const double eps = 1e-4,
const unsigned long max_iterations = 2000
);
/*!
ensures
- This function is identical to the above newman_cluster() routine except that
it operates on a vector of sample_pair objects instead of
ordered_sample_pairs. Therefore, this is simply a convenience routine. In
particular, it is implemented by transforming the given edges into
ordered_sample_pairs and then calling the newman_cluster() routine defined
above.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment