Added newman_cluster(), chinese_whispers(), and modularity() routines.

f2371195 · Davis King · d598fcf2 · f2371195 · f2371195 · f2371195
Commit f2371195 authored Nov 05, 2012 by Davis King
5 changed files
--- a/dlib/clustering.h
+++ b/dlib/clustering.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CLuSTERING_
+#define DLIB_CLuSTERING_
+
+#include "clustering/modularity_clustering.h"
+#include "clustering/chinese_whispers.h"
+#include "svm/kkmeans.h"
+
+#endif // DLIB_CLuSTERING_
+
--- a/dlib/clustering/chinese_whispers.h
+++ b/dlib/clustering/chinese_whispers.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CHINESE_WHISPErS_H__
+#define DLIB_CHINESE_WHISPErS_H__
+
+#include "chinese_whispers_abstract.h"
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <utility>
+#include <vector>
+#include "../rand.h"
+#include "../manifold_regularization/graph_creation.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    // Clusters the nodes of the graph given by edges with the Chinese Whispers
+    // algorithm, writes each node's cluster ID into labels, and returns the number of
+    // clusters found.  The full contract is given in chinese_whispers_abstract.h.
+    //
+    // This function is declared inline because it is a non-template function defined
+    // in a header.  Without inline, including this header from more than one
+    // translation unit breaks the one definition rule and fails at link time.
+    inline unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_ordered_by_index(edges),
+                    "\t unsigned long chinese_whispers()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+
+        // neighbors[n] is used below as the half-open range [first, second) of
+        // positions in edges that are visited when node n is updated (presumably the
+        // edges with index1() == n, given the ordering required above).
+        std::vector<std::pair<unsigned long, unsigned long> > neighbors;
+        find_neighbor_ranges(edges, neighbors);
+
+        // Initialize the labels, each node gets a different label.
+        labels.resize(neighbors.size());
+        for (unsigned long i = 0; i < labels.size(); ++i)
+            labels[i] = i;
+
+
+        // One pass over the graph is neighbors.size() single node updates.  When the
+        // graph is empty the bound is 0, so the modulo below is never evaluated with a
+        // zero divisor.
+        for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter)
+        {
+            // Pick a random node.
+            const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size();
+
+            // Add up the edge weights going to each label found amongst our neighbors.
+            // Despite its name, labels_to_counts holds summed distance() values rather
+            // than counts, which is also why duplicate edges behave like one edge with
+            // the summed weight.
+            std::map<unsigned long, double> labels_to_counts;
+            const unsigned long end = neighbors[idx].second;
+            for (unsigned long i = neighbors[idx].first; i != end; ++i)
+            {
+                labels_to_counts[labels[edges[i].index2()]] += edges[i].distance();
+            }
+
+            // Find the label with the largest total weight.  The map is walked in
+            // ascending label order and the comparison is strict, so ties go to the
+            // smallest label.  A node with no edges keeps its current label.
+            std::map<unsigned long, double>::iterator i;
+            double best_score = -std::numeric_limits<double>::infinity();
+            unsigned long best_label = labels[idx];
+            for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i)
+            {
+                if (i->second > best_score)
+                {
+                    best_score = i->second;
+                    best_label = i->first;
+                }
+            }
+
+            labels[idx] = best_label;
+        }
+
+
+        // Remap the labels into a contiguous range starting at 0.  New IDs are handed
+        // out in the order the old labels are first encountered.  This is done in a
+        // single pass since each labels[i] is read before it is overwritten and the
+        // map is keyed on the old label values only.
+        std::map<unsigned long,unsigned long> label_remap;
+        for (unsigned long i = 0; i < labels.size(); ++i)
+        {
+            const unsigned long next_id = label_remap.size();
+            // insert() leaves an existing entry untouched, so next_id is only used
+            // the first time a given label is seen.
+            labels[i] = label_remap.insert(std::make_pair(labels[i], next_id)).first->second;
+        }
+
+        return label_remap.size();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Convenience overload for graphs stored as sample_pair objects.  The edges are
+    // converted into ordered_sample_pairs and sorted with order_by_index before being
+    // handed to the ordered_sample_pair overload, so the caller does not have to
+    // pre-sort anything.  Returns whatever that overload returns.
+    //
+    // This function is declared inline because it is a non-template function defined
+    // in a header.  Without inline, including this header from more than one
+    // translation unit breaks the one definition rule and fails at link time.
+    inline unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    )
+    {
+        std::vector<ordered_sample_pair> oedges;
+        convert_unordered_to_ordered(edges, oedges);
+        // The ordered_sample_pair overload asserts is_ordered_by_index(), so sort first.
+        std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
+
+        return chinese_whispers(oedges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Convenience overload that runs the four argument sample_pair version of
+    // chinese_whispers() with a default constructed dlib::rand that lives only for the
+    // duration of this call.  num_iterations defaults to 100.
+    //
+    // This function is declared inline because it is a non-template function defined
+    // in a header.  Without inline, including this header from more than one
+    // translation unit breaks the one definition rule and fails at link time.
+    inline unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    )
+    {
+        dlib::rand rnd;
+        return chinese_whispers(edges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Convenience overload that runs the four argument ordered_sample_pair version of
+    // chinese_whispers() with a default constructed dlib::rand that lives only for the
+    // duration of this call.  num_iterations defaults to 100.  The edges are forwarded
+    // unchanged, so they must already satisfy is_ordered_by_index().
+    //
+    // This function is declared inline because it is a non-template function defined
+    // in a header.  Without inline, including this header from more than one
+    // translation unit breaks the one definition rule and fails at link time.
+    inline unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    )
+    {
+        dlib::rand rnd;
+        return chinese_whispers(edges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_CHINESE_WHISPErS_H__
+
--- a/dlib/clustering/chinese_whispers_abstract.h
+++ b/dlib/clustering/chinese_whispers_abstract.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+
+#include <vector>
+#include "../rand.h"
+#include "../manifold_regularization/ordered_sample_pair_abstract.h"
+#include "../manifold_regularization/sample_pair_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - This function implements the graph clustering algorithm described in the
+              paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
+              Application to Natural Language Processing Problems by Chris Biemann.
+            - Interprets edges as a directed graph.  That is, edges lists the edges of
+              the graph and the ordered_sample_pair::distance() values define the edge
+              weights (larger values indicating a stronger edge connection between the
+              nodes).
+            - returns the number of clusters found.
+            - #labels.size() == max_index_plus_one(edges)
+            - for all valid i:
+                - #labels[i] == the cluster ID of the node with index i in the graph.
+                - 0 <= #labels[i] < the number of clusters found
+                  (i.e. cluster IDs are assigned contiguously and start at 0)
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance
+              values.
+            - The algorithm performs exactly num_iterations passes over the graph before
+              terminating.  Here a pass means #labels.size() label updates, where each
+              update is applied to a node selected at random using rnd.  Since nodes
+              are drawn at random, a single pass is not guaranteed to visit every node.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    );
+    /*!
+        ensures
+            - This function is identical to the above chinese_whispers() routine except
+              that it operates on a vector of sample_pair objects instead of
+              ordered_sample_pairs.  Therefore, this is simply a convenience routine.  In
+              particular, it is implemented by transforming the given edges into
+              ordered_sample_pairs, sorting them with order_by_index, and then calling
+              the chinese_whispers() routine defined above.
+            - Because the sorting is done internally, edges may be given in any order.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default initialized dlib::rand object that is local to
+              this call.  That is, this runs the four argument ordered_sample_pair
+              routine declared above with its own random number generator.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    );
+    /*!
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default initialized dlib::rand object that is local to
+              this call.  That is, this runs the four argument sample_pair routine
+              declared above with its own random number generator.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+
--- a/dlib/clustering/modularity_clustering.h
+++ b/dlib/clustering/modularity_clustering.h
--- a/dlib/clustering/modularity_clustering_abstract.h
+++ b/dlib/clustering/modularity_clustering_abstract.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+
+#include <vector>
+#include "../manifold_regularization/ordered_sample_pair_abstract.h"
+#include "../manifold_regularization/sample_pair_abstract.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    double modularity (
+        const std::vector<sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    );
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as an undirected graph.  That is, edges lists the edges of
+              the graph and the sample_pair::distance() values define the edge weights
+              (larger values indicating a stronger edge connection between the nodes).
+            - This function returns the modularity value obtained when the given input
+              graph is broken into subgraphs according to the contents of labels.  In
+              particular, we say that two nodes with indices i and j are in the same
+              subgraph or community if and only if labels[i] == labels[j].
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance
+              values.
+            - See the paper "Modularity and community structure in networks" by
+              M. E. J. Newman for a detailed definition of modularity.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    double modularity (
+        const std::vector<ordered_sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    );
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as a directed graph.  That is, edges lists the edges of
+              the graph and the ordered_sample_pair::distance() values define the edge
+              weights (larger values indicating a stronger edge connection between the
+              nodes).  Note that, generally, modularity is only really defined for
+              undirected graphs.  Therefore, the "directed graph" given to this function
+              should have symmetric edges between all nodes.  The reason this function is
+              provided at all is that a vector of ordered_sample_pair objects is
+              sometimes a useful representation of an undirected graph.
+            - This function returns the modularity value obtained when the given input
+              graph is broken into subgraphs according to the contents of labels.  In
+              particular, we say that two nodes with indices i and j are in the same
+              subgraph or community if and only if labels[i] == labels[j].
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance
+              values.
+            - See the paper "Modularity and community structure in networks" by
+              M. E. J. Newman for a detailed definition of modularity.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long newman_cluster (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - This function performs the clustering algorithm described in the paper
+              "Modularity and community structure in networks" by M. E. J. Newman.
+            - This function interprets edges as a graph and attempts to find the labeling
+              that maximizes modularity(edges, #labels).
+            - returns the number of clusters found.
+            - #labels.size() == max_index_plus_one(edges)
+            - for all valid i:
+                - #labels[i] == the cluster ID of the node with index i in the graph.
+                - 0 <= #labels[i] < the number of clusters found
+                  (i.e. cluster IDs are assigned contiguously and start at 0)
+            - The main computation of the algorithm is involved in finding an eigenvector
+              of a certain matrix.  To do this, we use the power iteration.  In
+              particular, each time we try to find an eigenvector we let the power
+              iteration loop run until it reaches an accuracy of eps or has executed
+              max_iterations times, whichever comes first.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long newman_cluster (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    );
+    /*!
+        ensures
+            - This function is identical to the above newman_cluster() routine except that
+              it operates on a vector of sample_pair objects instead of
+              ordered_sample_pairs.  Therefore, this is simply a convenience routine.  In
+              particular, it is implemented by transforming the given edges into
+              ordered_sample_pairs and then calling the newman_cluster() routine defined
+              above with the same labels, eps, and max_iterations arguments.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+