Added newman_cluster(), chinese_whispers(), and modularity() routines.

f2371195 · Davis King · d598fcf2 · f2371195 · f2371195 · f2371195
Commit f2371195 authored Nov 05, 2012 by Davis King
5 changed files
--- a/dlib/clustering.h
+++ b/dlib/clustering.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CLuSTERING_
+#define DLIB_CLuSTERING_
+
+#include "clustering/modularity_clustering.h"
+#include "clustering/chinese_whispers.h"
+#include "svm/kkmeans.h"
+
+#endif // DLIB_CLuSTERING_
+
--- a/dlib/clustering/chinese_whispers.h
+++ b/dlib/clustering/chinese_whispers.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CHINESE_WHISPErS_H__
+#define DLIB_CHINESE_WHISPErS_H__
+
+#include "chinese_whispers_abstract.h"
+#include <vector>
+#include "../rand.h"
+#include "../manifold_regularization/graph_creation.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    )
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - Clusters the nodes of the graph described by edges using the Chinese
+              Whispers label propagation scheme.  edges[i].distance() is used as the
+              weight of edge i (larger means a stronger connection).
+            - Performs #labels.size()*num_iterations label updates.  Each update picks a
+              node at random using rnd and gives it the label that has the largest
+              summed edge weight amongst that node's neighbors.
+            - #labels[i] == the cluster ID of node i.  IDs are contiguous and start at 0.
+            - returns the number of distinct cluster IDs in #labels.
+        note
+            - This function is defined in a header, so it is declared inline to avoid
+              multiple definition errors when the header is included by more than one
+              translation unit.
+    !*/
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_ordered_by_index(edges),
+                    "\t unsigned long chinese_whispers()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+
+        // neighbors[n] is the half open range [first,second) of positions in edges
+        // that hold the edges leaving node n.
+        std::vector<std::pair<unsigned long, unsigned long> > neighbors;
+        find_neighbor_ranges(edges, neighbors);
+
+        // Initialize the labels, each node gets a different label.
+        labels.resize(neighbors.size());
+        for (unsigned long i = 0; i < labels.size(); ++i)
+            labels[i] = i;
+
+
+        for (unsigned long iter = 0; iter < neighbors.size()*num_iterations; ++iter)
+        {
+            // Pick a random node.
+            const unsigned long idx = rnd.get_random_64bit_number()%neighbors.size();
+
+            // Sum the edge weights going to each label amongst our neighbors.
+            std::map<unsigned long, double> labels_to_counts;
+            const unsigned long end = neighbors[idx].second;
+            for (unsigned long i = neighbors[idx].first; i != end; ++i)
+            {
+                labels_to_counts[labels[edges[i].index2()]] += edges[i].distance();
+            }
+
+            // Find the label with the largest summed weight.  The comparison is strict
+            // and std::map iterates in increasing key order, so ties go to the
+            // smallest label.  A node without any neighbors keeps its current label.
+            std::map<unsigned long, double>::iterator i;
+            double best_score = -std::numeric_limits<double>::infinity();
+            unsigned long best_label = labels[idx];
+            for (i = labels_to_counts.begin(); i != labels_to_counts.end(); ++i)
+            {
+                if (i->second > best_score)
+                {
+                    best_score = i->second;
+                    best_label = i->first;
+                }
+            }
+
+            labels[idx] = best_label;
+        }
+
+
+        // Remap the labels into a contiguous range.  First we find the
+        // mapping.  IDs are handed out in order of first appearance in labels.
+        std::map<unsigned long,unsigned long> label_remap;
+        for (unsigned long i = 0; i < labels.size(); ++i)
+        {
+            const unsigned long next_id = label_remap.size();
+            if (label_remap.count(labels[i]) == 0)
+                label_remap[labels[i]] = next_id;
+        }
+        // now apply the mapping to all the labels.
+        for (unsigned long i = 0; i < labels.size(); ++i)
+        {
+            labels[i] = label_remap[labels[i]];
+        }
+
+        return label_remap.size();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    )
+    /*!
+        ensures
+            - Convenience overload for graphs stored as unordered sample_pair objects.
+              The edges are converted to ordered_sample_pair objects, sorted with
+              order_by_index(), and then given to the ordered_sample_pair version of
+              chinese_whispers() along with labels, num_iterations, and rnd.
+            - returns whatever that call returns.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        std::vector<ordered_sample_pair> oedges;
+        convert_unordered_to_ordered(edges, oedges);
+        // The ordered overload requires is_ordered_by_index(oedges).
+        std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
+
+        return chinese_whispers(oedges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    )
+    /*!
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default constructed dlib::rand object.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        dlib::rand rnd;
+        return chinese_whispers(edges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    )
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default constructed dlib::rand object.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        dlib::rand rnd;
+        return chinese_whispers(edges, labels, num_iterations, rnd);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_CHINESE_WHISPErS_H__
+
--- a/dlib/clustering/chinese_whispers_abstract.h
+++ b/dlib/clustering/chinese_whispers_abstract.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+
+#include <vector>
+#include "../rand.h"
+#include "../manifold_regularization/ordered_sample_pair_abstract.h"
+#include "../manifold_regularization/sample_pair_abstract.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - This function implements the graph clustering algorithm described in the
+              paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
+              Application to Natural Language Processing Problems by Chris Biemann.
+            - Interprets edges as a directed graph.  That is, it contains the edges on the
+              said graph and the ordered_sample_pair::distance() values define the edge
+              weights (larger values indicating a stronger edge connection between the
+              nodes).
+            - returns the number of clusters found.
+            - #labels.size() == max_index_plus_one(edges)
+            - for all valid i:
+                - #labels[i] == the cluster ID of the node with index i in the graph.  
+                - 0 <= #labels[i] < the number of clusters found
+                  (i.e. cluster IDs are assigned contiguously and start at 0) 
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance values.
+            - The algorithm performs exactly num_iterations passes over the graph before
+              terminating.  Here a pass means #labels.size() label updates, each applied
+              to a node selected at random using rnd.  Therefore, a single pass is not
+              guaranteed to visit every node.
+            - A node which has no outgoing edges is never relabeled.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations,
+        dlib::rand& rnd
+    );
+    /*!
+        ensures
+            - This function is identical to the above chinese_whispers() routine except
+              that it operates on a vector of sample_pair objects instead of
+              ordered_sample_pairs.  Therefore, this is simply a convenience routine.  In
+              particular, it is implemented by transforming the given edges into
+              ordered_sample_pairs, sorting them by index, and then calling the
+              chinese_whispers() routine defined above.  
+            - Since the sorting is done internally, edges does not need to be in any
+              particular order.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default initialized dlib::rand object.
+            - That is, this is the ordered_sample_pair version of chinese_whispers()
+              for callers who do not want to supply their own random number generator.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long chinese_whispers (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const unsigned long num_iterations = 100
+    );
+    /*!
+        ensures
+            - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
+              where rnd is a default initialized dlib::rand object.
+            - That is, this is the sample_pair version of chinese_whispers() for
+              callers who do not want to supply their own random number generator.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_H__
+
--- a/dlib/clustering/modularity_clustering.h
+++ b/dlib/clustering/modularity_clustering.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_MODULARITY_ClUSTERING__H__
+#define DLIB_MODULARITY_ClUSTERING__H__
+
+#include "modularity_clustering_abstract.h"
+#include "../sparse_vector.h"
+#include "../manifold_regularization/graph_creation.h"
+#include "../matrix.h"
+#include "../rand.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        inline double newman_cluster_split (
+            dlib::rand& rnd,
+            const std::vector<ordered_sample_pair>& edges,
+            const matrix<double,0,1>& node_degrees, // k from the Newman paper
+            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
+            const double& edge_sum,                 // m from the Newman paper
+            matrix<double,0,1>& labels,
+            const double eps,
+            const unsigned long max_iterations
+        )
+        /*!
+            requires
+                - node_degrees.size() == max_index_plus_one(edges)
+                - Bdiag.size() == max_index_plus_one(edges)
+                - edges must be sorted according to order_by_index()
+            ensures
+                - This routine splits a graph into two subgraphs using the Newman 
+                  clustering method.  
+                - The split is found by running the power iteration on the matrix
+                  B = A - k*trans(k)/(2*m), where A is the edge weight matrix given by
+                  edges.  Each run of the power iteration stops after max_iterations
+                  steps or once the largest element-wise change of the unit vector is
+                  no more than eps*sqrt(1.0/node_degrees.size()), whichever comes first.
+                - The signs of the resulting vector give the initial labels, which are
+                  then refined by flipping individual labels for as long as doing so
+                  increases the modularity.
+                - returns the modularity obtained when the graph is split according
+                  to the contents of #labels. 
+                - #labels.size() == node_degrees.size()
+                - for all valid i: #labels(i) == -1 or +1
+                - if (this function returns 0) then
+                    - all the labels are equal, i.e. the graph is not split.
+            note
+                - Declared inline because it is defined in a header and would otherwise
+                  cause multiple definition errors when included by several translation
+                  units.
+        !*/
+        {
+            // Scale epsilon so that it is relative to the expected value of an element of a
+            // unit vector of length node_degrees.size().
+            const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());
+
+            // Make a random unit vector and put in labels.
+            labels.set_size(node_degrees.size());
+            for (long i = 0; i < labels.size(); ++i)
+                labels(i) = rnd.get_random_gaussian();
+            labels /= length(labels);
+
+            matrix<double,0,1> Bv, Bv_unit;
+
+            // Do the power iteration for a while.
+            double eig = -1;
+            double offset = 0;
+            while (eig < 0)
+            {
+
+                // any number larger than power_iter_eps
+                double iteration_change = power_iter_eps*2+1; 
+                for (unsigned long i = 0; i < max_iterations && iteration_change > power_iter_eps; ++i) 
+                {
+                    // Bv = B*labels, computed as A*labels - k*(trans(k)*labels)/(2*m) so
+                    // that the dense matrix B never has to be formed.
+                    sparse_matrix_vector_multiply(edges, labels, Bv);
+                    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+                    // When a previous run produced a negative eigenvalue estimate we
+                    // iterate with (B - offset*I) instead of B.
+                    if (offset != 0)
+                    {
+                        Bv -= offset*labels;
+                    }
+
+
+                    const double len = length(Bv);
+                    if (len != 0)
+                    {
+                        Bv_unit = Bv/len;
+                        iteration_change = max(abs(labels-Bv_unit));
+                        labels.swap(Bv_unit);
+                    }
+                    else
+                    {
+                        // Had a bad time, pick another random vector and try it with the
+                        // power iteration.
+                        for (long j = 0; j < labels.size(); ++j)
+                            labels(j) = rnd.get_random_gaussian();
+                    }
+                }
+
+                eig = dot(Bv,labels);
+                // we will repeat this loop if the largest eigenvalue is negative
+                offset = eig;
+            }
+
+
+            // Turn the vector into hard +1/-1 labels based on the sign of each element.
+            for (long i = 0; i < labels.size(); ++i)
+            {
+                if (labels(i) > 0)
+                    labels(i) = 1;
+                else
+                    labels(i) = -1;
+            }
+
+
+            // compute B*labels, store result in Bv.
+            sparse_matrix_vector_multiply(edges, labels, Bv);
+            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;
+
+            // Do some label refinement.  In this step we swap labels if it
+            // improves the modularity score.
+            bool flipped_label = true;
+            while(flipped_label)
+            {
+                flipped_label = false;
+                // idx walks through edges in step with i.  This works because edges is
+                // sorted by index1(), so the edges of node i are contiguous.
+                unsigned long idx = 0;
+                for (long i = 0; i < labels.size(); ++i)
+                {
+                    // val is the amount labels(i) changes by if it is flipped, and
+                    // increase is the resulting change in trans(labels)*B*labels.
+                    const double val = -2*labels(i);
+                    const double increase = 4*Bdiag(i) + 2*val*Bv(i);
+
+                    // if there is an increase in modularity for swapping this label
+                    if (increase > 0)
+                    {
+                        labels(i) *= -1;
+                        // Update Bv so it remains equal to B*labels.  First the part
+                        // that comes from the edges touching node i...
+                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+                        {
+                            const long j = edges[idx].index2();
+                            Bv(j) += val*edges[idx].distance();
+                            ++idx;
+                        }
+
+                        // ...and then the part that comes from the k*trans(k)/(2*m) term.
+                        Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;
+
+                        flipped_label = true;
+                    }
+                    else
+                    {
+                        // Nothing changed, just skip over the edges of node i.
+                        while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
+                        {
+                            ++idx;
+                        }
+                    }
+                }
+            }
+
+
+            const double modularity = dot(Bv, labels)/(4*edge_sum);
+
+            return modularity;
+        }
+
+    // -------------------------------------------------------------------------------------
+
+        inline unsigned long newman_cluster_helper (
+            dlib::rand& rnd,
+            const std::vector<ordered_sample_pair>& edges,
+            const matrix<double,0,1>& node_degrees, // k from the Newman paper
+            const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
+            const double& edge_sum,                 // m from the Newman paper
+            std::vector<unsigned long>& labels,
+            double modularity_threshold,
+            const double eps,
+            const unsigned long max_iterations
+        )
+        /*!
+            requires
+                - node_degrees.size() == max_index_plus_one(edges)
+                - Bdiag.size() == max_index_plus_one(edges)
+                - edges must be sorted according to order_by_index()
+            ensures
+                - Splits the given graph in two using newman_cluster_split().  If the
+                  modularity reported for the split is greater than modularity_threshold
+                  and both sides of the split are non-empty then each side is clustered
+                  recursively by this function.  Otherwise, all nodes are put into a
+                  single cluster.
+                - eps and max_iterations are passed on to newman_cluster_split().
+                - #labels.size() == node_degrees.size()
+                - for all valid i: 0 <= #labels[i] < the returned number of clusters
+                - returns the number of clusters the data was split into
+            note
+                - Declared inline because it is defined in a header and would otherwise
+                  cause multiple definition errors when included by several translation
+                  units.
+        !*/
+        {
+            matrix<double,0,1> l;
+            const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);
+
+
+            // We need to collapse the node index values down to contiguous values.  So
+            // we use the following two vectors to contain the mappings from input index
+            // values to their corresponding index values in each split.
+            std::vector<unsigned long> left_idx_map(node_degrees.size());
+            std::vector<unsigned long> right_idx_map(node_degrees.size());
+
+            // figure out how many nodes went into each side of the split.
+            unsigned long num_left_split = 0;
+            unsigned long num_right_split = 0;
+            for (long i = 0; i < l.size(); ++i)
+            {
+                if (l(i) > 0)
+                {
+                    left_idx_map[i] = num_left_split;
+                    ++num_left_split;
+                }
+                else
+                {
+                    right_idx_map[i] = num_right_split;
+                    ++num_right_split;
+                }
+            }
+
+            // do a recursive split if it will improve the modularity.
+            if (modularity > modularity_threshold && num_left_split > 0 && num_right_split > 0)
+            {
+
+                // split the node_degrees and Bdiag matrices into left and right split parts
+                matrix<double,0,1> left_node_degrees(num_left_split);
+                matrix<double,0,1> right_node_degrees(num_right_split);
+                matrix<double,0,1> left_Bdiag(num_left_split);
+                matrix<double,0,1> right_Bdiag(num_right_split);
+                for (long i = 0; i < l.size(); ++i)
+                {
+                    if (l(i) > 0)
+                    {
+                        left_node_degrees(left_idx_map[i]) = node_degrees(i);
+                        left_Bdiag(left_idx_map[i]) = Bdiag(i);
+                    }
+                    else
+                    {
+                        right_node_degrees(right_idx_map[i]) = node_degrees(i);
+                        right_Bdiag(right_idx_map[i]) = Bdiag(i);
+                    }
+                }
+
+
+                // put the edges from one side of the split into split_edges.  Edges that
+                // cross the split are dropped.  Since the index maps are increasing, the
+                // re-indexed edges stay in the same relative order as in edges.
+                std::vector<ordered_sample_pair> split_edges;
+                modularity_threshold = 0;
+                for (unsigned long k = 0; k < edges.size(); ++k)
+                {
+                    const unsigned long i = edges[k].index1();
+                    const unsigned long j = edges[k].index2();
+                    const double d = edges[k].distance();
+                    if (l(i) > 0 && l(j) > 0)
+                    {
+                        split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
+                        modularity_threshold += d;
+                    }
+                }
+                // The threshold for the recursive call is 
+                //   (sum of edge weights inside this part - (sum of its node degrees)^2/(2*m))/(4*m) 
+                // which is the value newman_cluster_split() yields when every label is
+                // the same.  So a further split is accepted only if it beats leaving
+                // this part as one cluster.
+                modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
+                modularity_threshold /= 4*edge_sum;
+
+                unsigned long num_left_clusters;
+                std::vector<unsigned long> left_labels;
+                num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
+                                                          edge_sum,left_labels,modularity_threshold,
+                                                          eps, max_iterations);
+
+                // now load the other side into split_edges and cluster it as well
+                split_edges.clear();
+                modularity_threshold = 0;
+                for (unsigned long k = 0; k < edges.size(); ++k)
+                {
+                    const unsigned long i = edges[k].index1();
+                    const unsigned long j = edges[k].index2();
+                    const double d = edges[k].distance();
+                    if (l(i) < 0 && l(j) < 0)
+                    {
+                        split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
+                        modularity_threshold += d;
+                    }
+                }
+                modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
+                modularity_threshold /= 4*edge_sum;
+
+                unsigned long num_right_clusters;
+                std::vector<unsigned long> right_labels;
+                num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
+                                                           edge_sum,right_labels,modularity_threshold,
+                                                           eps, max_iterations);
+
+                // Now merge the labels from the two splits.  The right side's cluster IDs
+                // are shifted up by num_left_clusters so the two ID ranges don't overlap.
+                labels.resize(node_degrees.size());
+                for (unsigned long i = 0; i < labels.size(); ++i)
+                {
+                    // if this node was in the left split
+                    if (l(i) > 0)
+                    {
+                        labels[i] = left_labels[left_idx_map[i]];
+                    }
+                    else // if this node was in the right split
+                    {
+                        labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
+                    }
+                }
+
+
+                return num_left_clusters + num_right_clusters;
+            }
+            else
+            {
+                // The split isn't worth taking, so everything is one cluster.
+                labels.assign(node_degrees.size(),0);
+                return 1;
+            }
+
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long newman_cluster (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    )
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - Clusters the graph described by edges by recursively splitting it in two
+              with the Newman modularity method.  eps and max_iterations control the
+              power iteration used to find each split.
+            - #labels[i] == the cluster ID of node i.  IDs are contiguous and start at 0.
+            - returns the number of clusters found.
+            - if (edges.size() == 0) then
+                - #labels.size() == 0 and this function returns 0.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(is_ordered_by_index(edges),
+                    "\t unsigned long newman_cluster()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+        // An empty edge list describes a graph without any nodes, so there are no
+        // clusters to report.  Returning here also avoids dividing by edge_sum == 0
+        // in the computations below.
+        if (edges.empty())
+        {
+            labels.clear();
+            return 0;
+        }
+
+        const unsigned long num_nodes = max_index_plus_one(edges);
+
+        // compute the node_degrees vector, edge_sum value, and diag(B).
+        matrix<double,0,1> node_degrees(num_nodes);
+        matrix<double,0,1> Bdiag(num_nodes);
+        Bdiag = 0;
+        double edge_sum = 0;
+        node_degrees = 0;
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            node_degrees(edges[i].index1()) += edges[i].distance();
+            edge_sum += edges[i].distance();
+            // self loops are the only edges that contribute to the diagonal of A
+            if (edges[i].index1() == edges[i].index2())
+                Bdiag(edges[i].index1()) += edges[i].distance();
+        }
+        // The sum above runs over every ordered pair in edges, so it is halved to
+        // get m.
+        edge_sum /= 2;
+        Bdiag -= squared(node_degrees)/(2*edge_sum);
+
+
+        dlib::rand rnd;
+        // The initial threshold of 0 means the first split is only kept if it gives a
+        // positive modularity.
+        return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline unsigned long newman_cluster (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    )
+    /*!
+        ensures
+            - Convenience overload for graphs stored as unordered sample_pair objects.
+              The edges are converted to ordered_sample_pair objects, sorted with
+              order_by_index(), and then given to the ordered_sample_pair version of
+              newman_cluster() along with labels, eps, and max_iterations.
+            - returns whatever that call returns.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        std::vector<ordered_sample_pair> oedges;
+        convert_unordered_to_ordered(edges, oedges);
+        // The ordered overload requires is_ordered_by_index(oedges).
+        std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);
+
+        return newman_cluster(oedges, labels, eps, max_iterations);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        inline std::vector<unsigned long> remap_labels (
+            const std::vector<unsigned long>& labels,
+            unsigned long& num_labels
+        )
+        /*!
+            ensures
+                - This function takes labels and produces a mapping which maps elements of
+                  labels into the most compact range in [0, max] as possible.  In particular,
+                  there won't be any unused integers in the mapped range.
+                - New values are handed out in the order in which distinct labels first
+                  appear, so the first element of labels always maps to 0.
+                - #num_labels == the number of distinct values in labels.
+                - returns a vector V such that:
+                    - V.size() == labels.size()
+                    - max(vector_to_matrix(V))+1 == num_labels.
+                    - for all valid i,j:
+                        - if (labels[i] == labels[j]) then
+                            - V[i] == V[j]
+                        - else
+                            - V[i] != V[j]
+            note
+                - Declared inline because it is defined in a header and would otherwise
+                  cause multiple definition errors when included by several translation
+                  units.
+        !*/
+        {
+            // temp maps each original label to its compact replacement.
+            std::map<unsigned long, unsigned long> temp;
+            for (unsigned long i = 0; i < labels.size(); ++i)
+            {
+                if (temp.count(labels[i]) == 0)
+                {
+                    // Read the size before inserting so the new label gets the next
+                    // unused value.
+                    const unsigned long next = temp.size();
+                    temp[labels[i]] = next;
+                }
+            }
+
+            num_labels = temp.size();
+
+            std::vector<unsigned long> result(labels.size());
+            for (unsigned long i = 0; i < labels.size(); ++i)
+            {
+                result[i] = temp[labels[i]];
+            }
+            return result;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double modularity (
+        const std::vector<sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    )
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as an undirected weighted graph and returns the
+              modularity of the partition in which nodes i and j belong to the same
+              group if and only if labels[i] == labels[j].
+            - returns 0 if the total edge weight is 0.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        const unsigned long num_nodes = max_index_plus_one(edges);
+        // make sure requires clause is not broken
+        DLIB_ASSERT(labels.size() == num_nodes,
+                    "\t double modularity()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+        // Map the labels into [0,num_labels) so they can be used as array indices.
+        unsigned long num_labels;
+        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+        std::vector<double> cluster_sums(num_labels,0);  // sum of node degrees in each cluster
+        std::vector<double> k(num_nodes,0);               // node degrees
+
+        double Q = 0;
+        double m = 0;  // total edge weight
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            const unsigned long n1 = edges[i].index1();
+            const unsigned long n2 = edges[i].index2();
+            // An undirected edge adds to the degree of both of its end points.  A self
+            // loop is only counted once.
+            k[n1] += edges[i].distance();
+            if (n1 != n2)
+                k[n2] += edges[i].distance();
+
+            // Self loops count for half so that sum(k) == 2*m always holds.
+            if (n1 != n2)
+                m += edges[i].distance();
+            else
+                m += edges[i].distance()/2;
+
+            // Q accumulates the weight matrix entries for pairs of nodes in the same
+            // cluster.  An edge between different nodes appears twice in that matrix,
+            // at (n1,n2) and (n2,n1).
+            if (labels_[n1] == labels_[n2])
+            {
+                if (n1 != n2)
+                    Q += 2*edges[i].distance();
+                else
+                    Q += edges[i].distance();
+            }
+        }
+
+        // No edge weight at all, avoid dividing by zero.
+        if (m == 0)
+            return 0;
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            cluster_sums[labels_[i]] += k[i];
+        }
+
+        // Subtract k[i]*k[j]/(2*m) for all pairs i,j in the same cluster.  The sum
+        // over j is what cluster_sums holds.
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            Q -= k[i]*cluster_sums[labels_[i]]/(2*m);
+        }
+
+        return 1.0/(2*m)*Q;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double modularity (
+        const std::vector<ordered_sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    )
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as a directed weighted graph and returns the modularity
+              of the partition in which nodes i and j belong to the same group if and
+              only if labels[i] == labels[j].
+            - returns 0 if the sum of all the edge weights is 0.
+        note
+            - Declared inline because it is defined in a header and would otherwise
+              cause multiple definition errors when included by several translation
+              units.
+    !*/
+    {
+        const unsigned long num_nodes = max_index_plus_one(edges);
+        // make sure requires clause is not broken
+        DLIB_ASSERT(labels.size() == num_nodes,
+                    "\t double modularity()"
+                    << "\n\t Invalid inputs were given to this function"
+        );
+
+
+        // Map the labels into [0,num_labels) so they can be used as array indices.
+        unsigned long num_labels;
+        const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels);
+
+        std::vector<double> cluster_sums(num_labels,0);  // sum of node degrees in each cluster
+        std::vector<double> k(num_nodes,0);               // k[n] == weight of edges leaving node n
+
+        double Q = 0;
+        // Here m is the sum over every ordered pair in edges.  This is why the
+        // formulas below divide by m rather than by 2*m as in the sample_pair version
+        // of this function.
+        double m = 0;
+        for (unsigned long i = 0; i < edges.size(); ++i)
+        {
+            const unsigned long n1 = edges[i].index1();
+            const unsigned long n2 = edges[i].index2();
+            k[n1] += edges[i].distance();
+            m += edges[i].distance();
+            if (labels_[n1] == labels_[n2])
+            {
+                Q += edges[i].distance();
+            }
+        }
+
+        // No edge weight at all, avoid dividing by zero.
+        if (m == 0)
+            return 0;
+
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            cluster_sums[labels_[i]] += k[i];
+        }
+
+        // Subtract k[i]*k[j]/m for all pairs i,j in the same cluster.  The sum over j
+        // is what cluster_sums holds.
+        for (unsigned long i = 0; i < labels_.size(); ++i)
+        {
+            Q -= k[i]*cluster_sums[labels_[i]]/m;
+        }
+
+        return 1.0/m*Q;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MODULARITY_ClUSTERING__H__
+
--- a/dlib/clustering/modularity_clustering_abstract.h
+++ b/dlib/clustering/modularity_clustering_abstract.h
+// Copyright (C) 2012  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+
+#include <vector>
+#include "../manifold_regularization/ordered_sample_pair_abstract.h"
+#include "../manifold_regularization/sample_pair_abstract.h"
+
+namespace dlib
+{
+
+// -----------------------------------------------------------------------------------------
+
+    double modularity (
+        const std::vector<sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    );
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as an undirected graph.  That is, it contains the edges on
+              the said graph and the sample_pair::distance() values define the edge weights
+              (larger values indicating a stronger edge connection between the nodes).
+            - This function returns the modularity value obtained when the given input
+              graph is broken into subgraphs according to the contents of labels.  In
+              particular, we say that two nodes with indices i and j are in the same
+              subgraph or community if and only if labels[i] == labels[j].
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance values.
+            - If the total edge weight in the graph is 0 then this function returns 0.
+            - See the paper Modularity and community structure in networks by M. E. J. Newman
+              for a detailed definition.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    double modularity (
+        const std::vector<ordered_sample_pair>& edges,
+        const std::vector<unsigned long>& labels
+    );
+    /*!
+        requires
+            - labels.size() == max_index_plus_one(edges)
+        ensures
+            - Interprets edges as a directed graph.  That is, it contains the edges on the
+              said graph and the ordered_sample_pair::distance() values define the edge
+              weights (larger values indicating a stronger edge connection between the
+              nodes).  Note that, generally, modularity is only really defined for
+              undirected graphs.  Therefore, the "directed graph" given to this function
+              should have symmetric edges between all nodes.  The reason this function is
+              provided at all is because sometimes a vector of ordered_sample_pair objects
+              is a useful representation of an undirected graph.
+            - This function returns the modularity value obtained when the given input
+              graph is broken into subgraphs according to the contents of labels.  In
+              particular, we say that two nodes with indices i and j are in the same
+              subgraph or community if and only if labels[i] == labels[j].
+            - Duplicate edges are interpreted as if there had been just one edge with a
+              distance value equal to the sum of all the duplicate edges' distance values.
+            - If the sum of all the edge weights is 0 then this function returns 0.
+            - See the paper Modularity and community structure in networks by M. E. J. Newman
+              for a detailed definition.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long newman_cluster (
+        const std::vector<ordered_sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    );
+    /*!
+        requires
+            - is_ordered_by_index(edges) == true
+        ensures
+            - This function performs the clustering algorithm described in the paper
+              Modularity and community structure in networks by M. E. J. Newman.  
+            - This function interprets edges as a graph and attempts to find the labeling
+              that maximizes modularity(edges, #labels).   
+            - The implementation treats the edge weights as a symmetric matrix and takes
+              half the sum of all the distance() values as the total edge weight.
+              Therefore, edges should contain both (i,j) and (j,i) for each pair of
+              connected nodes, as it would when representing an undirected graph.
+            - returns the number of clusters found.
+            - #labels.size() == max_index_plus_one(edges)
+            - for all valid i:
+                - #labels[i] == the cluster ID of the node with index i in the graph.  
+                - 0 <= #labels[i] < the number of clusters found
+                  (i.e. cluster IDs are assigned contiguously and start at 0) 
+            - The main computation of the algorithm is involved in finding an eigenvector
+              of a certain matrix.  To do this, we use the power iteration.  In particular,
+              each time we try to find an eigenvector we will let the power iteration loop
+              at most max_iterations times or until it reaches an accuracy of eps,
+              whichever comes first.
+            - The power iteration is started from random vectors which are generated by
+              a default initialized dlib::rand object.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    unsigned long newman_cluster (
+        const std::vector<sample_pair>& edges,
+        std::vector<unsigned long>& labels,
+        const double eps = 1e-4,
+        const unsigned long max_iterations = 2000
+    );
+    /*!
+        ensures
+            - This function is identical to the above newman_cluster() routine except that
+              it operates on a vector of sample_pair objects instead of
+              ordered_sample_pairs.  Therefore, this is simply a convenience routine.  In
+              particular, it is implemented by transforming the given edges into
+              ordered_sample_pairs, sorting them by index, and then calling the
+              newman_cluster() routine defined above.  
+            - Since the sorting is done internally, edges does not need to be in any
+              particular order.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
+