Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
D
dlib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
钟尚武
dlib
Commits
f2371195
Commit
f2371195
authored
Nov 05, 2012
by
Davis King
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added newman_cluster(), chinese_whispers(), and modularity() routines.
parent
d598fcf2
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
866 additions
and
0 deletions
+866
-0
clustering.h
dlib/clustering.h
+11
-0
chinese_whispers.h
dlib/clustering/chinese_whispers.h
+132
-0
chinese_whispers_abstract.h
dlib/clustering/chinese_whispers_abstract.h
+96
-0
modularity_clustering.h
dlib/clustering/modularity_clustering.h
+511
-0
modularity_clustering_abstract.h
dlib/clustering/modularity_clustering_abstract.h
+116
-0
No files found.
dlib/clustering.h
0 → 100644
View file @
f2371195
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CLuSTERING_
#define DLIB_CLuSTERING_
#include "clustering/modularity_clustering.h"
#include "clustering/chinese_whispers.h"
#include "svm/kkmeans.h"
#endif // DLIB_CLuSTERING_
dlib/clustering/chinese_whispers.h
0 → 100644
View file @
f2371195
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CHINESE_WHISPErS_H__
#define DLIB_CHINESE_WHISPErS_H__
#include "chinese_whispers_abstract.h"
#include <vector>
#include "../rand.h"
#include "../manifold_regularization/graph_creation.h"
namespace
dlib
{
// ----------------------------------------------------------------------------------------
unsigned
long
chinese_whispers
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
unsigned
long
num_iterations
,
dlib
::
rand
&
rnd
)
{
// make sure requires clause is not broken
DLIB_ASSERT
(
is_ordered_by_index
(
edges
),
"
\t
unsigned long chinese_whispers()"
<<
"
\n\t
Invalid inputs were given to this function"
);
std
::
vector
<
std
::
pair
<
unsigned
long
,
unsigned
long
>
>
neighbors
;
find_neighbor_ranges
(
edges
,
neighbors
);
// Initialize the labels, each node gets a different label.
labels
.
resize
(
neighbors
.
size
());
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
labels
[
i
]
=
i
;
for
(
unsigned
long
iter
=
0
;
iter
<
neighbors
.
size
()
*
num_iterations
;
++
iter
)
{
// Pick a random node.
const
unsigned
long
idx
=
rnd
.
get_random_64bit_number
()
%
neighbors
.
size
();
// Count how many times each label happens amongst our neighbors.
std
::
map
<
unsigned
long
,
double
>
labels_to_counts
;
const
unsigned
long
end
=
neighbors
[
idx
].
second
;
for
(
unsigned
long
i
=
neighbors
[
idx
].
first
;
i
!=
end
;
++
i
)
{
labels_to_counts
[
labels
[
edges
[
i
].
index2
()]]
+=
edges
[
i
].
distance
();
}
// find the most common label
std
::
map
<
unsigned
long
,
double
>::
iterator
i
;
double
best_score
=
-
std
::
numeric_limits
<
double
>::
infinity
();
unsigned
long
best_label
=
labels
[
idx
];
for
(
i
=
labels_to_counts
.
begin
();
i
!=
labels_to_counts
.
end
();
++
i
)
{
if
(
i
->
second
>
best_score
)
{
best_score
=
i
->
second
;
best_label
=
i
->
first
;
}
}
labels
[
idx
]
=
best_label
;
}
// Remap the labels into a contiguous range. First we find the
// mapping.
std
::
map
<
unsigned
long
,
unsigned
long
>
label_remap
;
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
const
unsigned
long
next_id
=
label_remap
.
size
();
if
(
label_remap
.
count
(
labels
[
i
])
==
0
)
label_remap
[
labels
[
i
]]
=
next_id
;
}
// now apply the mapping to all the labels.
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
labels
[
i
]
=
label_remap
[
labels
[
i
]];
}
return
label_remap
.
size
();
}
// ----------------------------------------------------------------------------------------
inline unsigned long chinese_whispers (
    const std::vector<sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations,
    dlib::rand& rnd
)
/*!
    ensures
        - Convenience overload for undirected edges.  The sample_pair edges are
          converted with convert_unordered_to_ordered(), sorted with
          order_by_index, and then forwarded to the ordered_sample_pair version
          of chinese_whispers().
        - returns whatever that call returns.
    notes
        - Declared inline because it is defined in a header; without it,
          including this header from two translation units is a multiple
          definition error.
!*/
{
    std::vector<ordered_sample_pair> oedges;
    convert_unordered_to_ordered(edges, oedges);
    // The ordered overload requires is_ordered_by_index(oedges).
    std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);

    return chinese_whispers(oedges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
inline unsigned long chinese_whispers (
    const std::vector<sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations = 100
)
/*!
    ensures
        - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
          where rnd is a default constructed dlib::rand object.
    notes
        - Declared inline because it is defined in a header.
!*/
{
    dlib::rand rnd;
    return chinese_whispers(edges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
inline unsigned long chinese_whispers (
    const std::vector<ordered_sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const unsigned long num_iterations = 100
)
/*!
    requires
        - is_ordered_by_index(edges) == true
          (checked by the overload this function forwards to)
    ensures
        - performs: return chinese_whispers(edges, labels, num_iterations, rnd)
          where rnd is a default constructed dlib::rand object.
    notes
        - Declared inline because it is defined in a header.
!*/
{
    dlib::rand rnd;
    return chinese_whispers(edges, labels, num_iterations, rnd);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CHINESE_WHISPErS_H__
dlib/clustering/chinese_whispers_abstract.h
0 → 100644
View file @
f2371195
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
#ifdef DLIB_CHINESE_WHISPErS_ABSTRACT_H__
#include <vector>
#include "../rand.h"
#include "../manifold_regularization/ordered_sample_pair_abstract.h"
#include "../manifold_regularization/sample_pair_abstract.h"
namespace
dlib
{
// ----------------------------------------------------------------------------------------
unsigned
long
chinese_whispers
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
unsigned
long
num_iterations
,
dlib
::
rand
&
rnd
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- This function implements the graph clustering algorithm described in the
paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
Application to Natural Language Processing Problems by Chris Biemann.
- Interprets edges as a directed graph. That is, it contains the edges on the
said graph and the ordered_sample_pair::distance() values define the edge
weights (larger values indicating a stronger edge connection between the
nodes).
- returns the number of clusters found.
- #labels.size() == max_index_plus_one(edges)
- for all valid i:
- #labels[i] == the cluster ID of the node with index i in the graph.
- 0 <= #labels[i] < the number of clusters found
(i.e. cluster IDs are assigned contiguously and start at 0)
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
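- The algorithm performs the equivalent of num_iterations passes over the graph
before terminating.  That is, it relabels a randomly selected node exactly
#labels.size()*num_iterations times.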
!*/
// ----------------------------------------------------------------------------------------
unsigned
long
chinese_whispers
(
const
std
::
vector
<
sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
unsigned
long
num_iterations
,
dlib
::
rand
&
rnd
);
/*!
ensures
- This function is identical to the above chinese_whispers() routine except
that it operates on a vector of sample_pair objects instead of
ordered_sample_pairs. Therefore, this is simply a convenience routine. In
particular, it is implemented by transforming the given edges into
ordered_sample_pairs and then calling the chinese_whispers() routine defined
above.
!*/
// ----------------------------------------------------------------------------------------
unsigned
long
chinese_whispers
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
unsigned
long
num_iterations
=
100
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- performs: return chinese_whispers(edges, labels, num_iterations, rnd)
where rnd is a default initialized dlib::rand object.
!*/
// ----------------------------------------------------------------------------------------
unsigned
long
chinese_whispers
(
const
std
::
vector
<
sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
unsigned
long
num_iterations
=
100
);
/*!
ensures
- performs: return chinese_whispers(edges, labels, num_iterations, rnd)
where rnd is a default initialized dlib::rand object.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CHINESE_WHISPErS_ABSTRACT_H__
dlib/clustering/modularity_clustering.h
0 → 100644
View file @
f2371195
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_MODULARITY_ClUSTERING__H__
#define DLIB_MODULARITY_ClUSTERING__H__
#include "modularity_clustering_abstract.h"
#include "../sparse_vector.h"
#include "../manifold_regularization/graph_creation.h"
#include "../matrix.h"
#include "../rand.h"
namespace
dlib
{
// -----------------------------------------------------------------------------------------
namespace
impl
{
inline double newman_cluster_split (
    dlib::rand& rnd,
    const std::vector<ordered_sample_pair>& edges,
    const matrix<double,0,1>& node_degrees, // k from the Newman paper
    const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
    const double& edge_sum,                 // m from the Newman paper
    matrix<double,0,1>& labels,
    const double eps,
    const unsigned long max_iterations
)
/*!
    requires
        - node_degrees.size() == max_index_plus_one(edges)
        - Bdiag.size() == max_index_plus_one(edges)
        - edges must be sorted according to order_by_index()
    ensures
        - This routine splits a graph into two subgraphs using the Newman
          clustering method.
        - returns the modularity obtained when the graph is split according
          to the contents of #labels.
        - #labels.size() == node_degrees.size()
        - for all valid i: #labels(i) == -1 or +1
        - if (this function returns 0) then
            - all the labels are equal, i.e. the graph is not split.
    notes
        - Declared inline because it is defined in a header; a non-inline
          definition here is a multiple definition error as soon as two
          translation units include this file.
!*/
{
    // Scale epsilon so that it is relative to the expected value of an element of a
    // unit vector of length node_degrees.size().
    const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());

    // Make a random unit vector and put in labels.
    labels.set_size(node_degrees.size());
    for (long i = 0; i < labels.size(); ++i)
        labels(i) = rnd.get_random_gaussian();
    labels /= length(labels);

    matrix<double,0,1> Bv, Bv_unit;

    // Do the power iteration for a while.
    double eig = -1;
    double offset = 0;
    while (eig < 0)
    {
        // any number larger than power_iter_eps
        double iteration_change = power_iter_eps*2+1;
        for (unsigned long iter = 0; iter < max_iterations && iteration_change > power_iter_eps; ++iter)
        {
            // Bv = B*labels, where the rank one part of B is applied without
            // ever forming the dense matrix.
            sparse_matrix_vector_multiply(edges, labels, Bv);
            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

            // On a repeated pass the matrix is shifted by the previously
            // found (negative) eigenvalue estimate.
            if (offset != 0)
            {
                Bv -= offset*labels;
            }

            const double len = length(Bv);
            if (len != 0)
            {
                Bv_unit = Bv/len;
                iteration_change = max(abs(labels-Bv_unit));
                labels.swap(Bv_unit);
            }
            else
            {
                // Had a bad time, pick another random vector and try it with the
                // power iteration.
                for (long j = 0; j < labels.size(); ++j)
                    labels(j) = rnd.get_random_gaussian();
            }
        }

        eig = dot(Bv,labels);
        // we will repeat this loop if the largest eigenvalue is negative
        offset = eig;
    }

    // Turn the real valued vector into a hard -1/+1 assignment based on sign.
    for (long i = 0; i < labels.size(); ++i)
    {
        if (labels(i) > 0)
            labels(i) = 1;
        else
            labels(i) = -1;
    }

    // compute B*labels, store result in Bv.
    sparse_matrix_vector_multiply(edges, labels, Bv);
    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

    // Do some label refinement.  In this step we swap labels if it
    // improves the modularity score.
    bool flipped_label = true;
    while (flipped_label)
    {
        flipped_label = false;
        // idx walks through edges in step with i.  This relies on edges being
        // sorted by index1().
        unsigned long idx = 0;
        for (long i = 0; i < labels.size(); ++i)
        {
            const double val = -2*labels(i);
            const double increase = 4*Bdiag(i) + 2*val*Bv(i);

            // if there is an increase in modularity for swapping this label
            if (increase > 0)
            {
                labels(i) *= -1;
                // Incrementally update Bv to account for the flipped label
                // rather than recomputing it from scratch.
                while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                {
                    const long j = edges[idx].index2();
                    Bv(j) += val*edges[idx].distance();
                    ++idx;
                }
                Bv -= (val*node_degrees(i)/(2*edge_sum))*node_degrees;

                flipped_label = true;
            }
            else
            {
                // Skip over the edges of node i so idx stays in step with i.
                while (idx < edges.size() && edges[idx].index1() == (unsigned long)i)
                {
                    ++idx;
                }
            }
        }
    }

    const double modularity = dot(Bv, labels)/(4*edge_sum);
    return modularity;
}
// -------------------------------------------------------------------------------------
inline unsigned long newman_cluster_helper (
    dlib::rand& rnd,
    const std::vector<ordered_sample_pair>& edges,
    const matrix<double,0,1>& node_degrees, // k from the Newman paper
    const matrix<double,0,1>& Bdiag,        // diag(B) from the Newman paper
    const double& edge_sum,                 // m from the Newman paper
    std::vector<unsigned long>& labels,
    double modularity_threshold,
    const double eps,
    const unsigned long max_iterations
)
/*!
    ensures
        - Splits the given graph in two with newman_cluster_split() and, when
          the split's modularity exceeds modularity_threshold and both sides
          are non-empty, recursively clusters each side.
        - #labels.size() == node_degrees.size()
        - #labels[i] == the cluster ID assigned to node i.  IDs from the left
          side come first and IDs from the right side are offset by the number
          of left clusters.
        - returns the number of clusters the data was split into
    notes
        - Declared inline because it is defined in a header.
!*/
{
    matrix<double,0,1> l;
    const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,l,eps,max_iterations);

    // We need to collapse the node index values down to contiguous values.  So
    // we use the following two vectors to contain the mappings from input index
    // values to their corresponding index values in each split.
    std::vector<unsigned long> left_idx_map(node_degrees.size());
    std::vector<unsigned long> right_idx_map(node_degrees.size());

    // figure out how many nodes went into each side of the split.
    unsigned long num_left_split = 0;
    unsigned long num_right_split = 0;
    for (long i = 0; i < l.size(); ++i)
    {
        if (l(i) > 0)
        {
            left_idx_map[i] = num_left_split;
            ++num_left_split;
        }
        else
        {
            right_idx_map[i] = num_right_split;
            ++num_right_split;
        }
    }

    // Only do a recursive split if it will improve the modularity.  Otherwise
    // everything stays in a single cluster.
    const bool worth_splitting = modularity > modularity_threshold &&
                                 num_left_split > 0 &&
                                 num_right_split > 0;
    if (!worth_splitting)
    {
        labels.assign(node_degrees.size(),0);
        return 1;
    }

    // split the node_degrees and Bdiag matrices into left and right split parts
    matrix<double,0,1> left_node_degrees(num_left_split);
    matrix<double,0,1> right_node_degrees(num_right_split);
    matrix<double,0,1> left_Bdiag(num_left_split);
    matrix<double,0,1> right_Bdiag(num_right_split);
    for (long i = 0; i < l.size(); ++i)
    {
        if (l(i) > 0)
        {
            left_node_degrees(left_idx_map[i]) = node_degrees(i);
            left_Bdiag(left_idx_map[i]) = Bdiag(i);
        }
        else
        {
            right_node_degrees(right_idx_map[i]) = node_degrees(i);
            right_Bdiag(right_idx_map[i]) = Bdiag(i);
        }
    }

    // put the edges from one side of the split into split_edges.  While doing
    // so, accumulate the threshold the recursive call must beat.
    std::vector<ordered_sample_pair> split_edges;
    modularity_threshold = 0;
    for (unsigned long k = 0; k < edges.size(); ++k)
    {
        const unsigned long i = edges[k].index1();
        const unsigned long j = edges[k].index2();
        const double d = edges[k].distance();
        if (l(i) > 0 && l(j) > 0)
        {
            split_edges.push_back(ordered_sample_pair(left_idx_map[i], left_idx_map[j], d));
            modularity_threshold += d;
        }
    }
    modularity_threshold -= sum(left_node_degrees*sum(left_node_degrees))/(2*edge_sum);
    modularity_threshold /= 4*edge_sum;

    unsigned long num_left_clusters;
    std::vector<unsigned long> left_labels;
    num_left_clusters = newman_cluster_helper(rnd,split_edges,left_node_degrees,left_Bdiag,
                                              edge_sum,left_labels,modularity_threshold,
                                              eps, max_iterations);

    // now load the other side into split_edges and cluster it as well
    split_edges.clear();
    modularity_threshold = 0;
    for (unsigned long k = 0; k < edges.size(); ++k)
    {
        const unsigned long i = edges[k].index1();
        const unsigned long j = edges[k].index2();
        const double d = edges[k].distance();
        if (l(i) < 0 && l(j) < 0)
        {
            split_edges.push_back(ordered_sample_pair(right_idx_map[i], right_idx_map[j], d));
            modularity_threshold += d;
        }
    }
    modularity_threshold -= sum(right_node_degrees*sum(right_node_degrees))/(2*edge_sum);
    modularity_threshold /= 4*edge_sum;

    unsigned long num_right_clusters;
    std::vector<unsigned long> right_labels;
    num_right_clusters = newman_cluster_helper(rnd,split_edges,right_node_degrees,right_Bdiag,
                                               edge_sum,right_labels,modularity_threshold,
                                               eps, max_iterations);

    // Now merge the labels from the two splits.
    labels.resize(node_degrees.size());
    for (unsigned long i = 0; i < labels.size(); ++i)
    {
        // if this node was in the left split
        if (l(i) > 0)
        {
            labels[i] = left_labels[left_idx_map[i]];
        }
        else // if this node was in the right split
        {
            labels[i] = right_labels[right_idx_map[i]] + num_left_clusters;
        }
    }

    return num_left_clusters + num_right_clusters;
}
}
// ----------------------------------------------------------------------------------------
unsigned
long
newman_cluster
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
double
eps
=
1e-4
,
const
unsigned
long
max_iterations
=
2000
)
{
// make sure requires clause is not broken
DLIB_ASSERT
(
is_ordered_by_index
(
edges
),
"
\t
unsigned long newman_cluster()"
<<
"
\n\t
Invalid inputs were given to this function"
);
const
unsigned
long
num_nodes
=
max_index_plus_one
(
edges
);
// compute the node_degrees vector, edge_sum value, and diag(B).
matrix
<
double
,
0
,
1
>
node_degrees
(
num_nodes
);
matrix
<
double
,
0
,
1
>
Bdiag
(
num_nodes
);
Bdiag
=
0
;
double
edge_sum
=
0
;
node_degrees
=
0
;
for
(
unsigned
long
i
=
0
;
i
<
edges
.
size
();
++
i
)
{
node_degrees
(
edges
[
i
].
index1
())
+=
edges
[
i
].
distance
();
edge_sum
+=
edges
[
i
].
distance
();
if
(
edges
[
i
].
index1
()
==
edges
[
i
].
index2
())
Bdiag
(
edges
[
i
].
index1
())
+=
edges
[
i
].
distance
();
}
edge_sum
/=
2
;
Bdiag
-=
squared
(
node_degrees
)
/
(
2
*
edge_sum
);
dlib
::
rand
rnd
;
return
impl
::
newman_cluster_helper
(
rnd
,
edges
,
node_degrees
,
Bdiag
,
edge_sum
,
labels
,
0
,
eps
,
max_iterations
);
}
// ----------------------------------------------------------------------------------------
inline unsigned long newman_cluster (
    const std::vector<sample_pair>& edges,
    std::vector<unsigned long>& labels,
    const double eps = 1e-4,
    const unsigned long max_iterations = 2000
)
/*!
    ensures
        - Convenience overload for undirected edges.  The sample_pair edges are
          converted with convert_unordered_to_ordered(), sorted with
          order_by_index, and then forwarded to the ordered_sample_pair version
          of newman_cluster().
        - returns whatever that call returns.
    notes
        - Declared inline because it is defined in a header.
!*/
{
    std::vector<ordered_sample_pair> oedges;
    convert_unordered_to_ordered(edges, oedges);
    // The ordered overload requires is_ordered_by_index(oedges).
    std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>);

    return newman_cluster(oedges, labels, eps, max_iterations);
}
// ----------------------------------------------------------------------------------------
namespace
impl
{
inline std::vector<unsigned long> remap_labels (
    const std::vector<unsigned long>& labels,
    unsigned long& num_labels
)
/*!
    ensures
        - This function takes labels and produces a mapping which maps the
          elements of labels into the most compact range possible, i.e. into
          [0, #num_labels).  In particular, there won't be any unused integers
          in the mapped range.
        - New IDs are handed out in order of first appearance in labels.
        - #num_labels == the number of distinct values in labels.
        - returns a vector V such that:
            - V.size() == labels.size()
            - if (labels.size() != 0) then
                - max(vector_to_matrix(V))+1 == num_labels.
            - for all valid i,j:
                - if (labels[i] == labels[j]) then
                    - V[i] == V[j]
                - else
                    - V[i] != V[j]
    notes
        - Declared inline because it is defined in a header.
!*/
{
    std::map<unsigned long, unsigned long> temp;
    std::vector<unsigned long> result(labels.size());
    for (unsigned long i = 0; i < labels.size(); ++i)
    {
        // insert() only adds the label when it is new and always hands back
        // an iterator to the stored (label, id) pair, so a single lookup both
        // creates and reads the mapping.
        const unsigned long next = temp.size();
        result[i] = temp.insert(std::make_pair(labels[i], next)).first->second;
    }

    num_labels = temp.size();
    return result;
}
}
// ----------------------------------------------------------------------------------------
double
modularity
(
const
std
::
vector
<
sample_pair
>&
edges
,
const
std
::
vector
<
unsigned
long
>&
labels
)
{
const
unsigned
long
num_nodes
=
max_index_plus_one
(
edges
);
// make sure requires clause is not broken
DLIB_ASSERT
(
labels
.
size
()
==
num_nodes
,
"
\t
double modularity()"
<<
"
\n\t
Invalid inputs were given to this function"
);
unsigned
long
num_labels
;
const
std
::
vector
<
unsigned
long
>&
labels_
=
dlib
::
impl
::
remap_labels
(
labels
,
num_labels
);
std
::
vector
<
double
>
cluster_sums
(
num_labels
,
0
);
std
::
vector
<
double
>
k
(
num_nodes
,
0
);
double
Q
=
0
;
double
m
=
0
;
for
(
unsigned
long
i
=
0
;
i
<
edges
.
size
();
++
i
)
{
const
unsigned
long
n1
=
edges
[
i
].
index1
();
const
unsigned
long
n2
=
edges
[
i
].
index2
();
k
[
n1
]
+=
edges
[
i
].
distance
();
if
(
n1
!=
n2
)
k
[
n2
]
+=
edges
[
i
].
distance
();
if
(
n1
!=
n2
)
m
+=
edges
[
i
].
distance
();
else
m
+=
edges
[
i
].
distance
()
/
2
;
if
(
labels_
[
n1
]
==
labels_
[
n2
])
{
if
(
n1
!=
n2
)
Q
+=
2
*
edges
[
i
].
distance
();
else
Q
+=
edges
[
i
].
distance
();
}
}
if
(
m
==
0
)
return
0
;
for
(
unsigned
long
i
=
0
;
i
<
labels_
.
size
();
++
i
)
{
cluster_sums
[
labels_
[
i
]]
+=
k
[
i
];
}
for
(
unsigned
long
i
=
0
;
i
<
labels_
.
size
();
++
i
)
{
Q
-=
k
[
i
]
*
cluster_sums
[
labels_
[
i
]]
/
(
2
*
m
);
}
return
1
.
0
/
(
2
*
m
)
*
Q
;
}
// ----------------------------------------------------------------------------------------
double
modularity
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
const
std
::
vector
<
unsigned
long
>&
labels
)
{
const
unsigned
long
num_nodes
=
max_index_plus_one
(
edges
);
// make sure requires clause is not broken
DLIB_ASSERT
(
labels
.
size
()
==
num_nodes
,
"
\t
double modularity()"
<<
"
\n\t
Invalid inputs were given to this function"
);
unsigned
long
num_labels
;
const
std
::
vector
<
unsigned
long
>&
labels_
=
dlib
::
impl
::
remap_labels
(
labels
,
num_labels
);
std
::
vector
<
double
>
cluster_sums
(
num_labels
,
0
);
std
::
vector
<
double
>
k
(
num_nodes
,
0
);
double
Q
=
0
;
double
m
=
0
;
for
(
unsigned
long
i
=
0
;
i
<
edges
.
size
();
++
i
)
{
const
unsigned
long
n1
=
edges
[
i
].
index1
();
const
unsigned
long
n2
=
edges
[
i
].
index2
();
k
[
n1
]
+=
edges
[
i
].
distance
();
m
+=
edges
[
i
].
distance
();
if
(
labels_
[
n1
]
==
labels_
[
n2
])
{
Q
+=
edges
[
i
].
distance
();
}
}
if
(
m
==
0
)
return
0
;
for
(
unsigned
long
i
=
0
;
i
<
labels_
.
size
();
++
i
)
{
cluster_sums
[
labels_
[
i
]]
+=
k
[
i
];
}
for
(
unsigned
long
i
=
0
;
i
<
labels_
.
size
();
++
i
)
{
Q
-=
k
[
i
]
*
cluster_sums
[
labels_
[
i
]]
/
m
;
}
return
1
.
0
/
m
*
Q
;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_MODULARITY_ClUSTERING__H__
dlib/clustering/modularity_clustering_abstract.h
0 → 100644
View file @
f2371195
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
#ifdef DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
#include <vector>
#include "../manifold_regularization/ordered_sample_pair_abstract.h"
#include "../manifold_regularization/sample_pair_abstract.h"
namespace
dlib
{
// -----------------------------------------------------------------------------------------
double
modularity
(
const
std
::
vector
<
sample_pair
>&
edges
,
const
std
::
vector
<
unsigned
long
>&
labels
);
/*!
requires
- labels.size() == max_index_plus_one(edges)
ensures
- Interprets edges as an undirected graph. That is, it contains the edges on
the said graph and the sample_pair::distance() values define the edge weights
(larger values indicating a stronger edge connection between the nodes).
- This function returns the modularity value obtained when the given input
graph is broken into subgraphs according to the contents of labels. In
particular, we say that two nodes with indices i and j are in the same
subgraph or community if and only if labels[i] == labels[j].
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
- See the paper Modularity and community structure in networks by M. E. J. Newman
for a detailed definition.
!*/
// ----------------------------------------------------------------------------------------
double
modularity
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
const
std
::
vector
<
unsigned
long
>&
labels
);
/*!
requires
- labels.size() == max_index_plus_one(edges)
ensures
- Interprets edges as a directed graph. That is, it contains the edges on the
said graph and the ordered_sample_pair::distance() values define the edge
weights (larger values indicating a stronger edge connection between the
nodes). Note that, generally, modularity is only really defined for
undirected graphs. Therefore, the "directed graph" given to this function
should have symmetric edges between all nodes. The reason this function is
provided at all is because sometimes a vector of ordered_sample_pair objects
is a useful representation of an undirected graph.
- This function returns the modularity value obtained when the given input
graph is broken into subgraphs according to the contents of labels. In
particular, we say that two nodes with indices i and j are in the same
subgraph or community if and only if labels[i] == labels[j].
- Duplicate edges are interpreted as if there had been just one edge with a
distance value equal to the sum of all the duplicate edges' distance values.
- See the paper Modularity and community structure in networks by M. E. J. Newman
for a detailed definition.
!*/
// ----------------------------------------------------------------------------------------
unsigned
long
newman_cluster
(
const
std
::
vector
<
ordered_sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
double
eps
=
1e-4
,
const
unsigned
long
max_iterations
=
2000
);
/*!
requires
- is_ordered_by_index(edges) == true
ensures
- This function performs the clustering algorithm described in the paper
Modularity and community structure in networks by M. E. J. Newman.
- This function interprets edges as a graph and attempts to find the labeling
that maximizes modularity(edges, #labels).
- returns the number of clusters found.
- #labels.size() == max_index_plus_one(edges)
- for all valid i:
- #labels[i] == the cluster ID of the node with index i in the graph.
- 0 <= #labels[i] < the number of clusters found
(i.e. cluster IDs are assigned contiguously and start at 0)
- The main computation of the algorithm is involved in finding an eigenvector
of a certain matrix. To do this, we use the power iteration. In particular,
each time we try to find an eigenvector we will let the power iteration loop
at most max_iterations times or until it reaches an accuracy of eps,
whichever comes first.
!*/
// ----------------------------------------------------------------------------------------
unsigned
long
newman_cluster
(
const
std
::
vector
<
sample_pair
>&
edges
,
std
::
vector
<
unsigned
long
>&
labels
,
const
double
eps
=
1e-4
,
const
unsigned
long
max_iterations
=
2000
);
/*!
ensures
- This function is identical to the above newman_cluster() routine except that
it operates on a vector of sample_pair objects instead of
ordered_sample_pairs. Therefore, this is simply a convenience routine. In
particular, it is implemented by transforming the given edges into
ordered_sample_pairs and then calling the newman_cluster() routine defined
above.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_MODULARITY_ClUSTERING_ABSTRACT_H__
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment