Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
D
dlib
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
钟尚武
dlib
Commits
1efcfb3d
Commit
1efcfb3d
authored
May 14, 2013
by
Davis King
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Made the sequence_segmenter work with both BIO and BILOU tagging models.
parent
6d623eeb
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
219 additions
and
70 deletions
+219
-70
sequence_segmenter.h
dlib/svm/sequence_segmenter.h
+107
-23
sequence_segmenter_abstract.h
dlib/svm/sequence_segmenter_abstract.h
+49
-28
structural_sequence_segmentation_trainer.h
dlib/svm/structural_sequence_segmentation_trainer.h
+40
-10
sequence_segmenter.cpp
dlib/test/sequence_segmenter.cpp
+23
-9
No files found.
dlib/svm/sequence_segmenter.h
View file @
1efcfb3d
...
...
@@ -16,9 +16,13 @@ namespace dlib
// ------------------------------------------------------------------------------------
// BIO/BILOU labels
// Integer tags assigned to each element of a sequence. The BIO model uses
// only BEGIN, INSIDE and OUTSIDE (num_labels() is 3 in that case); the BILOU
// model uses all five.

// BEGIN: first element of a segment. Under BILOU it is only used for segments
// with two or more elements; single-element segments get UNIT instead.
const
unsigned
int
BEGIN
=
0
;
// INSIDE: element of a segment after the first. Under BIO this includes the
// final element; under BILOU the final element is tagged LAST instead.
const
unsigned
int
INSIDE
=
1
;
// OUTSIDE: element that is not part of any segment. The trainer initializes
// every label to this value before marking the segments.
const
unsigned
int
OUTSIDE
=
2
;
// LAST (BILOU only): final element of a segment with two or more elements.
const
unsigned
int
LAST
=
3
;
// UNIT (BILOU only): the sole element of a single-element segment.
const
unsigned
int
UNIT
=
4
;
// ------------------------------------------------------------------------------------
...
...
@@ -52,13 +56,11 @@ namespace dlib
unsigned
long
num_features
()
const
{
const
int
base_dims
=
fe
.
num_features
();
return
num_labels
()
*
(
num_labels
()
+
// previous and current label
base_dims
*
fe
.
window_size
()
+
// window around current element
num_labels
()
*
base_dims
*
fe
.
window_size
()
// window around current element in conjunction with previous label
);
const
unsigned
long
NL
=
ss_feature_extractor
::
use_BIO_model
?
3
:
5
;
if
(
ss_feature_extractor
::
use_high_order_features
)
return
NL
*
NL
+
(
NL
*
NL
+
NL
)
*
fe
.
num_features
()
*
fe
.
window_size
();
else
return
NL
*
NL
+
NL
*
fe
.
num_features
()
*
fe
.
window_size
();
}
unsigned
long
order
()
const
...
...
@@ -68,7 +70,10 @@ namespace dlib
unsigned
long
num_labels
()
const
{
return
3
;
if
(
ss_feature_extractor
::
use_BIO_model
)
return
3
;
else
return
5
;
}
private
:
...
...
@@ -113,10 +118,58 @@ namespace dlib
unsigned
long
)
const
{
// Don't allow BIO label patterns that don't correspond to a sensical
// segmentation.
if
(
y
.
size
()
>
1
&&
y
(
0
)
==
INSIDE
&&
y
(
1
)
==
OUTSIDE
)
return
true
;
if
(
ss_feature_extractor
::
use_BIO_model
)
{
// Don't allow BIO label patterns that don't correspond to a sensical
// segmentation.
if
(
y
.
size
()
>
1
&&
y
(
0
)
==
INSIDE
&&
y
(
1
)
==
OUTSIDE
)
return
true
;
if
(
y
.
size
()
==
1
&&
y
(
0
)
==
INSIDE
)
return
true
;
}
else
{
// Don't allow BILOU label patterns that don't correspond to a sensical
// segmentation.
if
(
y
.
size
()
>
1
)
{
if
(
y
(
1
)
==
BEGIN
&&
y
(
0
)
==
OUTSIDE
)
return
true
;
if
(
y
(
1
)
==
BEGIN
&&
y
(
0
)
==
UNIT
)
return
true
;
if
(
y
(
1
)
==
BEGIN
&&
y
(
0
)
==
BEGIN
)
return
true
;
if
(
y
(
1
)
==
INSIDE
&&
y
(
0
)
==
BEGIN
)
return
true
;
if
(
y
(
1
)
==
INSIDE
&&
y
(
0
)
==
OUTSIDE
)
return
true
;
if
(
y
(
1
)
==
INSIDE
&&
y
(
0
)
==
UNIT
)
return
true
;
if
(
y
(
1
)
==
OUTSIDE
&&
y
(
0
)
==
INSIDE
)
return
true
;
if
(
y
(
1
)
==
OUTSIDE
&&
y
(
0
)
==
LAST
)
return
true
;
if
(
y
(
1
)
==
LAST
&&
y
(
0
)
==
INSIDE
)
return
true
;
if
(
y
(
1
)
==
LAST
&&
y
(
0
)
==
LAST
)
return
true
;
if
(
y
(
1
)
==
UNIT
&&
y
(
0
)
==
INSIDE
)
return
true
;
if
(
y
(
1
)
==
UNIT
&&
y
(
0
)
==
LAST
)
return
true
;
}
else
{
if
(
y
(
0
)
==
INSIDE
)
return
true
;
if
(
y
(
0
)
==
LAST
)
return
true
;
}
}
return
false
;
}
...
...
@@ -146,7 +199,8 @@ namespace dlib
const
unsigned
long
off1
=
y
(
0
)
*
base_dims
;
dot_functor
<
feature_setter
>
fs1
(
set_feature
,
offset
+
off1
);
fe
.
get_features
(
fs1
,
x
,
pos
);
if
(
y
.
size
()
>
1
)
if
(
ss_feature_extractor
::
use_high_order_features
&&
y
.
size
()
>
1
)
{
const
unsigned
long
off2
=
num_labels
()
*
base_dims
+
(
y
(
0
)
*
num_labels
()
+
y
(
1
))
*
base_dims
;
dot_functor
<
feature_setter
>
fs2
(
set_feature
,
offset
+
off2
);
...
...
@@ -154,7 +208,10 @@ namespace dlib
}
}
offset
+=
num_labels
()
*
(
base_dims
+
num_labels
()
*
base_dims
);
if
(
ss_feature_extractor
::
use_high_order_features
)
offset
+=
num_labels
()
*
base_dims
+
num_labels
()
*
num_labels
()
*
base_dims
;
else
offset
+=
num_labels
()
*
base_dims
;
}
}
...
...
@@ -171,7 +228,11 @@ namespace dlib
const
feature_extractor
&
fe
)
{
return
3
*
3
+
12
*
fe
.
num_features
()
*
fe
.
window_size
();
const
unsigned
long
NL
=
feature_extractor
::
use_BIO_model
?
3
:
5
;
if
(
feature_extractor
::
use_high_order_features
)
return
NL
*
NL
+
(
NL
*
NL
+
NL
)
*
fe
.
num_features
()
*
fe
.
window_size
();
else
return
NL
*
NL
+
NL
*
fe
.
num_features
()
*
fe
.
window_size
();
}
// ----------------------------------------------------------------------------------------
...
...
@@ -272,18 +333,41 @@ namespace dlib
std
::
vector
<
unsigned
long
>
labels
;
labeler
.
label_sequence
(
x
,
labels
);
// Convert from BIO tagging to the explicit segments representation.
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
if
(
feature_extractor
::
use_BIO_model
)
{
if
(
labels
[
i
]
==
impl_ss
::
BEGIN
)
// Convert from BIO tagging to the explicit segments representation.
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
const
unsigned
long
begin
=
i
;
++
i
;
while
(
i
<
labels
.
size
()
&&
labels
[
i
]
==
impl_ss
::
INSIDE
)
if
(
labels
[
i
]
==
impl_ss
::
BEGIN
)
{
const
unsigned
long
begin
=
i
;
++
i
;
while
(
i
<
labels
.
size
()
&&
labels
[
i
]
==
impl_ss
::
INSIDE
)
++
i
;
y
.
push_back
(
std
::
make_pair
(
begin
,
i
));
--
i
;
}
}
}
else
{
// Convert from BILOU tagging to the explicit segments representation.
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
if
(
labels
[
i
]
==
impl_ss
::
BEGIN
)
{
const
unsigned
long
begin
=
i
;
++
i
;
while
(
i
<
labels
.
size
()
&&
labels
[
i
]
==
impl_ss
::
INSIDE
)
++
i
;
y
.
push_back
(
std
::
make_pair
(
begin
,
i
));
--
i
;
y
.
push_back
(
std
::
make_pair
(
begin
,
i
+
1
));
}
else
if
(
labels
[
i
]
==
impl_ss
::
UNIT
)
{
y
.
push_back
(
std
::
make_pair
(
i
,
i
+
1
));
}
}
}
}
...
...
dlib/svm/sequence_segmenter_abstract.h
View file @
1efcfb3d
...
...
@@ -26,25 +26,30 @@ namespace dlib
Where w is a parameter vector and the label sequence defines a segmentation
of x.
Recall that a sequence_segmenter uses the BIO tagging model and is also an
instantiation of the dlib::sequence_labeler. This means that each element
of the label sequence y takes on one of three possible values (B, I, or O)
and together these labels define a segmentation of the sequence. For example,
to represent a segmentation of the sequence of words "The dog ran to Bob Jones"
where only "Bob Jones" was segmented out we would use the label sequence OOOOBI.
Recall that a sequence_segmenter uses the BIO or BILOU tagging model and is
also an instantiation of the dlib::sequence_labeler. Selecting to use the
BIO model means that each element of the label sequence y takes on one of
three possible values (B, I, or O) and together these labels define a
segmentation of the sequence. For example, to represent a segmentation of
the sequence of words "The dog ran to Bob Jones" where only "Bob Jones" was
segmented out we would use the label sequence OOOOBI. The BILOU model is
similar except that it uses five different labels and each segment is
labeled as U, BL, BIL, BIIL, BIIIL, and so on depending on its length.
Therefore, the BILOU model is able to more explicitly model the ends of the
segments than the BIO model, but has more parameters to estimate.
Keeping
this in mind, the purpose of a sequence_segmenter is to take car
e
of the bookkeeping associated with creating BIO tagging models for
segmentation tasks. In particular, it presents the user with a simplified
version of the interface used by the dlib::sequence_labeler. It does this
by completely hiding the BIO tags from the user and instead exposes an
exp
licit sub-segment based labeling representation. It also simplifies the
construction of the PSI() feature vector.
Keeping all this in mind, the purpose of a sequence_segmenter is to take
care of the bookkeeping associated with creating BIO/BILOU tagging models
for segmentation tasks. In particular, it presents the user with a
simplified version of the interface used by the dlib::sequence_labeler. It
does this by completely hiding the BIO/BILOU tags from the user and instead
exposes an explicit sub-segment based labeling representation. It also
simplifies the construction of the PSI() feature vector.
Like in the dlib::sequence_labeler, PSI() is a sum of feature vectors, each
derived from the entire input sequence x but only part of the label
sequence y. In the case of the sequence_segmenter, we use an order one
model Markov. This means that
that
Markov model. This means that
PSI(x,y) == sum_i XI(x, y_{i-1}, y_{i}, i)
where the sum is taken over all the elements in the sequence. At each
element we extract a feature vector, XI(), that is expected to encode
...
...
@@ -61,12 +66,12 @@ namespace dlib
independent of any labeling. We denote this feature vector by ZI(x,i), where
x is the sequence and i is the position in question.
For example, suppose we use a window size of 3
, then we can put all this
together and define XI() in terms of ZI(). To do this, we can think of
XI() as containing 12*3 slots which contain either a zero vector or a ZI()
vector. Each combination of window position and labeling has a different
slot. To explain further, consider the following examples where we hav
e
annotated which parts of XI() correspond to each slot.
For example, suppose we use a window size of 3 and BIO tags, then we can
put all this together and define XI() in terms of ZI(). To do this, we can
think of XI() as containing 12*3 slots which contain either a zero vector
or a ZI() vector. Each combination of window position and labeling has a
different slot. To explain further, consider the following examples where
we have annotated which parts of XI() correspond to each slot.
If the previous and current label are both B and we use a window size of 3
then XI() would be instantiated as:
...
...
@@ -152,7 +157,10 @@ namespace dlib
0 \
0 > If previous label is O and current label is O
0] /
If we had instead used the BILOU tagging model the XI() vector would
have been similarly defined except that there would be 5*5+5 slots for
the various label combinations instead of 3*3+3.
Finally, while not shown here, we also include indicator features in XI()
to model label transitions: nine (3*3) for BIO tags, or 25 (5*5) for BILOU.
...
...
@@ -168,6 +176,19 @@ namespace dlib
// anything so long as it has a .size() which returns the length of the sequence.
typedef
the_type_used_to_represent_a_sequence
sequence_type
;
// If you want to use the BIO tagging model then set this bool to true. Set it to
// false to use the BILOU tagging model.
const
static
bool
use_BIO_model
=
true
;
// In the WHAT THIS OBJECT REPRESENTS section above we discussed how we model the
// conjunction of the previous label and the window around each position. Doing
// this greatly expands the size of the parameter vector w. You can optionally
// disable these higher order features by setting the use_high_order_features bool
// to false. This will cause XI() to include only slots which are independent of
// the previous label.
const
static
bool
use_high_order_features
=
true
;
example_feature_extractor
(
);
/*!
...
...
@@ -257,9 +278,8 @@ namespace dlib
- fe must be an object that implements an interface compatible with the
example_feature_extractor discussed above.
ensures
- returns 3*3 + 12*fe.num_features()*fe.window_size()
(i.e. returns the dimensionality of the PSI() vector defined by the given
feature extractor.
- returns the dimensionality of the PSI() vector defined by the given feature
extractor.
!*/
// ----------------------------------------------------------------------------------------
...
...
@@ -283,10 +303,11 @@ namespace dlib
contiguous words which refer to proper names.
The sequence_segmenter is implemented using the BIO (Begin, Inside,
Outside) sequence tagging model. Moreover, the sequence tagging is done
internally using a dlib::sequence_labeler object and therefore
sequence_segmenter objects are examples of chain structured conditional
random field style sequence taggers.
Outside) or BILOU (Begin, Inside, Last, Outside, Unit) sequence tagging
model. Moreover, the sequence tagging is done internally using a
dlib::sequence_labeler object and therefore sequence_segmenter objects are
examples of chain structured conditional random field style sequence
taggers.
THREAD SAFETY
It is always safe to use distinct instances of this object in different
...
...
dlib/svm/structural_sequence_segmentation_trainer.h
View file @
1efcfb3d
...
...
@@ -142,20 +142,50 @@ namespace dlib
<<
"
\n\t
this: "
<<
this
);
// convert y into tagged BIO labels
std
::
vector
<
std
::
vector
<
unsigned
long
>
>
labels
(
y
.
size
());
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
if
(
feature_extractor
::
use_BIO_model
)
{
labels
[
i
].
resize
(
x
[
i
].
size
(),
impl_ss
::
OUTSIDE
);
for
(
unsigned
long
j
=
0
;
j
<
y
[
i
].
size
();
++
j
)
// convert y into tagged BIO labels
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
const
unsigned
long
begin
=
y
[
i
][
j
].
first
;
const
unsigned
long
end
=
y
[
i
][
j
].
second
;
if
(
begin
!=
end
)
labels
[
i
].
resize
(
x
[
i
].
size
(),
impl_ss
::
OUTSIDE
);
for
(
unsigned
long
j
=
0
;
j
<
y
[
i
].
size
();
++
j
)
{
labels
[
i
][
begin
]
=
impl_ss
::
BEGIN
;
for
(
unsigned
long
k
=
begin
+
1
;
k
<
end
;
++
k
)
labels
[
i
][
k
]
=
impl_ss
::
INSIDE
;
const
unsigned
long
begin
=
y
[
i
][
j
].
first
;
const
unsigned
long
end
=
y
[
i
][
j
].
second
;
if
(
begin
!=
end
)
{
labels
[
i
][
begin
]
=
impl_ss
::
BEGIN
;
for
(
unsigned
long
k
=
begin
+
1
;
k
<
end
;
++
k
)
labels
[
i
][
k
]
=
impl_ss
::
INSIDE
;
}
}
}
}
else
{
// convert y into tagged BILOU labels
for
(
unsigned
long
i
=
0
;
i
<
labels
.
size
();
++
i
)
{
labels
[
i
].
resize
(
x
[
i
].
size
(),
impl_ss
::
OUTSIDE
);
for
(
unsigned
long
j
=
0
;
j
<
y
[
i
].
size
();
++
j
)
{
const
unsigned
long
begin
=
y
[
i
][
j
].
first
;
const
unsigned
long
end
=
y
[
i
][
j
].
second
;
if
(
begin
!=
end
)
{
if
(
begin
+
1
==
end
)
{
labels
[
i
][
begin
]
=
impl_ss
::
UNIT
;
}
else
{
labels
[
i
][
begin
]
=
impl_ss
::
BEGIN
;
for
(
unsigned
long
k
=
begin
+
1
;
k
+
1
<
end
;
++
k
)
labels
[
i
][
k
]
=
impl_ss
::
INSIDE
;
labels
[
i
][
end
-
1
]
=
impl_ss
::
LAST
;
}
}
}
}
}
...
...
dlib/test/sequence_segmenter.cpp
View file @
1efcfb3d
...
...
@@ -20,10 +20,14 @@ namespace
dlib
::
rand
rnd
;
template
<
bool
use_BIO_model_
,
bool
use_high_order_features_
>
class
unigram_extractor
{
public
:
const
static
bool
use_BIO_model
=
use_BIO_model_
;
const
static
bool
use_high_order_features
=
use_high_order_features_
;
typedef
std
::
vector
<
unsigned
long
>
sequence_type
;
std
::
map
<
unsigned
long
,
matrix
<
double
,
0
,
1
>
>
feats
;
...
...
@@ -64,12 +68,14 @@ namespace
};
void
serialize
(
const
unigram_extractor
&
item
,
std
::
ostream
&
out
)
// Writes a unigram_extractor to `out` by serializing its `feats` member.
// The two bool template parameters are compile-time settings of the type and
// are not written to the stream.
template
<
bool
use_BIO_model_
,
bool
use_high_order_features_
>
void
serialize
(
const
unigram_extractor
<
use_BIO_model_
,
use_high_order_features_
>&
item
,
std
::
ostream
&
out
)
{
serialize
(
item
.
feats
,
out
);
}
void
deserialize
(
unigram_extractor
&
item
,
std
::
istream
&
in
)
// Restores a unigram_extractor from `in` by deserializing its `feats` member,
// the only state that the matching serialize() overload writes.
template
<
bool
use_BIO_model_
,
bool
use_high_order_features_
>
void
deserialize
(
unigram_extractor
<
use_BIO_model_
,
use_high_order_features_
>&
item
,
std
::
istream
&
in
)
{
deserialize
(
item
.
feats
,
in
);
}
...
...
@@ -89,7 +95,7 @@ namespace
labels
.
resize
(
dataset_size
);
unigram_extractor
fe
;
unigram_extractor
<
true
,
true
>
fe
;
dlib
::
rand
rnd
;
for
(
unsigned
long
iter
=
0
;
iter
<
dataset_size
;
++
iter
)
...
...
@@ -161,22 +167,27 @@ namespace
// ----------------------------------------------------------------------------------------
template
<
bool
use_BIO_model
,
bool
use_high_order_features
>
void
do_test
()
{
dlog
<<
LINFO
<<
"use_BIO_model: "
<<
use_BIO_model
;
dlog
<<
LINFO
<<
"use_high_order_features: "
<<
use_high_order_features
;
std
::
vector
<
std
::
vector
<
unsigned
long
>
>
samples
;
std
::
vector
<
std
::
vector
<
std
::
pair
<
unsigned
long
,
unsigned
long
>
>
>
segments
;
make_dataset2
(
samples
,
segments
,
200
);
print_spinner
();
typedef
unigram_extractor
<
use_BIO_model
,
use_high_order_features
>
fe_type
;
unigram_extractor
fe_temp
;
unigram_extractor
fe_temp2
;
structural_sequence_segmentation_trainer
<
unigram_extractor
>
trainer
(
fe_temp2
);
fe_type
fe_temp
;
fe_type
fe_temp2
;
structural_sequence_segmentation_trainer
<
fe_type
>
trainer
(
fe_temp2
);
trainer
.
set_c
(
4
);
trainer
.
set_num_threads
(
1
);
sequence_segmenter
<
unigram_extractor
>
labeler
=
trainer
.
train
(
samples
,
segments
);
sequence_segmenter
<
fe_type
>
labeler
=
trainer
.
train
(
samples
,
segments
);
print_spinner
();
...
...
@@ -215,7 +226,7 @@ namespace
ostringstream
sout
;
serialize
(
labeler
,
sout
);
istringstream
sin
(
sout
.
str
());
sequence_segmenter
<
unigram_extractor
>
labeler2
;
sequence_segmenter
<
fe_type
>
labeler2
;
deserialize
(
labeler2
,
sin
);
res
=
test_sequence_segmenter
(
labeler2
,
samples
,
segments
);
...
...
@@ -238,7 +249,10 @@ namespace
void
perform_test
(
)
{
do_test
();
do_test
<
true
,
true
>
();
do_test
<
true
,
false
>
();
do_test
<
false
,
true
>
();
do_test
<
false
,
false
>
();
}
}
a
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment