Added the sequence_segmenter object.

4a527aa8 · Davis King · f5f3d07c · 4a527aa8 · 4a527aa8 · 4a527aa8
Commit 4a527aa8 authored May 12, 2013 by Davis King
Hide whitespace changes
Inline Side-by-side

Showing with 703 additions and 0 deletions

svm.h dlib/svm.h +1 -0

sequence_segmenter.h dlib/svm/sequence_segmenter.h +285 -0

sequence_segmenter_abstract.h dlib/svm/sequence_segmenter_abstract.h +417 -0

No files found.
--- a/dlib/svm.h
+++ b/dlib/svm.h
@@ -49,6 +49,7 @@
 #include "svm/assignment_function.h"
 #include "svm/active_learning.h"
 #include "svm/svr_linear_trainer.h"
+#include "svm/sequence_segmenter.h"
 #endif // DLIB_SVm_HEADER

--- a/dlib/svm/sequence_segmenter.h
+++ b/dlib/svm/sequence_segmenter.h
+// Copyright (C) 2013  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SEQUENCE_SeGMENTER_H___
+#define DLIB_SEQUENCE_SeGMENTER_H___
+#include "sequence_segmenter_abstract.h"
+#include "../matrix.h"
+#include "sequence_labeler.h"
+#include <vector>
+namespace dlib
+{
+    // This namespace contains implementation details for the sequence_segmenter.
+    namespace impl_ss
+    {
+    // ------------------------------------------------------------------------------------
+        // Numeric codes for the three BIO tags.  The feature_extractor below multiplies
+        // these values into feature vector offsets and reports num_labels() == 3, so
+        // they need to stay the distinct values 0, 1, and 2.
+        const unsigned int BEGIN   = 0;  // first element of a segment
+        const unsigned int INSIDE  = 1;  // element that continues a segment
+        const unsigned int OUTSIDE = 2;  // element that is not part of any segment
+    // ------------------------------------------------------------------------------------
+        template <typename ss_feature_extractor>
+        class feature_extractor
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a feature extractor for a sequence_labeler.  It serves to map
+                    the interface defined by a sequence_labeler into the kind of interface
+                    defined for a sequence_segmenter.
+                    It wraps a user supplied ss_feature_extractor (the fe member) and
+                    produces feature vectors with the following layout:
+                        - num_labels()*num_labels() label transition indicators.
+                        - Then, for each of the fe.window_size() window positions, a
+                          group of num_labels()*(1 + num_labels()) slots that are each
+                          fe.num_features() wide.  The first num_labels() slots are
+                          selected by the current label and the remaining ones by the
+                          pair of current and previous labels.
+            !*/
+        public:
+            typedef typename ss_feature_extractor::sequence_type sequence_type;
+            // The wrapped, user supplied, feature extractor.
+            ss_feature_extractor fe;
+            feature_extractor() {}
+            feature_extractor(const ss_feature_extractor& ss_fe_) : fe(ss_fe_) {}
+            // The wrapped feature extractor is the only state, so (de)serialization
+            // simply forwards to it.
+            friend void serialize(const feature_extractor& item, std::ostream& out)
+            {
+                serialize(item.fe, out);
+            }
+            friend void deserialize(feature_extractor& item, std::istream& in)
+            {
+                deserialize(item.fe, in);
+            }
+            // Returns the dimensionality of the vectors produced by get_features().
+            unsigned long num_features() const
+            {
+                const unsigned long base_dims = fe.num_features();
+                const unsigned long window_dims = base_dims*fe.window_size();
+                return num_labels()*(
+                    num_labels() +  // previous and current label
+                    window_dims + // window around current element
+                    num_labels()*window_dims // window around current element in conjunction with previous label
+                );
+            }
+            // Each label is modeled in the context of one previous label.
+            unsigned long order() const
+            {
+                return 1;
+            }
+            // The labels are BEGIN, INSIDE, and OUTSIDE.
+            unsigned long num_labels() const
+            {
+                return 3;
+            }
+        private:
+            template <typename feature_setter>
+            struct dot_functor
+            {
+                /*!
+                    WHAT THIS OBJECT REPRESENTS
+                        This class wraps the feature_setter used by a sequence_labeler
+                        and turns it into the kind needed by a sequence_segmenter.
+                        It does this by adding a fixed offset to every feature index
+                        before forwarding the call to the wrapped feature_setter.
+                !*/
+                dot_functor(feature_setter& set_feature_, unsigned long offset_) :
+                    set_feature(set_feature_), offset(offset_) {}
+                feature_setter& set_feature;
+                unsigned long offset;
+                inline void operator() (
+                    unsigned long feat_index
+                )
+                {
+                    set_feature(offset+feat_index);
+                }
+                inline void operator() (
+                    unsigned long feat_index,
+                    double feat_value
+                )
+                {
+                    set_feature(offset+feat_index, feat_value);
+                }
+            };
+        public:
+            // Returns true if the labels in y can't be part of a valid BIO tagging.
+            // y(0) is the label of the current element and y(1), when present, is the
+            // label of the previous element.
+            template <typename EXP>
+            bool reject_labeling (
+                const sequence_type& ,
+                const matrix_exp<EXP>& y,
+                unsigned long
+            ) const
+            {
+                // Don't allow BIO label patterns that don't correspond to a sensical
+                // segmentation.  An INSIDE label only makes sense as the continuation
+                // of a segment, so it can't directly follow an OUTSIDE label.
+                if (y.size() > 1 && y(0) == INSIDE && y(1) == OUTSIDE)
+                    return true;
+                // For the same reason a sequence can't start with INSIDE.  Since this is
+                // an order 1 model, the sequence_labeler only omits the previous label
+                // for the first element of a sequence.
+                if (y.size() == 1 && y(0) == INSIDE)
+                    return true;
+                return false;
+            }
+            // Emits, via set_feature, the sparse feature vector for the element at
+            // position given the current label y(0) and, when present, the previous
+            // label y(1).
+            template <typename feature_setter, typename EXP>
+            void get_features (
+                feature_setter& set_feature,
+                const sequence_type& x,
+                const matrix_exp<EXP>& y,
+                unsigned long position
+            ) const
+            {
+                const bool have_previous_label = y.size() > 1;
+                // Pull out an indicator feature for the type of transition between the
+                // previous label and the current label.
+                if (have_previous_label)
+                    set_feature(y(1)*num_labels() + y(0));
+                const unsigned long base_dims = fe.num_features();
+                // These are signed so the window can be centered on position without
+                // unsigned wrap around near the start of the sequence.
+                const long window_size = static_cast<long>(fe.window_size());
+                const long seq_length = static_cast<long>(x.size());
+                // Where, relative to the start of a window position's group of slots,
+                // the slots picked out by the labels in y begin.
+                const unsigned long off1 = y(0)*base_dims;
+                const unsigned long off2 = have_previous_label ?
+                    num_labels()*base_dims + (y(0)*num_labels()+y(1))*base_dims : 0;
+                // The number of features in each window position's group of slots.
+                const unsigned long window_stride = num_labels()*(base_dims + num_labels()*base_dims);
+                // The window features come right after the transition indicators.
+                unsigned long offset = num_labels()*num_labels();
+                for (long i = 0; i < window_size; ++i)
+                {
+                    const long pos = i-window_size/2 + static_cast<long>(position);
+                    // Window positions that fall outside x don't contribute anything.
+                    if (0 <= pos && pos < seq_length)
+                    {
+                        dot_functor<feature_setter> fs1(set_feature, offset+off1);
+                        fe.get_features(fs1, x, pos);
+                        if (have_previous_label)
+                        {
+                            dot_functor<feature_setter> fs2(set_feature, offset+off2);
+                            fe.get_features(fs2, x, pos);
+                        }
+                    }
+                    offset += window_stride;
+                }
+            }
+        };
+    } // end namespace impl_ss
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    unsigned long total_feature_vector_size (
+        const feature_extractor& fe
+    )
+    {
+        // The 3*3 term counts the label transition indicator features.  The rest are
+        // the 12 label dependent copies of each window position's features.
+        const unsigned long num_window_features = 12*fe.num_features()*fe.window_size();
+        return 3*3 + num_window_features;
+    }
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    class sequence_segmenter
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object segments a sequence by running a BIO tagging
+                sequence_labeler over it and then converting the resulting labels into
+                a list of [begin, end) index ranges.  See
+                sequence_segmenter_abstract.h for the full specification.
+        !*/
+    public:
+        typedef typename feature_extractor::sequence_type sample_sequence_type;
+        typedef std::vector<std::pair<unsigned long, unsigned long> > segmented_sequence_type;
+        sequence_segmenter()
+        {
+        }
+        explicit sequence_segmenter(
+            const matrix<double,0,1>& weights
+        ) :
+            labeler(weights)
+        {
+            // make sure requires clause is not broken
+            // (The feature extractor is looked up inside the assert, rather than stored
+            // in a local variable, so nothing is left unused when DLIB_ASSERT is
+            // compiled out.)
+            DLIB_ASSERT(total_feature_vector_size(get_feature_extractor()) == static_cast<unsigned long>(weights.size()),
+                "\t sequence_segmenter::sequence_segmenter(weights)"
+                << "\n\t These sizes should match"
+                << "\n\t total_feature_vector_size(fe):  " << total_feature_vector_size(get_feature_extractor())
+                << "\n\t weights.size(): " << weights.size()
+                << "\n\t this: " << this
+                );
+        }
+        sequence_segmenter(
+            const matrix<double,0,1>& weights,
+            const feature_extractor& fe
+        ) :
+            labeler(weights, impl_ss::feature_extractor<feature_extractor>(fe))
+        {
+            // make sure requires clause is not broken
+            DLIB_ASSERT(total_feature_vector_size(fe) == static_cast<unsigned long>(weights.size()),
+                "\t sequence_segmenter::sequence_segmenter(weights,fe)"
+                << "\n\t These sizes should match"
+                << "\n\t total_feature_vector_size(fe):  " << total_feature_vector_size(fe)
+                << "\n\t weights.size(): " << weights.size()
+                << "\n\t this: " << this
+                );
+        }
+        // Returns the user supplied feature extractor (i.e. not the internal wrapper
+        // given to the sequence_labeler).
+        const feature_extractor& get_feature_extractor (
+        ) const { return labeler.get_feature_extractor().fe; }
+        const matrix<double,0,1>& get_weights (
+        ) const { return labeler.get_weights(); }
+        // Returns the segments detected in x.  This is a by-value convenience wrapper
+        // around segment_sequence().
+        segmented_sequence_type operator() (
+            const sample_sequence_type& x
+        ) const
+        {
+            segmented_sequence_type y;
+            segment_sequence(x,y);
+            return y;
+        }
+        // Replaces the contents of y with the segments detected in x.  Each segment is
+        // a pair holding the index of its first element and one past the index of its
+        // last element.
+        void segment_sequence (
+            const sample_sequence_type& x,
+            segmented_sequence_type& y
+        ) const
+        {
+            y.clear();
+            std::vector<unsigned long> labels;
+            labeler.label_sequence(x, labels);
+            // Convert from BIO tagging to the explicit segments representation.
+            unsigned long i = 0;
+            while (i < labels.size())
+            {
+                // Only a BEGIN label starts a segment.  Everything else is skipped.
+                if (labels[i] != impl_ss::BEGIN)
+                {
+                    ++i;
+                    continue;
+                }
+                const unsigned long begin = i;
+                ++i;
+                // The segment extends over the run of INSIDE labels following the BEGIN.
+                while (i < labels.size() && labels[i] == impl_ss::INSIDE)
+                    ++i;
+                // i is now one past the end of the segment.  That element is examined
+                // by the next iteration since it might be the BEGIN of another segment.
+                y.push_back(std::make_pair(begin, i));
+            }
+        }
+        friend void serialize(const sequence_segmenter& item, std::ostream& out)
+        {
+            serialize(item.labeler, out);
+        }
+        friend void deserialize(sequence_segmenter& item, std::istream& in)
+        {
+            deserialize(item.labeler, in);
+        }
+    private:
+        // The BIO tagger that does the actual work.
+        sequence_labeler<impl_ss::feature_extractor<feature_extractor> > labeler;
+    };
+// ----------------------------------------------------------------------------------------
+}
+#endif // DLIB_SEQUENCE_SeGMENTER_H___
--- a/dlib/svm/sequence_segmenter_abstract.h
+++ b/dlib/svm/sequence_segmenter_abstract.h
+// Copyright (C) 2013  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_SEQUENCE_SeGMENTER_ABSTRACT_H___
+#ifdef DLIB_SEQUENCE_SeGMENTER_ABSTRACT_H___
+#include "../matrix.h"
+#include <vector>
+#include "sequence_labeler_abstract.h"
+namespace dlib
+{
+// ----------------------------------------------------------------------------------------
+    class example_feature_extractor
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object defines the interface a feature extractor must implement if it
+                is to be used with the sequence_segmenter defined at the bottom of this
+                file.  
+                The model used by sequence_segmenter objects is the following.  Given an
+                input sequence x, predict an output label sequence y such that:
+                    y == argmax_Y dot(w, PSI(x,Y))
+                Where w is a parameter vector and the label sequence defines a segmentation
+                of x.
+                Recall that a sequence_segmenter uses the BIO tagging model and is also an
+                instantiation of the dlib::sequence_labeler.  This means that each element
+                of the label sequence y takes on one of three possible values (B, I, or O)
+                and together these labels define a segmentation of the sequence.  For example, 
+                to represent a segmentation of the sequence of words "The dog ran to Bob Jones" 
+                where only "Bob Jones" was segmented out we would use the label sequence OOOOBI.
+                Keeping this in mind, the purpose of a sequence_segmenter is to take care
+                of the bookkeeping associated with creating BIO tagging models for
+                segmentation tasks.  In particular, it presents the user with a simplified
+                version of the interface used by the dlib::sequence_labeler.  It does this
+                by completely hiding the BIO tags from the user and instead exposes an
+                explicit sub-segment based labeling representation.  It also simplifies the
+                construction of the PSI() feature vector. 
+                Like in the dlib::sequence_labeler, PSI() is a sum of feature vectors, each
+                derived from the entire input sequence x but only part of the label
+                sequence y.  In the case of the sequence_segmenter, we use an order one
+                Markov model.  This means that 
+                    PSI(x,y) == sum_i XI(x, y_{i-1}, y_{i}, i)
+                where the sum is taken over all the elements in the sequence.  At each
+                element we extract a feature vector, XI(), that is expected to encode
+                important details of what the i-th position of the sequence looks like in
+                the context of the current and previous labels.  To do this, XI() is
+                allowed to look at any part of the input sequence x, the current and
+                previous labels, and of course it must also know the position in question, i.  
+                The sequence_segmenter simplifies this further by decomposing XI() into
+                components which model the current window around each position as well as
+                the conjunction of the current window around each position and the previous
+                label.  In particular, the sequence_segmenter only asks a user to provide a
+                single feature vector which characterizes a position of the sequence
+                independent of any labeling.  We denote this feature vector by ZI(x,i), where
+                x is the sequence and i is the position in question.  
+                For example, suppose we use a window size of 3, then we can put this all
+                together and define XI() in terms of ZI().  To do this, we can think of
+                XI() as containing 12*3 slots which contain either a zero vector or a ZI()
+                vector.  Each combination of window position and labeling has a different
+                slot.  To explain further, consider the following examples where we have
+                annotated which parts of XI() correspond to each slot.  
+                If the previous and current label are both B and we use a window size of 3
+                then XI() would be instantiated as:
+                    XI(x, B, B, i) = [ZI(x,i-1)  \ 
+                                      ZI(x,i)     > If current label is B
+                                      ZI(x,i+1)  /  
+                                      0          \                        
+                                      0           > If current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If current label is O 
+                                      0          /  
+                                      ZI(x,i-1)  \ 
+                                      ZI(x,i)     > If previous label is B and current label is B
+                                      ZI(x,i+1)  /  
+                                      0          \                        
+                                      0           > If previous label is B and current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If previous label is B and current label is O 
+                                      0          /  
+                                      0          \ 
+                                      0           > If previous label is I and current label is B
+                                      0          /  
+                                      0          \                        
+                                      0           > If previous label is I and current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If previous label is I and current label is O 
+                                      0          /  
+                                      0          \ 
+                                      0           > If previous label is O and current label is B
+                                      0          /  
+                                      0          \                        
+                                      0           > If previous label is O and current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If previous label is O and current label is O 
+                                      0]         /  
+                If the previous label is O and the current label is I and we use a window
+                size of 3 then XI() would be instantiated as:
+                    XI(x, O, I, i) = [0          \ 
+                                      0           > If current label is B
+                                      0          /  
+                                      ZI(x,i-1)  \                        
+                                      ZI(x,i)     > If current label is I 
+                                      ZI(x,i+1)  /                        
+                                      0          \                        
+                                      0           > If current label is O 
+                                      0          /  
+                                      0          \ 
+                                      0           > If previous label is B and current label is B
+                                      0          /  
+                                      0          \                        
+                                      0           > If previous label is B and current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If previous label is B and current label is O 
+                                      0          /  
+                                      0          \ 
+                                      0           > If previous label is I and current label is B
+                                      0          /  
+                                      0          \                        
+                                      0           > If previous label is I and current label is I 
+                                      0          /                        
+                                      0          \                        
+                                      0           > If previous label is I and current label is O 
+                                      0          /  
+                                      0          \ 
+                                      0           > If previous label is O and current label is B
+                                      0          /  
+                                      ZI(x,i-1)  \                        
+                                      ZI(x,i)     > If previous label is O and current label is I 
+                                      ZI(x,i+1)  /                        
+                                      0          \                        
+                                      0           > If previous label is O and current label is O 
+                                      0]         /  
+                    Finally, while not shown here, we also include nine indicator features
+                    in XI() to model label transitions.  
+            THREAD SAFETY
+                Instances of this object are required to be threadsafe, that is, it should
+                be safe for multiple threads to make concurrent calls to the member
+                functions of this object.
+        !*/
+    public:
+        // This should be the type used to represent an input sequence.  It can be
+        // anything so long as it has a .size() which returns the length of the sequence.
+        typedef the_type_used_to_represent_a_sequence sequence_type;
+        example_feature_extractor (
+        ); 
+        /*!
+            ensures
+                - this object is properly initialized
+        !*/
+        unsigned long num_features(
+        ) const; 
+        /*!
+            ensures
+                - returns the dimensionality of the ZI() feature vector.  This number is
+                  always >= 1
+        !*/
+        unsigned long window_size(
+        ) const;
+        /*!
+            ensures
+                - returns the size of the window ZI() vectors are extracted from.  This
+                  number is always >= 1.
+        !*/
+        template <typename feature_setter>
+        void get_features (
+            feature_setter& set_feature,
+            const sequence_type& x,
+            unsigned long position
+        ) const;
+        /*!
+            requires
+                - position < x.size()
+                - set_feature is a function object which allows expressions of the form:
+                    - set_feature((unsigned long)feature_index, (double)feature_value);
+                    - set_feature((unsigned long)feature_index);
+            ensures
+                - This function computes the ZI(x,position) feature vector.  This is a
+                  feature vector which should capture the properties of x[position] that
+                  are informative relative to the sequence segmentation task you are trying
+                  to perform.
+                - ZI(x,position) is returned as a sparse vector by invoking set_feature().
+                  For example, to set the feature with an index of 55 to the value of 1
+                  this method would call:
+                    set_feature(55);
+                  Or equivalently:
+                    set_feature(55,1);
+                  Therefore, the first argument to set_feature is the index of the feature
+                  to be set while the second argument is the value the feature should take.
+                  Additionally, note that calling set_feature() multiple times with the
+                  same feature index does NOT overwrite the old value, it adds to the
+                  previous value.  For example, if you call set_feature(55) 3 times then it
+                  will result in feature 55 having a value of 3.
+                - This function only calls set_feature() with feature_index values < num_features()
+        !*/
+    };
+// ----------------------------------------------------------------------------------------
+    void serialize(
+        const example_feature_extractor& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support 
+    !*/
+    void deserialize(
+        example_feature_extractor& item, 
+        std::istream& in
+    );
+    /*!
+        provides deserialization support 
+    !*/
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    unsigned long total_feature_vector_size (
+        const feature_extractor& fe
+    );
+    /*!
+        requires
+            - fe must be an object that implements an interface compatible with the
+              example_feature_extractor discussed above.
+        ensures
+            - returns 3*3 + 12*fe.num_features()*fe.window_size()
+              (i.e. returns the dimensionality of the PSI() vector defined by the given
+              feature extractor)
+    !*/
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    class sequence_segmenter
+    {
+        /*!
+            REQUIREMENTS ON feature_extractor
+                It must be an object that implements an interface compatible with 
+                the example_feature_extractor discussed above.
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for segmenting a sequence of objects into a set of
+                non-overlapping chunks.  An example sequence segmentation task is to take
+                English sentences and identify all the named entities.  In this example,
+                you would be using a sequence_segmenter to find all the chunks of
+                contiguous words which refer to proper names.
+                The sequence_segmenter is implemented using the BIO (Begin, Inside,
+                Outside) sequence tagging model.  Moreover, the sequence tagging is done
+                internally using a dlib::sequence_labeler object and therefore
+                sequence_segmenter objects are examples of chain structured conditional
+                random field style sequence taggers. 
+            THREAD SAFETY
+                It is always safe to use distinct instances of this object in different
+                threads.  However, when a single instance is shared between threads then
+                the following rules apply:
+                    It is safe to call the const members of this object from multiple
+                    threads so long as the feature_extractor is also threadsafe.  This is
+                    because the const members are purely read-only operations.  However,
+                    any operation that modifies a sequence_segmenter is not threadsafe.
+        !*/
+    public:
+        typedef typename feature_extractor::sequence_type sample_sequence_type;
+        typedef std::vector<std::pair<unsigned long, unsigned long> > segmented_sequence_type;
+        sequence_segmenter(
+        );
+        /*!
+            ensures
+                - #get_feature_extractor() == feature_extractor() 
+                  (i.e. it will have its default value)
+                - #get_weights().size() == total_feature_vector_size(#get_feature_extractor())
+                - #get_weights() == 0
+        !*/
+        explicit sequence_segmenter(
+            const matrix<double,0,1>& weights
+        ); 
+        /*!
+            requires
+                - total_feature_vector_size(feature_extractor()) == weights.size()
+            ensures
+                - #get_feature_extractor() == feature_extractor() 
+                  (i.e. it will have its default value)
+                - #get_weights() == weights
+        !*/
+        sequence_segmenter(
+            const matrix<double,0,1>& weights,
+            const feature_extractor& fe
+        ); 
+        /*!
+            requires
+                - total_feature_vector_size(fe) == weights.size()
+            ensures
+                - #get_feature_extractor() == fe
+                - #get_weights() == weights
+        !*/
+        const feature_extractor& get_feature_extractor (
+        ) const; 
+        /*!
+            ensures
+                - returns the feature extractor used by this object.
+        !*/
+        const matrix<double,0,1>& get_weights (
+        ) const;
+        /*!
+            ensures
+                - returns the parameter vector associated with this sequence segmenter. 
+                  The length of the vector is total_feature_vector_size(get_feature_extractor()).  
+        !*/
+        segmented_sequence_type operator() (
+            const sample_sequence_type& x
+        ) const;
+        /*!
+            ensures
+                - Takes an input sequence and returns a list of detected segments within
+                  that sequence.
+                - returns a std::vector Y of segments such that:
+                    - Y.size() == the number of segments detected in the input sequence x.
+                    - for all valid i:
+                        - Y[i].first  == the start of the i-th segment.
+                        - Y[i].second == one past the end of the i-th segment.
+                        - Therefore, the i-th detected segment in x is composed of the elements
+                          x[Y[i].first], x[Y[i].first+1], ..., x[Y[i].second-1]
+                        - Y[i].first < x.size()
+                        - Y[i].second <= x.size()
+        !*/
+        void segment_sequence (
+            const sample_sequence_type& x,
+            segmented_sequence_type& y
+        ) const;
+        /*!
+            ensures
+                - #y == (*this)(x)
+                  (i.e. This is just another interface to the operator() routine
+                  above.  This one avoids returning the results by value and therefore
+                  might be a little faster in some cases)
+        !*/
+    };
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    void serialize (
+        const sequence_segmenter<feature_extractor>& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support 
+    !*/
+// ----------------------------------------------------------------------------------------
+    template <
+        typename feature_extractor
+        >
+    void deserialize (
+        sequence_segmenter<feature_extractor>& item,
+        std::istream& in 
+    );
+    /*!
+        provides deserialization support 
+    !*/
+// ----------------------------------------------------------------------------------------
+}
+#endif // DLIB_SEQUENCE_SeGMENTER_ABSTRACT_H___