Commit 07b5b2e3 authored by Davis King's avatar Davis King

filled out spec

parent 1493769d
...@@ -12,11 +12,27 @@ namespace dlib ...@@ -12,11 +12,27 @@ namespace dlib
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
template <typename T> template <
typename T
>
struct constituent struct constituent
{ {
/*! /*!
WHAT THIS OBJECT REPRESENTS WHAT THIS OBJECT REPRESENTS
This object represents the linguistic idea of a constituent, that is, a
group of words that functions as a single unit. In particular, it
represents a combination of two constituents into a new constituent.
Additionally, a constituent object represents a range of words relative to
some std::vector of words. The range is from [begin, end) (i.e. including
begin but not including end, so using the normal C++ iterator notation).
Moreover, a constituent is always composed of two parts, each having a tag.
Therefore, the left part is composed of the words in the range [begin,k)
and has tag left_tag while the right part of the constituent contains the
words in the range [k,end) and has the tag right_tag.
The tags are user defined objects of type T. In general, they are used to
represent syntactic categories such as noun phrase, verb phrase, etc.
!*/ !*/
unsigned long begin, end, k; unsigned long begin, end, k;
...@@ -24,7 +40,9 @@ namespace dlib ...@@ -24,7 +40,9 @@ namespace dlib
T right_tag; T right_tag;
}; };
template <typename T> template <
typename T
>
void serialize( void serialize(
const constituent<T>& item, const constituent<T>& item,
std::ostream& out std::ostream& out
...@@ -33,7 +51,9 @@ namespace dlib ...@@ -33,7 +51,9 @@ namespace dlib
provides serialization support provides serialization support
!*/ !*/
template <typename T> template <
typename T
>
void deserialize( void deserialize(
constituent<T>& item, constituent<T>& item,
std::istream& in std::istream& in
...@@ -51,24 +71,53 @@ namespace dlib ...@@ -51,24 +71,53 @@ namespace dlib
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
template <typename T> template <
typename T
>
struct parse_tree_element struct parse_tree_element
{ {
/*! /*!
WHAT THIS OBJECT REPRESENTS WHAT THIS OBJECT REPRESENTS
This object is used to represent a node in a binary parse tree. An entire
parse tree is represented by a std::vector of parse_tree_element objects.
We follow the convention that the first element of this vector is always
the root of the entire tree.
The fields of this object have the following interpretations:
- c == the constituent spanned by this node in the parse tree.
Therefore, the node spans the words in the range [c.begin, c.end).
- tag == the syntactic category of this node in the parse tree.
- score == the score or log likelihood for this parse tree. In
general, this is the sum of scores of all the production rules used
to build the tree rooted at the current node.
- let PT denote the vector of parse_tree_elements that defines an
entire parse tree. Then we have:
- if (left != END_OF_TREE) then
- PT[left] == the left sub-tree of the current node.
- PT[left] spans the words [c.begin, c.k)
- PT[left].tag == c.left_tag
- else
- there is no left sub-tree
- if (right != END_OF_TREE) then
- PT[right] == the right sub-tree of the current node.
- PT[right] spans the words [c.k, c.end)
- PT[right].tag == c.right_tag
- else
- there is no right sub-tree
!*/ !*/
constituent<T> c; constituent<T> c;
T tag; // id for the constituent corresponding to this level of the tree T tag;
double score;
// subtrees. These are the index values into the std::vector that contains all the parse_tree_elements.
unsigned long left; unsigned long left;
unsigned long right; unsigned long right;
double score; // score for this tree
}; };
template <typename T> template <
typename T
>
void serialize ( void serialize (
const parse_tree_element<T>& item, const parse_tree_element<T>& item,
std::ostream& out std::ostream& out
...@@ -77,7 +126,9 @@ namespace dlib ...@@ -77,7 +126,9 @@ namespace dlib
provides serialization support provides serialization support
!*/ !*/
template <typename T> template <
typename T
>
void deserialize ( void deserialize (
parse_tree_element<T>& item, parse_tree_element<T>& item,
std::istream& in std::istream& in
...@@ -90,20 +141,30 @@ namespace dlib ...@@ -90,20 +141,30 @@ namespace dlib
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
void example_production_rule_function ( void example_production_rule_function (
const std::vector<T>& sequence, const std::vector<T>& words,
const constituent<T>& c, const constituent<T>& c,
std::vector<std::pair<T,double> >& possible_tags std::vector<std::pair<T,double> >& possible_tags
) )
/*! /*!
requires requires
- 0 <= c.begin < c.k < c.end <= sequence.size() - 0 <= c.begin < c.k < c.end <= words.size()
- possible_tags.size() == 0 - possible_tags.size() == 0
ensures ensures
- finds all the production rules that can turn c into a single non-terminal. - Finds all the syntactic categories that can be used to label c and puts those
Puts the IDs of these rules and their scores into possible_tags. categories, along with their scores, into possible_tags. Or in other words,
this function determines which production rules can be used to turn the left
and right sub-constituents in c into a single constituent. The contents of c
have the following interpretations:
- The left sub-constituent has syntactic category c.left_tag
- for all i such that c.begin <= i < c.k:
- words[i] is part of the left sub-constituent.
- The right sub-constituent has syntactic category c.right_tag
- for all i such that c.k <= i < c.end:
- words[i] is part of the right sub-constituent.
- Note that example_production_rule_function() is not a real function. It is - Note that example_production_rule_function() is not a real function. It is
here just to show you how to define production rule producing functions here just to show you how to define production rule producing functions for
for use with the find_max_parse_cky() routine defined below. use with the find_max_parse_cky() routine defined below.
!*/ !*/
template < template <
...@@ -111,7 +172,7 @@ namespace dlib ...@@ -111,7 +172,7 @@ namespace dlib
typename production_rule_function typename production_rule_function
> >
void find_max_parse_cky ( void find_max_parse_cky (
const std::vector<T>& sequence, const std::vector<T>& words,
const production_rule_function& production_rules, const production_rule_function& production_rules,
std::vector<std::vector<parse_tree_element<T> > >& parse_trees std::vector<std::vector<parse_tree_element<T> > >& parse_trees
); );
...@@ -119,6 +180,25 @@ namespace dlib ...@@ -119,6 +180,25 @@ namespace dlib
requires requires
- production_rule_function == a function or function object with the same - production_rule_function == a function or function object with the same
interface as example_production_rule_function defined above. interface as example_production_rule_function defined above.
ensures
- Uses the CKY algorithm to find the most probable/highest scoring parse tree
of the given vector of words. The output is stored in #parse_trees.
- This function outputs a set of non-overlapping parse trees. Each parse tree
always spans the largest number of words possible, regardless of any other
considerations (except that the parse trees cannot have overlapping word
spans). For example, this function will never select a smaller parse tree,
even if it would have a better score, if it can possibly build a larger tree.
Therefore, this function will only output multiple parse trees if it is
impossible to form words into a single parse tree.
- This function uses production_rules() to find out what the allowed production
rules are. That is, production_rules() defines all properties of the grammar
used by find_max_parse_cky().
- for all valid i:
- #parse_trees[i].size() != 0
- #parse_trees[i][0] == the root of the i'th parse tree.
- #parse_trees[i][0].score == the score of the i'th parse tree.
- The i'th parse tree spans all the elements of words in the range
[#parse_trees[i][0].c.begin, #parse_trees[i][0].c.end).
!*/ !*/
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
...@@ -128,31 +208,79 @@ namespace dlib ...@@ -128,31 +208,79 @@ namespace dlib
{ {
/*! /*!
WHAT THIS OBJECT REPRESENTS WHAT THIS OBJECT REPRESENTS
This is the exception thrown by parse_tree_to_string() and
parse_tree_to_string_tagged() if the inputs are discovered to be invalid.
!*/ !*/
}; };
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
template <typename T, typename U> template <
typename T,
typename U
>
std::string parse_tree_to_string ( std::string parse_tree_to_string (
const std::vector<parse_tree_element<T> >& tree, const std::vector<parse_tree_element<T> >& tree,
const std::vector<U>& words const std::vector<U>& words
); );
/*! /*!
requires
- It must be possible to print U objects to an ostream using operator<<
(typically, U would be something like std::string)
ensures ensures
- - Interprets tree as a parse tree defined over the given sequence of words.
- returns a bracketed string that represents the parse tree over the words.
For example, suppose the following parse tree is input:
/\
/ \
/\ \
/ \ \
the dog ran
Then the output would be the string "[[the dog] ran]"
throws
- parse_tree_to_string_error
This exception is thrown if an invalid tree is detected. This might happen
if the tree refers to elements of words that don't exist because words is
shorter than it is supposed to be.
!*/ !*/
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
template <typename T, typename U> template <
typename T,
typename U
>
std::string parse_tree_to_string_tagged ( std::string parse_tree_to_string_tagged (
const std::vector<parse_tree_element<T> >& tree, const std::vector<parse_tree_element<T> >& tree,
const std::vector<U>& words const std::vector<U>& words
); );
/*! /*!
requires
- It must be possible to print T objects to an ostream using operator<<
- It must be possible to print U objects to an ostream using operator<<
(typically, U would be something like std::string)
ensures ensures
- - This function does the same thing as parse_tree_to_string() except that it
also includes the parse_tree_element::tag object in the output. Therefore,
the tag of each bracket will be included as the first token inside the
bracket. For example, suppose the following parse tree is input (where tags
are shown at the vertices):
S
/\
NP \
/\ \
/ \ \
the dog ran
Then the output would be the string "[S [NP the dog] ran]"
throws
- parse_tree_to_string_error
This exception is thrown if an invalid tree is detected. This might happen
if the tree refers to elements of words that don't exist because words is
shorter than it is supposed to be.
!*/ !*/
// ----------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment