refs #4, #6 Support for lexical rules and NonTerminal/Terminal str/repr

Introduced grammar rule decorators. The Combine decorator is used to mark part of the grammar as a lexical rule. Such a rule always returns a Terminal; whitespace is preserved and comments are not matched.

refs #4, #6 Support for lexical rules and NonTerminal/Terminal str/repr
Introduced grammar rule decorators. The Combine decorator is used to mark part of the grammar as a lexical rule. Such a rule always returns a Terminal; whitespace is preserved and comments are not matched.
2951f542 · Igor Dejanovic · 713b5f2f · 2951f542 · 2951f542 · 2951f542
Commit 2951f542 authored Feb 12, 2014 by Igor Dejanovic
5 changed files
--- a/arpeggio/__init__.py
+++ b/arpeggio/__init__.py
@@ -151,9 +151,12 @@ class ParsingExpression(object):
    def _parse_intro(self, parser):
        if parser.debug:
            print "Parsing %s" % self.name
+        # Skip whitespaces if we are not in the lexical rule
+        if not parser._in_lex_rule:
            parser._skip_ws()
-        # Set the begining position in input stream of
+        # Set the beginning position in the input stream of
        # this parsing expression
        self.c_pos = parser.position
@@ -165,7 +168,7 @@ class ParsingExpression(object):
        c_pos = self.c_pos
        # Memoization.
-        # If this position is already parsed by this parser expression than use
+        # If this position is already parsed by this parser expression use
        # the result
        if c_pos in self.result_cache:
            if parser.debug:
@@ -181,7 +184,9 @@ class ParsingExpression(object):
        result = self._parse(parser)
-        if result:
+        # Create terminal or non-terminal if result is not
+        # already a Terminal.
+        if result and not isinstance(result, Terminal):
            if parser.reduce_tree:
                if isinstance(result, list):
                    if self.root:
@@ -360,6 +365,44 @@ class Empty(SyntaxPredicate):
        pass
+class Decorator(ParsingExpression):
+    """
+    Decorators are a special kind of parsing expression used to mark
+    the pexpression they contain and give it some special semantics.
+    For example, decorators are used to mark a pexpression as a lexical
+    rule (see :class:`Combine`).
+    """
+class Combine(Decorator):
+    """
+    Decorator that marks the contained pexpression as a lexeme (lexical) rule.
+    This rule will always return a single Terminal parse tree node whose
+    value is the concatenation of everything matched by the contained
+    expressions. While the rule is being parsed whitespaces are preserved
+    (not skipped) and comments are not matched.
+    """
+    def _parse(self, parser):
+        """
+        Match all contained expressions in order and fold them into one
+        Terminal.
+
+        Args:
+            parser: The parser driving the match. Its ``position`` is read
+                and, on failure, restored. Its ``_in_lex_rule`` flag is set
+                for the duration of the call and then put back to the value
+                it had on entry.
+
+        Returns:
+            Terminal: A node positioned at the start of the match whose
+                value is the joined string form of all flattened sub-results.
+
+        Raises:
+            NoMatch: Re-raised from a contained expression after the parser
+                position has been restored (backtracking).
+        """
+        # Remember the outer state so that a Combine nested inside another
+        # Combine does not switch lexical mode off for the enclosing rule.
+        outer_in_lex_rule = parser._in_lex_rule
+        parser._in_lex_rule = True
+        # Keep a local copy for backtracking: self.c_pos is reassigned on
+        # every entry, so it would be overwritten if this same expression
+        # were re-entered while its children are being parsed.
+        c_pos = self.c_pos = parser.position
+        try:
+            results = flatten(
+                [node.parse(parser) for node in self.nodes])
+            # Create terminal from result
+            return Terminal(self.rule if self.root else '', c_pos,
+                            "".join([str(result) for result in results]))
+        except NoMatch:
+            parser.position = c_pos  # Backtracking
+            raise
+        finally:
+            parser._in_lex_rule = outer_in_lex_rule
 class Match(ParsingExpression):
    """
    Base class for all classes that will try to match something from the input.
@@ -375,14 +418,15 @@ class Match(ParsingExpression):
        self._parse_intro(parser)
        if parser._in_parse_comment:
            return self._parse(parser)
        comments = []
        try:
            match = self._parse(parser)
        except NoMatch, nm:
-            # If not matched try to match comment
+            # If not matched and not in lexical rule try to match comment
            #TODO: Comment handling refactoring. Should think of better way to
            # handle comments.
-            if parser.comments_model:
+            if not parser._in_lex_rule and parser.comments_model:
                try:
                    parser._in_parse_comment = True
                    while True:
@@ -492,7 +536,7 @@ class EndOfFile(Match):
    def _parse(self, parser):
        if len(parser.input) == parser.position:
-            return Terminal(self.rule if self.root else '', self.c_pos, 'EOF')
+            return Terminal('EOF', self.c_pos, '')
        else:
            if parser.debug:
                print "EOF not matched."
@@ -535,6 +579,8 @@ class Terminal(ParseTreeNode):
    Leaf node of the Parse Tree. Represents matched string.
    Attributes:
+        type (str): The name of the rule that created this terminal.
+        position (int): A position in the input stream where match occurred.
        value (str): Matched string at the given position or missing token
            name in the case of an error node.
    """
@@ -544,11 +590,17 @@ class Terminal(ParseTreeNode):
    @property
    def desc(self):
+        if self.value:
            return "%s '%s' [%s]" % (self.type, self.value, self.position)
+        else:
+            return "%s [%s]" % (self.type, self.position)
    def __str__(self):
        return self.value
+    def __repr__(self):
+        return self.desc
    def __eq__(self, other):
        return str(self) == str(other)
@@ -573,7 +625,10 @@ class NonTerminal(ParseTreeNode):
        return iter(self.nodes)
    def __str__(self):
-        return "[ %s ]" % ", ".join([str(x) for x in self.nodes])
+        return "".join([str(x) for x in self.nodes])
+    def __repr__(self):
+        return "[ %s ]" % ", ".join([repr(x) for x in self.nodes])
 # ----------------------------------------------------
@@ -625,6 +680,7 @@ class Parser(object):
        self.parse_tree = None
        self._in_parse_comment = False
+        self._in_lex_rule = False
    def parse(self, _input):
        self.position = 0  # Input position
@@ -823,7 +879,8 @@ class ParserPython(Parser):
                retval = expression
            elif isinstance(expression, Repetition) or \
-                    isinstance(expression, SyntaxPredicate):
+                    isinstance(expression, SyntaxPredicate) or \
+                    isinstance(expression, Decorator):
                retval = expression
                retval.nodes.append(inner_from_python(retval.elements))
                if any((isinstance(x, CrossRef) for x in retval.nodes)):

--- a/tests/test_decorator_combine.py
+++ b/tests/test_decorator_combine.py
+# -*- coding: utf-8 -*-
+#######################################################################
+# Name: test_decorator_combine
+# Purpose: Test for Combine decorator. Combine decorator
+#           results in Terminal parse tree node. Whitespaces are
+#           preserved  (they are not skipped) and comments are not matched.
+# Author: Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# Copyright: (c) 2014 Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# License: MIT License
+#######################################################################
+from unittest import TestCase
+from arpeggio import ParserPython, ZeroOrMore, OneOrMore, NonTerminal, Terminal, NoMatch, Combine
+from arpeggio.peg import ParserPEG
+class TestDecoratorCombine(TestCase):
+    """Tests for the Combine decorator (lexical rules)."""
+    def test_combine_python(self):
+        """Combine yields a single Terminal and does not skip whitespaces."""
+        # This will result in NonTerminal node
+        def root():     return my_rule(), "."
+        # This will result in Terminal node
+        def my_rule():  return Combine(ZeroOrMore("a"), OneOrMore("b"))
+        parser = ParserPython(root, debug=True)
+        input1 = "abbb."
+        # Whitespaces are preserved in lexical rules so the following input
+        # should not be recognized.
+        input2 = "a b bb."
+        ptree1 = parser.parse(input1)
+        # Pass the callable and its argument directly; no wrapper function
+        # or throwaway result variable is needed.
+        self.assertRaises(NoMatch, parser.parse, input2)
+        self.assertIsInstance(ptree1, NonTerminal)
+        self.assertIsInstance(ptree1.nodes[0], Terminal)
+        self.assertEqual(ptree1.nodes[0].value, "abbb")
--- a/tests/test_parsing_expressions.py
+++ b/tests/test_parsing_expressions.py
@@ -19,8 +19,10 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("a b c"))
+        parsed = parser.parse("a b c")
-        self.assertEqual(parsed, "[ a, b, c ]")
+        self.assertEqual(str(parsed), "abc")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'b' [2],  'c' [4] ]")
    def test_ordered_choice(self):
@@ -28,11 +30,14 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("b"))
+        parsed = parser.parse("b")
-        self.assertEqual(parsed, "[ b, EOF ]")
+        self.assertEqual(str(parsed), "b")
+        self.assertEqual(repr(parsed), "[  'b' [0], EOF [1] ]")
-        parsed = str(parser.parse("c"))
+        parsed = parser.parse("c")
-        self.assertEqual(parsed, "[ c, EOF ]")
+        self.assertEqual(str(parsed), "c")
+        self.assertEqual(repr(parsed), "[  'c' [0], EOF [1] ]")
        self.assertRaises(NoMatch, lambda: parser.parse("ab"))
        self.assertRaises(NoMatch, lambda: parser.parse("bb"))
@@ -43,11 +48,15 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("aaaaaaa"))
+        parsed = parser.parse("aaaaaaa")
-        self.assertEqual(parsed, "[ a, a, a, a, a, a, a, EOF ]")
+        self.assertEqual(str(parsed), "aaaaaaa")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'a' [1],  'a' [2],  'a' [3],  'a' [4],  'a' [5],  'a' [6], EOF [7] ]")
+        parsed = parser.parse("")
-        parsed = str(parser.parse(""))
+        self.assertEqual(str(parsed), "")
-        self.assertEqual(parsed, "[ EOF ]")
+        self.assertEqual(repr(parsed), "[ EOF [0] ]")
        self.assertRaises(NoMatch, lambda: parser.parse("bbb"))
@@ -57,8 +66,10 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("aaaaaaa"))
+        parsed = parser.parse("aaaaaaa")
-        self.assertEqual(parsed, "[ a, a, a, a, a, a, a ]")
+        self.assertEqual(str(parsed), "aaaaaaa")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'a' [1],  'a' [2],  'a' [3],  'a' [4],  'a' [5],  'a' [6] ]")
        self.assertRaises(NoMatch, lambda: parser.parse(""))
        self.assertRaises(NoMatch, lambda: parser.parse("bbb"))
@@ -69,11 +80,15 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("ab"))
+        parsed = parser.parse("ab")
-        self.assertEqual(parsed, "[ a, b, EOF ]")
+        self.assertEqual(str(parsed), "ab")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'b' [1], EOF [2] ]")
-        parsed = str(parser.parse("b"))
+        parsed = parser.parse("b")
-        self.assertEqual(parsed, "[ b, EOF ]")
+        self.assertEqual(str(parsed), "b")
+        self.assertEqual(repr(parsed),  "[  'b' [0], EOF [1] ]")
        self.assertRaises(NoMatch, lambda: parser.parse("aab"))
        self.assertRaises(NoMatch, lambda: parser.parse(""))
@@ -87,8 +102,9 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("ab"))
+        parsed = parser.parse("ab")
-        self.assertEqual(parsed, "[ a, b, EOF ]")
+        self.assertEqual(str(parsed), "ab")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'b' [1], EOF [2] ]")
        # 'And' will try to match 'b' and fail so 'c' will never get matched
        self.assertRaises(NoMatch, lambda: parser.parse("ac"))
@@ -101,8 +117,10 @@ class TestParsingExpression(TestCase):
        parser = ParserPython(grammar)
-        parsed = str(parser.parse("ac"))
+        parsed = parser.parse("ac")
-        self.assertEqual(parsed, "[ a, c, EOF ]")
+        self.assertEqual(str(parsed), "ac")
+        self.assertEqual(repr(parsed), "[  'a' [0],  'c' [1], EOF [2] ]")
        # 'Not' will fail on 'b'
        self.assertRaises(NoMatch, lambda: parser.parse("ab"))

--- a/tests/test_peg_parser.py
+++ b/tests/test_peg_parser.py
@@ -37,7 +37,8 @@ class TestPEGParser(TestCase):
        result = parser.parse(input)
        self.assertTrue(isinstance(result, NonTerminal))
-        self.assertEqual(str(result), "[ [ [ [ 4 ] ], +, [ [ 5 ], *, [ 7 ], /, [ 3.45 ], *, [ -, 45 ], *, [ (, [ [ [ 2.56 ] ], +, [ [ 32 ] ] ], ) ], /, [ -, 56 ], *, [ (, [ [ [ 2 ] ], -, [ [ 1.34 ] ] ], ) ] ] ], EOF ]")
+        self.assertEqual(str(result), "4+5*7/3.45*-45*(2.56+32)/-56*(2-1.34)")
+        self.assertEqual(repr(result), "[ [ [ [ number '4' [0] ] ],  '+' [1], [ [ number '5' [2] ],  '*' [3], [ number '7' [4] ],  '/' [5], [ number '3.45' [6] ],  '*' [10], [  '-' [11], number '45' [12] ],  '*' [14], [  '(' [15], [ [ [ number '2.56' [16] ] ],  '+' [20], [ [ number '32' [21] ] ] ],  ')' [23] ],  '/' [24], [  '-' [25], number '56' [26] ],  '*' [28], [  '(' [29], [ [ [ number '2' [30] ] ],  '-' [31], [ [ number '1.34' [32] ] ] ],  ')' [36] ] ] ], EOF [37] ]")
    def test_reduce_tree(self):
@@ -47,4 +48,5 @@ class TestPEGParser(TestCase):
        self.assertTrue(isinstance(result, NonTerminal))
-        self.assertEqual(str(result), "[ [ 4, +, [ 5, *, 7, /, 3.45, *, [ -, 45 ], *, [ (, [ 2.56, +, 32 ], ) ], /, [ -, 56 ], *, [ (, [ 2, -, 1.34 ], ) ] ] ], EOF ]" )
+        self.assertEqual(str(result),"4+5*7/3.45*-45*(2.56+32)/-56*(2-1.34)")
+        self.assertEqual(repr(result), "[ [ number '4' [0],  '+' [1], [ number '5' [2],  '*' [3], number '7' [4],  '/' [5], number '3.45' [6],  '*' [10], [  '-' [11], number '45' [12] ],  '*' [14], [  '(' [15], [ number '2.56' [16],  '+' [20], number '32' [21] ],  ')' [23] ],  '/' [24], [  '-' [25], number '56' [26] ],  '*' [28], [  '(' [29], [ number '2' [30],  '-' [31], number '1.34' [32] ],  ')' [36] ] ] ], EOF [37] ]")
--- a/tests/test_python_parser.py
+++ b/tests/test_python_parser.py
@@ -41,7 +41,8 @@ class TestPythonParser(TestCase):
        result = parser.parse(input)
        self.assertTrue(isinstance(result, NonTerminal))
-        self.assertEqual(str(result), "[ [ [ [ 4 ] ], +, [ [ 5 ], *, [ 7 ], /, [ 3.45 ], *, [ -, 45 ], *, [ (, [ [ [ 2.56 ] ], +, [ [ 32 ] ] ], ) ], /, [ -, 56 ], *, [ (, [ [ [ 2 ] ], -, [ [ 1.34 ] ] ], ) ] ] ], EOF ]")
+        self.assertEqual(str(result), "4+5*7/3.45*-45*(2.56+32)/-56*(2-1.34)")
+        self.assertEqual(repr(result), "[ [ [ [ number '4' [0] ] ],  '+' [1], [ [ number '5' [2] ],  '*' [3], [ number '7' [4] ],  '/' [5], [ number '3.45' [6] ],  '*' [10], [  '-' [11], number '45' [12] ],  '*' [14], [  '(' [15], [ [ [ number '2.56' [16] ] ],  '+' [20], [ [ number '32' [21] ] ] ],  ')' [23] ],  '/' [24], [  '-' [25], number '56' [26] ],  '*' [28], [  '(' [29], [ [ [ number '2' [30] ] ],  '-' [31], [ [ number '1.34' [32] ] ] ],  ')' [36] ] ] ], EOF [37] ]")
    def test_reduce_tree(self):
@@ -51,5 +52,6 @@ class TestPythonParser(TestCase):
        self.assertTrue(isinstance(result, NonTerminal))
-        self.assertEqual(str(result), "[ [ 4, +, [ 5, *, 7, /, 3.45, *, [ -, 45 ], *, [ (, [ 2.56, +, 32 ], ) ], /, [ -, 56 ], *, [ (, [ 2, -, 1.34 ], ) ] ] ], EOF ]" )
+        self.assertEqual(str(result),"4+5*7/3.45*-45*(2.56+32)/-56*(2-1.34)")
+        self.assertEqual(repr(result), "[ [ number '4' [0],  '+' [1], [ number '5' [2],  '*' [3], number '7' [4],  '/' [5], number '3.45' [6],  '*' [10], [  '-' [11], number '45' [12] ],  '*' [14], [  '(' [15], [ number '2.56' [16],  '+' [20], number '32' [21] ],  ')' [23] ],  '/' [24], [  '-' [25], number '56' [26] ],  '*' [28], [  '(' [29], [ number '2' [30],  '-' [31], number '1.34' [32] ],  ')' [36] ] ] ], EOF [37] ]")