1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#-*- coding: utf-8 -*-
#######################################################################
# Name: bibtex.py
# Purpose: Parser for bibtex files
# Author: Igor R. Dejanovic <igor DOT dejanovic AT gmail DOT com>
# Copyright: (c) 2013-2014 Igor R. Dejanovic <igor DOT dejanovic AT gmail DOT com>
# License: MIT License
#
# This example demonstrates grammar and parser for bibtex files.
#######################################################################
from __future__ import print_function, unicode_literals
import pprint
import os
import sys
from arpeggio import *
from arpeggio import RegExMatch as _
# Grammar
def bibfile():
    """Root rule: any mix of comment entries, bib entries and free text, then EOF.

    The alternatives are given as a Python list, which Arpeggio's Python
    grammar notation treats as an ordered choice.
    """
    content = ZeroOrMore([comment_entry, bibentry, comment])
    return content, EOF


def comment_entry():
    """An ``@comment{...}`` entry whose body runs up to the first ``}``."""
    body = _(r'[^}]*')
    return "@comment", "{", body, "}"


def bibentry():
    """A typed entry: ``@type{key, field, field, ...}`` with at least one field."""
    more_fields = ZeroOrMore(",", field)
    return bibtype, "{", bibkey, ",", field, more_fields, "}"


def field():
    """A single ``name = value`` pair inside a bib entry."""
    return (fieldname, "=", fieldvalue)


def fieldvalue():
    """A field value: brace-delimited first, double-quoted as the fallback."""
    alternatives = [fieldvalue_braces, fieldvalue_quotes]
    return alternatives


def fieldvalue_braces():
    """A value written as ``{ ... }``."""
    return ("{", fieldvalue_braced_content, "}")


def fieldvalue_quotes():
    """A value written as ``" ... "``."""
    return ('"', fieldvalue_quoted_content, '"')
# Lexical rules
def fieldname():
    """Field name: one or more word characters or hyphens."""
    pattern = r'[-\w]+'
    return _(pattern)


def comment():
    """Free text outside entries: one or more characters other than ``@``."""
    pattern = r'[^@]+'
    return _(pattern)


def bibtype():
    """Entry type token: ``@`` followed by word characters."""
    pattern = r'@\w+'
    return _(pattern)


def bibkey():
    """Citation key: a run of characters containing no whitespace and no comma."""
    pattern = r'[^\s,]+'
    return _(pattern)


def fieldvalue_quoted_content():
    """Text of a quoted value; a backslash-escaped quote counts as content."""
    pattern = r'((\\")|[^"])*'
    return _(pattern)


def fieldvalue_braced_content():
    """Text of a braced value, allowing nested ``{...}`` groups.

    Every repetition is an optional nested group (only attempted when the
    next character is ``{``) followed by a mandatory ``fieldvalue_part``.
    The whole repetition is wrapped in ``Combine``.
    """
    nested_group = Optional(And("{"), fieldvalue_inner)
    repeated = ZeroOrMore(nested_group, fieldvalue_part)
    return Combine(repeated)


def fieldvalue_part():
    """One or more characters that are neither ``{`` nor ``}``."""
    pattern = r'((\\")|[^{}])+'
    return _(pattern)


def fieldvalue_inner():
    """A nested ``{...}`` group; recurses into ``fieldvalue_braced_content``."""
    return ("{", fieldvalue_braced_content, "}")
# Semantic actions visitor
class BibtexVisitor(PTNodeVisitor):
    """Semantic-actions visitor that turns a BibTeX parse tree into plain data.

    Each ``visit_<rule>`` method receives the parse-tree node for that
    grammar rule together with the already-processed results of its
    children, and returns the value that represents the node.
    """

    # LaTeX accent sequences and the Serbian letters they encode.
    _LATEX_TO_UNICODE = (
        (r"\'{c}", u"ć"),
        (r"\'{C}", u"Ć"),
        (r"\v{c}", u"č"),
        (r"\v{C}", u"Č"),
        (r"\v{z}", u"ž"),
        (r"\v{Z}", u"Ž"),
        (r"\v{s}", u"š"),
        (r"\v{S}", u"Š"),
    )

    # Characters removed from every field value once the LaTeX sequences
    # above have been converted (they still need their braces to match).
    _STRIPPED_CHARS = ("\n", "{", "}")

    def visit_bibfile(self, node, children):
        """
        Returns the list of bibentry maps found in the file.

        Children that are not dicts (everything except the results of
        ``visit_bibentry``) are dropped.
        """
        if self.debug:
            print("Processing Bibfile")

        return [child for child in children if isinstance(child, dict)]

    def visit_bibentry(self, node, children):
        """
        Constructs a map where key is bibentry field name.

        The entry key is stored under 'bibkey' and the entry type under
        'bibtype'; every remaining child is a (fieldname, fieldvalue) pair
        as produced by ``visit_field``.
        """
        if self.debug:
            print(" Processing bibentry %s" % children[1])

        bib_entry_map = {
            'bibtype': children[0],
            'bibkey': children[1],
        }
        for name, value in children[2:]:
            bib_entry_map[name] = value
        return bib_entry_map

    def visit_field(self, node, children):
        """
        Constructs a tuple (fieldname, fieldvalue).
        """
        if self.debug:
            print(" Processing field %s" % children[0])

        return (children[0], children[1])

    def visit_fieldvalue(self, node, children):
        """
        Converts Serbian letters from LaTeX encoding to Unicode.
        Removes braces. Removes newlines.
        """
        value = children[0]

        # Convert the accent sequences first: they contain braces, which
        # the stripping step below would otherwise destroy.
        for latex, letter in self._LATEX_TO_UNICODE:
            value = value.replace(latex, letter)

        # Plain string replacement is enough for three literal characters
        # and keeps this method independent of the ``re`` module, which
        # this file never imports explicitly.
        for char in self._STRIPPED_CHARS:
            value = value.replace(char, '')
        return value
def main(debug=False, file_name=None):
    """Parse a BibTeX file and return the result of the semantic analysis.

    Args:
        debug: Forwarded to both ``ParserPython`` and ``BibtexVisitor``.
        file_name: Path of the BibTeX file to parse. When falsy, the
            ``bibtex_example.bib`` file located next to this module is used.

    Returns:
        The value produced by ``visit_parse_tree`` with a ``BibtexVisitor``;
        for the root ``bibfile`` rule that is the list of bibentry maps.
    """
    # Function-scope import: the file has no explicit import of a module
    # for opening encoded text. ``io.open`` is the text-mode ``open`` on
    # both Python 2 and 3, honours ``encoding`` and translates newlines,
    # unlike ``codecs.open`` which reads in binary mode and is deprecated.
    import io

    # The parser model is given in the form of Python constructs, therefore
    # the ParserPython class is used to build the parser from ``bibfile``.
    parser = ParserPython(bibfile, reduce_tree=True, debug=debug)

    if not file_name:
        file_name = os.path.join(os.path.dirname(__file__),
                                 'bibtex_example.bib')

    with io.open(file_name, "r", encoding="utf-8") as bibtexfile:
        bibtexfile_content = bibtexfile.read()

    # Build the parse tree out of the textual input.
    parse_tree = parser.parse(bibtexfile_content)

    # visit_parse_tree starts the semantic analysis, which in this case
    # returns the list of bibentry maps.
    return visit_parse_tree(parse_tree, BibtexVisitor(debug=debug))
if __name__ == "__main__":
    # The BibTeX file to parse is expected as the first command-line argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python bibtex.py file_to_parse")
    else:
        # Run in debug mode: dot (graphviz) files for the parser model and
        # the parse tree will be created for visualization.
        # Check the current folder for the .dot files.
        entries = main(debug=True, file_name=cli_args[0])
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(entries)