This problem was discussed on comp.lang.python, where a pyparsing solution was posted. The following is the equivalent Lepl solution:
>>> comma = Drop(',')
>>> none = Literal('None') >> (lambda x: None)
>>> bool = (Literal('True') | Literal('False')) >> (lambda x: x == 'True')
>>> ident = Word(Letter() | '_',
>>> Letter() | '_' | Digit())
>>> float_ = Float() >> float
>>> int_ = Integer() >> int
>>> str_ = String() | String("'")
>>> item = str_ | int_ | float_ | none | bool | ident
>>> with Separator(~Regexp(r'\s*')):
>>> value = Delayed()
>>> list_ = Drop('[') & value[:, comma] & Drop(']') > list
>>> tuple_ = Drop('(') & value[:, comma] & Drop(')') > tuple
>>> value += list_ | tuple_ | item
>>> arg = value >> 'arg'
>>> karg = (ident & Drop('=') & value > tuple) >> 'karg'
>>> expr = (karg | arg)[:, comma] & Drop(Eos()) > Node
>>> parser = expr.get_parse_string()
>>> ast = parser('True, type=rect, sizes=[3, 4], coords = ([1,2],[3,4])')
>>> print(ast[0])
Node
+- arg True
+- karg ('type', 'rect')
+- karg ('sizes', [3, 4])
`- karg ('coords', ([1, 2], [3, 4]))
>>> ast[0].arg
[True]
>>> ast[0].karg
[('type', 'rect'), ('sizes', [3, 4]), ('coords', ([1, 2], [3, 4]))]
>>> ast = parser('None, str="a string"')
>>> print(ast[0])
Node
+- arg None
`- karg ('str', 'a string')
>>> ast[0].arg
[None]
>>> ast[0].karg
[('str', 'a string')]
This example shows how different choices of Configuration can change the compilation and parsing times:
from gc import collect
from random import random
from timeit import timeit
from lepl import *
from lepl._example.support import Example
from lepl.support.lib import format
NUMBER = 10
REPEAT = 3
def default():
'''A simple parser we'll use as an example.'''
class Term(List): pass
class Factor(List): pass
class Expression(List): pass
expr = Delayed()
number = Float() >> float
with DroppedSpace():
term = number | '(' & expr & ')' > Term
muldiv = Any('*/')
factor = term & (muldiv & term)[:] > Factor
addsub = Any('+-')
expr += factor & (addsub & factor)[:] > Expression
line = expr & Eos()
return line
# These create a matcher for the parser above with different configurations
def clear():
matcher = default()
matcher.config.clear()
return matcher
def no_memo():
matcher = default()
matcher.config.no_memoize()
return matcher
def full_memo():
matcher = default()
matcher.config.auto_memoize(full=True)
return matcher
def slow():
matcher = default()
matcher.config.clear().trace().manage().auto_memoize(full=True)
return matcher
def nfa_regexp():
matcher = default()
matcher.config.clear().compile_to_nfa(force=True)
return matcher
def dfa_regexp():
matcher = default()
matcher.config.clear().compile_to_dfa(force=True)
return matcher
# Next, build all the tests, making sure that we pre-compile parsers where
# necessary and (important!) we avoid reusing a parser with a cache
data = [format('{0:4.2f} + {1:4.2f} * ({2:4.2f} + {3:4.2f} - {4:4.2f})',
random(), random(), random(), random(), random())
for i in range(NUMBER)]
matchers = [default, clear, no_memo, full_memo, slow, nfa_regexp, dfa_regexp]
def build_cached(factory):
matcher = factory()
matcher.config.clear_cache()
parser = matcher.get_parse()
def test():
for line in data:
parser(line)[0]
return test
def build_uncached(factory):
matcher = factory()
def test():
for line in data:
matcher.config.clear_cache()
matcher.parse(line)[0]
return test
tests = {}
for matcher in matchers:
tests[matcher] = {True: [], False: []}
for i in range(REPEAT):
tests[matcher][True].append(build_cached(matcher))
tests[matcher][False].append(build_uncached(matcher))
def run(matcher, cached, repeat):
'''Time the given test.'''
stmt = 'tests[{0}][{1}][{2}]()'.format(matcher.__name__, cached, repeat)
setup = 'from __main__ import tests, {0}'.format(matcher.__name__)
return timeit(stmt, setup, number=1)
def analyse(matcher, t_uncached_base=None, t_cached_base=None):
'''We do our own repeating so we can GC between attempts.'''
(t_uncached, t_cached) = ([], [])
for repeat in range(REPEAT):
collect()
t_uncached.append(run(matcher, False, repeat))
collect()
t_cached.append(run(matcher, True, repeat))
(t_uncached, t_cached) = (min(t_uncached), min(t_cached))
t_uncached = 1000.0 * t_uncached / NUMBER
t_cached = 1000.0 * t_cached / NUMBER
print(format('{0:>20s} {1:5.1f} {2:8s} {3:5.1f} {4:8s}',
matcher.__name__,
t_uncached, normalize(t_uncached, t_uncached_base),
t_cached, normalize(t_cached, t_cached_base)))
return (t_uncached, t_cached)
def normalize(time, base):
if base:
return '(x{0:5.2f})'.format(time / base)
else:
return ''
def main():
print('{0:d} iterations; time per iteration in ms (best of {1:d})\n'.format(
NUMBER, REPEAT))
print(format('{0:>35s} {1:s}', 're-compiled', 'cached'))
(t_uncached, t_cached) = analyse(default)
for matcher in matchers:
if matcher is not default:
analyse(matcher, t_uncached, t_cached)
if __name__ == '__main__':
main()
Running main() gives:
10 iterations; time per iteration in ms (best of 3)
re-compiled cached
default 78.4 4.2
clear 6.0 (x 0.08) 5.9 (x 1.39)
no_memo 68.6 (x 0.88) 4.1 (x 0.97)
full_memo 89.1 (x 1.14) 13.5 (x 3.19)
slow 157.1 (x 2.00) 133.3 (x31.55)
nfa_regexp 35.3 (x 0.45) 4.3 (x 1.01)
dfa_regexp 38.4 (x 0.49) 5.5 (x 1.30)
The first column describes the configuration — you can check the code to see exactly what was used in the function of the same name.
The second two columns are the time (and the ratio of that time relative to the default) for using a parser that is re–compiled for each parse. The time includes the work needed to compile the parser and is appropriate when you’re only using a matcher once.
The final two columns are the time (and the ratio of that time relative to the default) for re–using a cached parser. This doesn’t include the time needed to compile the parser and is appropriate for when you’re using the same matcher many times (in which case the compilation time is relatively unimportant).
Note that you don’t need to worry about caching parsers yourself — a matcher will automatically cache the parser when it is used. The test code is much more complex because it is trying to disable caching in various places.
What can we learn from these results?
For anyone interested in absolute speed, the values above are milliseconds required per iteration on a Dual Core laptop (a Lenovo X60, a couple of years old), with sufficient memory to avoid paging.
This is a simple example that shows how to parse data in a fixed, tabular format using the Columns() matcher:
def columns_example():
# http://www.swivel.com/data_sets/spreadsheet/1002196
table = '''
US Foreign Aid, top recipients, constant dollars
Year Iraq Israel Egypt
2005 6,981,200,000 2,684,100,000 1,541,900,000
2004 8,333,400,000 2,782,400,000 2,010,600,000
2003 4,150,000,000 3,878,300,000 1,849,600,000
2002 41,600,000 2,991,200,000 2,362,800,000
'''
spaces = ~Space()[:]
integer = (spaces & Digit()[1:, ~Optional(','), ...] & spaces) >> int
cols = Columns((4, integer),
# if we give widths, they follow on from each other
(16, integer),
# we can also specify column indices
((23, 36), integer),
# and then start with widths again
(16, integer))
# by default, Columns consumes a whole line (see skip argument), so
# for the whole table we only need to (1) drop the text and (2) put
# each row in a separate list.
parser = ~SkipTo(Digit(), include=False) & (cols > list)[:]
parser.parse(table)
columns_example()
Which prints:
[[2005, 6981200000, 2684100000, 1541900000],
[2004, 8333400000, 2782400000, 2010600000],
[2003, 4150000000, 3878300000, 1849600000],
[2002, 41600000, 2991200000, 2362800000]]
Here’s a simpler example of how to use offside parsing, as described in Line–Aware Parsing and the Offside Rule. The idea is that we have a configuration file format with named sections and subsections; in the subsections are name/value pairs:
from string import ascii_letters
from lepl import *
def config_parser():
word = Token(Any(ascii_letters)[1:, ...])
key_value = (word & ~Token(':') & word) > tuple
subsection = BLine(word) & (Block(BLine(key_value)[1:] > dict)) > list
section = BLine(word) & Block(subsection[1:]) > list
config_file = (section | ~Line(Empty()))[:] > list
config_file.config.default_line_aware(block_policy=rightmost)
return config_file.get_parse()
parser = config_parser()
parser('''
one
a
foo: bar
baz: poop
b
snozzle: berry
two
c
apple: orange
''')[0]
Which prints:
[['one', ['a', {'foo': 'bar', 'baz': 'poop'}], ['b', {'snozzle': 'berry'}]], ['two', ['c', {'apple': 'orange'}]]]
Note that the name/value pairs are in dictionaries; this is because we passed a list of tuples to dict().
Here are a set of progressively more complex parsers that split each line into a list of words.
We start with a simple parser that explicitly manages spaces:
>>> with DroppedSpace():
>>> line = (Word()[:] & Drop('\n')) > list
>>> lines = line[:]
>>> lines.parse('abc de f\n pqr\n')
[['abc', 'de', 'f'], ['pqr']]
Next, we use tokens (and spaces are handled automatically):
>>> word = Token(Word())
>>> newline = ~Token('\n')
>>> line = (word[:] & newline) > list
>>> lines = line[:]
>>> lines.parse('abc de f\n pqr\n')
[['abc', 'de', 'f'], ['pqr']]
Warning
The next matchers use line-aware parsing. This is not necessary to solve this problem (this feature is intended for “offside rule” or “significant whitespace” parsing) and I recommend using the matchers above. I am including these examples only because they help clarify how the line-aware parsing works.
The final matcher (with SOL etc) requires version 4.3.1; all the other line-aware matchers require version 4.3 (or greater).
We can also use line-aware parsing with tokens to handle the newline:
>>> word = Token(Word())
>>> line = (~LineAwareSol() & word[:] & ~LineAwareEol()) > list
>>> lines = line[:]
>>> lines.config.default_line_aware()
>>> lines.parse('abc de f\n pqr\n')
[['abc', 'de', 'f'], ['pqr']]
or, (almost) equivalently:
>>> word = Token(Word())
>>> line = Line(word[:]) > list
>>> lines = line[:]
>>> lines.config.default_line_aware()
>>> lines.parse('abc de f\n pqr\n')
[['abc', 'de', 'f'], ['pqr']]
And we can also match the line-aware symbols directly (without using tokens):
>>> with DroppedSpace():
>>> words = Word()[:]
>>> newline = ~Any('\n')
>>> line = (~SOL() & words & newline & ~EOL()) > list
>>> lines = line[:]
>>> lines.config.default_line_aware()
>>> lines.parse('abc de f\n pqr\n')
[['abc', 'de', 'f'], ['pqr']]