1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 Rewrite a matcher graph to include lexing.
32 '''
33
34 from collections import deque
35
36 from lepl.core.rewriters import Rewriter
37 from lepl.lexer.lexer import Lexer
38 from lepl.lexer.support import LexerError
39 from lepl.lexer.matchers import BaseToken, NonToken
40 from lepl.matchers.matcher import Matcher, is_child
41 from lepl.regexp.unicode import UnicodeAlphabet
42 from lepl.support.lib import fmt
43
44
def find_tokens(matcher):
    '''
    Collect the Tokens reachable from a matcher graph.

    NOTE(review): the `def` line was absent from the text under review; the
    name and parameter are reconstructed from the call `find_tokens(graph)`
    and from the body's use of `matcher` - confirm against the full file.

    The graph is walked breadth-first, starting at `matcher`.  For each
    node that has not been seen before:

    - if `is_child(node, NonToken)` holds, the node is remembered as a
      non-Token matcher;
    - if the node is a `BaseToken` it is added to the result and its
      `content` (when truthy) is handed to `assert_not_token`, which is
      expected to reject Tokens nested inside Tokens;
    - otherwise every child that is a `Matcher` is queued for a later
      visit.

    Returns the set of Tokens found.  Raises `LexerError` when the graph
    contains both Tokens and non-Token matchers.

    Open question (kept from the original notes): should we also check
    that a Token occurs somewhere on every path to a leaf node?
    '''
    found = set()
    seen = set()
    outside = set()
    pending = deque([matcher])
    while pending:
        node = pending.popleft()
        if node in seen:
            continue
        if is_child(node, NonToken):
            outside.add(node)
        if isinstance(node, BaseToken):
            found.add(node)
            # Token contents are not queued; they are only checked for
            # nested Tokens (sharing the same set of seen nodes).
            if node.content:
                assert_not_token(node.content, seen)
        else:
            pending.extend(child for child in node
                           if isinstance(child, Matcher))
        # Marked only after the checks above, so the node itself is not
        # yet in `seen` while its content is being examined.
        seen.add(node)
    if found and outside:
        offenders = '; '.join(str(n) for n in outside)
        raise LexerError(
            fmt('The grammar contains a mix of Tokens and non-Token '
                'matchers at the top level. If Tokens are used then '
                'non-token matchers that consume input must only '
                'appear "inside" Tokens. The non-Token matchers '
                'include: {0}.',
                offenders))
    return found
78
79
91
92
94 '''
95 This is required when using Tokens. It does the following:
96 - Find all tokens in the matcher graph
97 - Construct a lexer from the tokens
98 - Connect the lexer to the matcher
99 - Check that all children have a token parent
100 (and optionally add a default token)
101 Although possibly not in that order.
102
103 alphabet is the alphabet for which the regular expressions are defined.
104
105 discard is a regular expression that is used to match space (typically)
106 if no token can be matched (and which is then discarded)
107 '''
108
109 - def __init__(self, alphabet=None, discard=None, lexer=None):
120
122 tokens = find_tokens(graph)
123 if tokens:
124 self._debug(fmt('Found {0}', [token.id_ for token in tokens]))
125 return self.lexer(graph, tokens, self.alphabet, self.discard)
126 else:
127 self._info('Lexer rewriter used, but no tokens found.')
128 return graph
129