1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 from lepl.support.lib import fmt
31 from lepl.support.context import NamespaceMixin
32 from lepl.matchers.support import BaseMatcher
33 from lepl.lexer.operators import TOKENS, TokenNamespace
34 from lepl.core.parser import tagged
35 from lepl.stream.core import s_empty, s_debug, s_stream, s_fmt, s_factory, \
36 s_max, s_new_max, s_id, s_global_kargs, s_delta, s_len, \
37 s_cache_level
38 from lepl.lexer.support import RuntimeLexerError
39 from lepl.regexp.core import Compiler
40
41
42
43
44 -class Lexer(NamespaceMixin, BaseMatcher):
45 '''
46 This takes a set of regular expressions and provides a matcher that
47 converts a stream into a stream of tokens, passing the new stream to
48 the embedded matcher.
49
50 It is added to the matcher graph by the lexer_rewriter; it is not
51 specified explicitly by the user.
52 '''
53
def __init__(self, matcher, tokens, alphabet, discard,
             t_regexp=None, s_regexp=None):
    '''
    Build a lexer that feeds a tokenised stream to `matcher`.

    matcher is the head of the original matcher graph, which will be
    called with a tokenised stream.

    tokens is the set of `Token` instances that define the lexer.

    alphabet is the alphabet for which the regexps are defined.

    discard is the regular expression for spaces (which are silently
    dropped if no token can be matched); it may be None, in which case
    no space regexp is compiled here.

    t_regexp and s_regexp are internally compiled state, used in
    cloning, and should not be provided by non-cloning callers.  When
    they are supplied, the corresponding compilation step is skipped.
    '''
    super(Lexer, self).__init__(TOKENS, TokenNamespace)

    if t_regexp is None:
        # Key the tokens by ID so that tokens sharing an ID contribute a
        # single entry to the combined regexp.
        by_id = {}
        for token in tokens:
            token.compile(alphabet)
            self._debug(fmt('Token: {0}', token))
            by_id[token.id_] = token
        # Tokens without a regexp are left out of the combined DFA.
        labelled = [(candidate.id_, candidate.regexp)
                    for candidate in by_id.values()
                    if candidate.regexp is not None]
        t_regexp = Compiler.multiple(alphabet, labelled).dfa()

    if s_regexp is None and discard is not None:
        s_regexp = Compiler.single(alphabet, discard).dfa()

    # NOTE(review): _arg/_karg are defined outside this view; presumably
    # they record the constructor arguments so the matcher can be cloned
    # - confirm against BaseMatcher.
    self._arg(matcher=matcher)
    self._arg(tokens=tokens)
    self._arg(alphabet=alphabet)
    self._arg(discard=discard)
    self._karg(t_regexp=t_regexp)
    self._karg(s_regexp=s_regexp)
89
91 '''
92 A utility that checks the known tokens for a given ID. The ID is used
93 internally, but is (by default) an unfriendly integer value. Note that
94 a lexed stream associates a chunk of input with a list of IDs - more
95 than one regexp may be a maximal match (and this is a feature, not a
96 bug).
97 '''
98 for token in self.tokens:
99 if token.id_ == id_:
100 return token
101
103 '''
104 Generate tokens, on demand.
105 '''
106 try:
107 id_ = s_id(stream)
108 while not s_empty(stream):
109
110 id_ += 1
111 try:
112 (terminals, match, next_stream) = \
113 self.t_regexp.match(stream)
114 self._debug(fmt('Token: {0!r} {1!r} {2!s}',
115 terminals, match, s_debug(stream)))
116 yield (terminals, s_stream(stream, match, max=max, id_=id_))
117 except TypeError:
118 (terminals, _size, next_stream) = \
119 self.s_regexp.size_match(stream)
120 self._debug(fmt('Space: {0!r} {1!s}',
121 terminals, s_debug(stream)))
122 stream = next_stream
123 except TypeError:
124 raise RuntimeLexerError(
125 s_fmt(stream,
126 'No token for {rest} at {location} of {text}.'))
127
128 @tagged
150