1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 Generate and match a stream of tokens that are identified by regular
32 expressions.
33 '''
34
35
36
37 from abc import ABCMeta
38
39 from lepl.stream.core import s_empty, s_line, s_next, s_len
40 from lepl.lexer.support import LexerError
41 from lepl.lexer.operators import TOKENS, TokenNamespace
42 from lepl.lexer.stream import FilteredTokenHelper
43 from lepl.matchers.core import OperatorMatcher, Any, Literal, Lookahead, Regexp
44 from lepl.matchers.matcher import Matcher, add_children
45 from lepl.matchers.memo import NoMemo
46 from lepl.matchers.support import coerce_, trampoline_matcher_factory
47 from lepl.core.parser import tagged
48 from lepl.regexp.matchers import BaseRegexp
49 from lepl.regexp.rewriters import CompileRegexp
50 from lepl.regexp.unicode import UnicodeAlphabet
51 from lepl.support.lib import fmt, str
52
53
54
55
56
57
58
# Built by calling the metaclass directly rather than with class syntax.
# NOTE(review): presumably so the same source works on both Python 2 and
# Python 3 metaclass syntax - confirm.
NonToken = ABCMeta('NonToken', (object, ), {})
'''
ABC used to identify matchers that actually consume from the stream.  These
are the "leaf" matchers that "do the real work" and they cannot be used at
the same level as Tokens, but must be embedded inside them.

This is a purely informative interface used, for example, to generate
warnings for the user.  Not implementing this interface will not block any
functionality.
'''

# NOTE(review): add_children is defined in lepl.matchers.matcher; presumably
# it registers each listed matcher as a (virtual) subclass of NonToken -
# confirm against that module.
add_children(NonToken, Lookahead, Any, Literal, Regexp)
71
72
73
74
75
class BaseToken(OperatorMatcher, NoMemo):
    '''
    Introduce a token that will be recognised by the lexer.  A Token instance
    can be specialised to match particular contents by calling as a function.

    This is a base class that provides all the functionality, but doesn't
    set the regexp attribute.  This allows subclasses to provide a fixed
    value, while `Token` uses the constructor.
    '''

    # Running count used to build default ids ('Tk0', 'Tk1', ...).  It is
    # always read and written through BaseToken, so every subclass shares
    # the same sequence.
    __count = 0

    def __init__(self, content=None, id_=None, alphabet=None,
                 complete=True, compiled=False):
        '''
        Define a token that will be generated by the lexer.

        content is the optional matcher that will be invoked on the value
        of the token.  It is usually set via (), which clones this instance
        so that the same token can be used more than once.

        id_ is an optional unique identifier; if left empty one is generated
        from 'Tk' followed by a running count.

        alphabet is the alphabet associated with the regexp.  It should be
        set by the lexer rewriter, so that all instances share the same
        value (it appears in the constructor so that Tokens can be cloned).

        complete indicates whether any sub-matcher must completely exhaust
        the contents when matching.  It can be over-ridden for a particular
        sub-matcher via __call__().

        compiled should only be used internally.  It is a flag indicating
        that the Token has been processed by the rewriter (see below).

        A Token must be "compiled" --- this completes the configuration
        using a given alphabet and is done by the lexer_rewriter.  Care is
        taken to allow a Token to be cloned before or after compilation.
        '''
        super(BaseToken, self).__init__(name=TOKENS, namespace=TokenNamespace)
        # NOTE(review): _karg is inherited; presumably it stores each value
        # as an attribute and records it as a constructor argument (the
        # values are read back as self.<name> and via _constructor_args()
        # below) - confirm.
        self._karg(content=content)
        if id_ is None:
            id_ = 'Tk' + str(BaseToken.__count)
            BaseToken.__count += 1
        self._karg(id_=id_)
        self._karg(alphabet=alphabet)
        self._karg(complete=complete)
        self._karg(compiled=compiled)

    # NOTE(review): the listing this class was recovered from skips its own
    # lines 125-136 at this point; whatever was defined there is not
    # reproduced here and must be restored from the upstream source.

    # NOTE(review): the `def` line of this static method is missing from the
    # listing.  The parameters follow from the body; the method name is
    # reconstructed - confirm against upstream.
    @staticmethod
    def __to_regexp(regexp, alphabet):
        '''
        The regexp may be a matcher; if so we try to convert it to a regular
        expression and extract the equivalent text.

        Anything that is not a Matcher is returned unchanged.  A LexerError
        is raised if a Matcher cannot be converted.
        '''
        if isinstance(regexp, Matcher):
            rewriter = CompileRegexp(alphabet)
            rewrite = rewriter(regexp)
            # Only accept a result that really is a regexp matcher and that
            # carries at most one wrapped function.
            if isinstance(rewrite, BaseRegexp) and \
                    len(rewrite.wrapper.functions) <= 1:
                regexp = str(rewrite.regexp)
            else:
                raise LexerError(
                    fmt('A Token was specified with a matcher, '
                        'but the matcher could not be converted to '
                        'a regular expression: {0}', rewrite))
        return regexp

    def __call__(self, content, complete=None):
        '''
        Return a new token of the same type, specialised with the given
        content matcher.

        If complete is specified as True or False it overrides the value
        set in the constructor.  If True the content matcher must completely
        match the Token contents.
        '''
        args, kargs = self._constructor_args()
        kargs['complete'] = self.complete if complete is None else complete
        kargs['content'] = coerce_(content)
        return type(self)(*args, **kargs)

    @tagged
    def _match(self, stream):
        '''
        On matching we first assert that the token type is correct and then
        delegate to the content.

        Raises LexerError if the token has not been compiled.
        '''
        if not self.compiled:
            raise LexerError(
                fmt('A {0} token has not been compiled. '
                    'You must use the lexer rewriter with Tokens. '
                    'This can be done by using matcher.config.lexer().',
                    self.__class__.__name__))
        ((tokens, line_stream), next_stream) = s_next(stream)
        if self.id_ in tokens:
            if self.content is None:
                # No content matcher: measure the remaining line, then read
                # exactly that much so the result holds all of it.
                (line, _) = s_line(line_stream, True)
                (line, _) = s_next(line_stream, count=len(line))
                yield ([line], next_stream)
            else:
                generator = self.content._match(line_stream)
                # NOTE(review): the caller driving this generator
                # (presumably LEPL's trampoline) is expected to send back
                # each result of `generator` and to end this loop once the
                # content matcher is exhausted - confirm.
                while True:
                    (result, next_line_stream) = yield generator
                    # Only pass on results that consumed the whole token,
                    # unless partial matches are allowed.
                    if s_empty(next_line_stream) or not self.complete:
                        yield (result, next_stream)

    def __str__(self):
        '''
        Describe the token as "<id>: <regexp>".
        '''
        return fmt('{0}: {1!s}', self.id_, self.regexp)

    def __repr__(self):
        '''
        Wrap the str() form as "<Token(...)>".
        '''
        return fmt('<Token({0!s})>', self)

    # NOTE(review): the `def` line of this class method is missing from the
    # listing; the method name is reconstructed - confirm against upstream.
    @classmethod
    def reset_ids(cls):
        '''
        Reset the ID counter.  This should not be needed in normal use.
        '''
        # Assign through BaseToken, not cls: `__count` is name-mangled to
        # `_BaseToken__count` and __init__ reads it from BaseToken, so an
        # assignment via a subclass would only create a shadowing attribute
        # on that subclass and leave the real counter untouched.
        BaseToken.__count = 0
207
208
class Token(BaseToken):
    '''
    A token with a user-specified regexp.
    '''

    def __init__(self, regexp, content=None, id_=None, alphabet=None,
                 complete=True, compiled=False):
        '''
        Define a token that will be generated by the lexer.

        regexp is the regular expression that the lexer will use to generate
        appropriate tokens.

        content is the optional matcher that will be invoked on the value
        of the token.  It is usually set via (), which clones this instance
        so that the same token can be used more than once.

        id_ is an optional unique identifier; if left empty the base class
        generates one.

        alphabet is the alphabet associated with the regexp.  It should be
        set by the lexer rewriter, so that all instances share the same
        value (it appears in the constructor so that Tokens can be cloned).

        complete indicates whether any sub-matcher must completely exhaust
        the contents when matching.  It can be over-ridden for a particular
        sub-matcher via __call__().

        compiled should only be used internally.  It is a flag indicating
        that the Token has been processed by the rewriter (see below).

        A Token must be "compiled" --- this completes the configuration
        using a given alphabet and is done by the lexer_rewriter.  Care is
        taken to allow a Token to be cloned before or after compilation.
        '''
        super(Token, self).__init__(content=content, id_=id_,
                                    alphabet=alphabet, complete=complete,
                                    compiled=compiled)
        # The regexp is the one piece of state the base class leaves to its
        # subclasses; here it comes from the constructor.
        self._karg(regexp=regexp)
247
# NOTE(review): the `class` line and both `def` lines of this definition are
# missing from the listing it was recovered from.  The class name, base class
# and the __call__ signature below are reconstructed - confirm against
# upstream.
class EmptyToken(Token):
    '''
    A token that cannot be specialised, and that returns nothing.
    '''

    def __call__(self, *args, **kargs):
        '''
        Refuse specialisation: calling this token always raises TypeError.
        '''
        raise TypeError('Empty token')

    @tagged
    def _match(self, stream):
        '''
        On matching we assert that the token type is correct and then yield
        an empty result; the token's text is discarded.

        Raises LexerError if the token has not been compiled.
        '''
        if not self.compiled:
            raise LexerError(
                fmt('A {0} token has not been compiled. '
                    'You must use the lexer rewriter with Tokens. '
                    'This can be done by using matcher.config.lexer().',
                    self.__class__.__name__))
        # The per-token line stream is not needed, as nothing is returned.
        ((tokens, _), next_stream) = s_next(stream)
        if self.id_ in tokens:
            yield ([], next_stream)
272
# NOTE(review): the `def` line of this function is missing from the listing
# it was recovered from.  The name is taken from the docstring example and
# the *tokens parameter from its use in pass_args below - confirm against
# upstream.
def RestrictTokensBy(*tokens):
    '''
    A matcher factory that generates a new matcher that will transform the
    stream passed to its arguments so that they do not see the given tokens.

    So, for example:
      MyFactory = RestrictTokensBy(A(), B())
      RestrictedC = MyFactory(C())
    will create a matcher, RestrictedC, that is like C, but which will not
    see the tokens matched by A and B.

    In other words, this filters tokens from the input.
    '''

    @trampoline_matcher_factory()
    def factory(matcher, *tokens):
        '''
        The factory that will be returned, with the tokens supplied above.
        '''
        def match(support, in_stream):
            '''
            The final matcher - delegates to `matcher` with a restricted
            stream of tokens.
            '''
            ids = [token.id_ for token in tokens]
            (state, helper) = in_stream
            # Swap in a helper that filters the given token ids, keeping
            # the stream state unchanged.
            filtered = (state, FilteredTokenHelper(helper, *ids))
            generator = matcher._match(filtered)
            # NOTE(review): the caller driving this generator (presumably
            # LEPL's trampoline) is expected to send back each result of
            # `generator` and to end this loop when it is exhausted -
            # confirm.
            while True:
                (result, (state, _)) = yield generator
                support._debug(fmt('Result {0}', result))
                # Hand back the new state with the original, unfiltered
                # helper so later matchers see every token again.
                yield (result, (state, helper))
        return match

    def pass_args(matcher):
        '''
        Dirty trick to pass tokens in to factory.
        '''
        return factory(matcher, *tokens)

    return pass_args
316