
Source Code for Module lepl.lexer.matchers

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Generate and match a stream of tokens that are identified by regular  
 32  expressions. 
 33  ''' 
 34   
 35  # pylint currently cannot parse this file 
 36   
 37  from abc import ABCMeta 
 38   
 39  from lepl.stream.core import s_empty, s_line, s_next, s_len 
 40  from lepl.lexer.support import LexerError 
 41  from lepl.lexer.operators import TOKENS, TokenNamespace 
 42  from lepl.lexer.stream import FilteredTokenHelper 
 43  from lepl.matchers.core import OperatorMatcher, Any, Literal, Lookahead, Regexp 
 44  from lepl.matchers.matcher import Matcher, add_children 
 45  from lepl.matchers.memo import NoMemo 
 46  from lepl.matchers.support import coerce_, trampoline_matcher_factory 
 47  from lepl.core.parser import tagged 
 48  from lepl.regexp.matchers import BaseRegexp 
 49  from lepl.regexp.rewriters import CompileRegexp 
 50  from lepl.regexp.unicode import UnicodeAlphabet 
 51  from lepl.support.lib import fmt, str 
 52   
 53   
 54  # pylint: disable-msg=W0105 
 55  # epydoc convention 
 56   
 57  # pylint: disable-msg=C0103 
 58  # it's a class 
 59  NonToken = ABCMeta('NonToken', (object, ), {}) 
 60  ''' 
 61  ABC used to identify matchers that actually consume from the stream.  These 
 62  are the "leaf" matchers that "do the real work" and they cannot be used at 
 63  the same level as Tokens, but must be embedded inside them. 
 64   
 65  This is a purely informative interface used, for example, to generate warnings 
 66  for the user.  Not implementing this interface will not block any  
 67  functionality. 
 68  ''' 
 69   
 70  add_children(NonToken, Lookahead, Any, Literal, Regexp) 
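
NonToken is used only for identification: assuming add_children performs the usual ABC registration (as its use with ABCMeta here suggests), an ordinary isinstance check is all that is needed to spot a "leaf" matcher. A minimal sketch of that check (not part of the module source; it uses only names imported above):

    # informational only: NonToken membership lets rewriters warn when a leaf
    # matcher is used at the same level as Tokens; it changes no behaviour
    assert isinstance(Literal('abc'), NonToken)
    assert isinstance(Any(), NonToken)
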
 71  # don't register Empty() here because it's useful as a token(!)
 72  
 73  
 74  # pylint: disable-msg=R0901, R0904, R0913, W0201, W0142, E1101
 75  # lepl standards
 76  class BaseToken(OperatorMatcher, NoMemo):
 77      '''
 78      Introduce a token that will be recognised by the lexer.  A Token instance
 79      can be specialised to match particular contents by calling as a function.
 80      
 81      This is a base class that provides all the functionality, but doesn't
 82      set the regexp attribute.  This allows subclasses to provide a fixed
 83      value, while `Token` uses the constructor.
 84      '''
 85      
 86      __count = 0
 87      
 88      def __init__(self, content=None, id_=None, alphabet=None,
 89                   complete=True, compiled=False):
 90          '''
 91          Define a token that will be generated by the lexer.
 92          
 93          content is the optional matcher that will be invoked on the value
 94          of the token.  It is usually set via (), which clones this instance
 95          so that the same token can be used more than once.
 96          
 97          id_ is an optional unique identifier that will be given an integer
 98          value if left empty.
 99          
100          alphabet is the alphabet associated with the regexp.  It should be
101          set by the lexer rewriter, so that all instances share the same
102          value (it appears in the constructor so that Tokens can be cloned).
103          
104          complete indicates whether any sub-matcher must completely exhaust
105          the contents when matching.  It can be over-ridden for a particular
106          sub-matcher via __call__().
107          
108          compiled should only be used internally.  It is a flag indicating
109          that the Token has been processed by the rewriter (see below).
110          
111          A Token must be "compiled" --- this completes the configuration
112          using a given alphabet and is done by the lexer_rewriter.  Care is
113          taken to allow a Token to be cloned before or after compilation.
114          '''
115          super(BaseToken, self).__init__(name=TOKENS, namespace=TokenNamespace)
116          self._karg(content=content)
117          if id_ is None:
118              id_ = 'Tk' + str(BaseToken.__count)
119              BaseToken.__count += 1
120          self._karg(id_=id_)
121          self._karg(alphabet=alphabet)
122          self._karg(complete=complete)
123          self._karg(compiled=compiled)
124
125      def compile(self, alphabet=None):
126          '''
127          Convert the regexp if necessary.
128          '''
129          if alphabet is None:
130              alphabet = UnicodeAlphabet.instance()
131          # pylint: disable-msg=E0203
132          # set in constructor via _kargs
133          if self.alphabet is None:
134              self.alphabet = alphabet
135          self.regexp = self.__to_regexp(self.regexp, self.alphabet)
136          self.compiled = True
137      
138      @staticmethod
139      def __to_regexp(regexp, alphabet):
140          '''
141          The regexp may be a matcher; if so we try to convert it to a regular
142          expression and extract the equivalent text.
143          '''
144          if isinstance(regexp, Matcher):
145              rewriter = CompileRegexp(alphabet)
146              rewrite = rewriter(regexp)
147              # one transformation is empty_adapter
148              if isinstance(rewrite, BaseRegexp) and \
149                      len(rewrite.wrapper.functions) <= 1:
150                  regexp = str(rewrite.regexp)
151              else:
152                  raise LexerError(
153                      fmt('A Token was specified with a matcher, '
154                          'but the matcher could not be converted to '
155                          'a regular expression: {0}', rewrite))
156          return regexp
157
158      def __call__(self, content, complete=None):
159          '''
160          If complete is specified as True or False it overrides the value
161          set in the constructor.  If True, the content matcher must completely
162          match the Token contents.
163          '''
164          args, kargs = self._constructor_args()
165          kargs['complete'] = self.complete if complete is None else complete
166          kargs['content'] = coerce_(content)
167          return type(self)(*args, **kargs)
168      
169      @tagged
170      def _match(self, stream):
171          '''
172          On matching we first assert that the token type is correct and then
173          delegate to the content.
174          '''
175          if not self.compiled:
176              raise LexerError(
177                  fmt('A {0} token has not been compiled. '
178                      'You must use the lexer rewriter with Tokens. '
179                      'This can be done by using matcher.config.lexer().',
180                      self.__class__.__name__))
181          ((tokens, line_stream), next_stream) = s_next(stream)
182          if self.id_ in tokens:
183              if self.content is None:
184                  # result contains all data (use s_next not s_line to set max)
185                  (line, _) = s_line(line_stream, True)
186                  (line, _) = s_next(line_stream, count=len(line))
187                  yield ([line], next_stream)
188              else:
189                  generator = self.content._match(line_stream)
190                  while True:
191                      (result, next_line_stream) = yield generator
192                      if s_empty(next_line_stream) or not self.complete:
193                          yield (result, next_stream)
194
195      def __str__(self):
196          return fmt('{0}: {1!s}', self.id_, self.regexp)
197      
198      def __repr__(self):
199          return fmt('<Token({0!s})>', self)
200      
201      @classmethod
202      def reset_ids(cls):
203          '''
204          Reset the ID counter.  This should not be needed in normal use.
205          '''
206          cls.__count = 0
207
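
As the error message in _match above makes explicit, a Token only matches once the lexer rewriter has compiled it. A hedged sketch of what that looks like from user code (illustrative names; it assumes the top-level lepl package exports Token and that the active configuration discards whitespace between tokens):

    from lepl import Token

    number = Token('[0-9]+')
    matcher = number[:]
    matcher.config.lexer()             # explicit; the default configuration already includes this
    print(matcher.parse('1 22 333'))   # expected: ['1', '22', '333']
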
208  
209  class Token(BaseToken):
210      '''
211      A token with a user-specified regexp.
212      '''
213      
214      def __init__(self, regexp, content=None, id_=None, alphabet=None,
215                   complete=True, compiled=False):
216          '''
217          Define a token that will be generated by the lexer.
218          
219          regexp is the regular expression that the lexer will use to generate
220          appropriate tokens.
221          
222          content is the optional matcher that will be invoked on the value
223          of the token.  It is usually set via (), which clones this instance
224          so that the same token can be used more than once.
225          
226          id_ is an optional unique identifier that will be given an integer
227          value if left empty.
228          
229          alphabet is the alphabet associated with the regexp.  It should be
230          set by the lexer rewriter, so that all instances share the same
231          value (it appears in the constructor so that Tokens can be cloned).
232          
233          complete indicates whether any sub-matcher must completely exhaust
234          the contents when matching.  It can be over-ridden for a particular
235          sub-matcher via __call__().
236          
237          compiled should only be used internally.  It is a flag indicating
238          that the Token has been processed by the rewriter (see below).
239          
240          A Token must be "compiled" --- this completes the configuration
241          using a given alphabet and is done by the lexer_rewriter.  Care is taken
242          to allow a Token to be cloned before or after compilation.
243          '''
244          super(Token, self).__init__(content=content, id_=id_, alphabet=alphabet,
245                                      complete=complete, compiled=compiled)
246          self._karg(regexp=regexp)
247
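
A short sketch of the specialisation behaviour described in __call__ and the constructor docstring (illustrative names; it assumes the usual top-level imports):

    from lepl import Token, Literal

    word   = Token('[a-z]+')               # token recognised by the lexer
    if_kw  = word(Literal('if'))           # content must match the entire token text
    if_pre = word('if', complete=False)    # string content is coerced to Literal; prefix match allowed
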
248  
249  class EmptyToken(Token):
250      '''
251      A token that cannot be specialised, and that returns nothing.
252      '''
253      
254      def __call__(self, *args, **kargs):
255          raise TypeError('Empty token')
256      
257      @tagged
258      def _match(self, stream):
259          '''
260          On matching we check that the token type is correct and return an
261          empty result (there is no content to delegate to).
262          '''
263          if not self.compiled:
264              raise LexerError(
265                  fmt('A {0} token has not been compiled. '
266                      'You must use the lexer rewriter with Tokens. '
267                      'This can be done by using matcher.config.lexer().',
268                      self.__class__.__name__))
269          ((tokens, _), next_stream) = s_next(stream)
270          if self.id_ in tokens:
271              yield ([], next_stream)
272
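
EmptyToken is intended for internal use; the sketch below (hypothetical regexp and id) only illustrates the contract above: it cannot be specialised and, on a match, yields an empty result list.

    newline = EmptyToken('(?:\r?\n)', id_='nl')   # hypothetical values
    try:
        newline('x')                              # specialisation is forbidden
    except TypeError as error:
        print(error)                              # "Empty token"
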
273  
274  
275  def RestrictTokensBy(*tokens):
276      '''
277      A matcher factory that generates a new matcher; the stream seen by the
278      matcher it wraps is filtered so that it does not see the given tokens.
279      
280      So, for example:
281          MyFactory = RestrictTokensBy(A(), B())
282          RestrictedC = MyFactory(C())
283      will create a matcher, RestrictedC, that is like C, but which will not
284      see the tokens matched by A and B.
285      
286      In other words, this filters tokens from the input.
287      '''
288      
289      @trampoline_matcher_factory()
290      def factory(matcher, *tokens):
291          '''
292          The factory that will be returned, with the tokens supplied above.
293          '''
294          def match(support, in_stream):
295              '''
296              The final matcher - delegates to `matcher` with a restricted
297              stream of tokens.
298              '''
299              ids = [token.id_ for token in tokens]
300              (state, helper) = in_stream
301              filtered = (state, FilteredTokenHelper(helper, *ids))
302              generator = matcher._match(filtered)
303              while True:
304                  (result, (state, _)) = yield generator
305                  support._debug(fmt('Result {0}', result))
306                  yield (result, (state, helper))
307          return match
308      
309      def pass_args(matcher):
310          '''
311          Dirty trick to pass tokens in to factory.
312          '''
313          return factory(matcher, *tokens)
314      
315      return pass_args
316  
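
A hedged sketch of the factory in use, following the docstring example above (A, B and C stand for Token-level matchers defined elsewhere in a real grammar):

    MyFactory   = RestrictTokensBy(A(), B())   # tokens to hide from the wrapped matcher
    RestrictedC = MyFactory(C())               # behaves like C, but never sees A or B tokens
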