
Source Code for Module lepl.lexer.lexer

# The contents of this file are subject to the Mozilla Public License
# (MPL) Version 1.1 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License
# at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and
# limitations under the License.
#
# The Original Code is LEPL (http://www.acooke.org/lepl)
# The Initial Developer of the Original Code is Andrew Cooke.
# Portions created by the Initial Developer are Copyright (C) 2009-2010
# Andrew Cooke (andrew@acooke.org). All Rights Reserved.
#
# Alternatively, the contents of this file may be used under the terms
# of the LGPL license (the GNU Lesser General Public License,
# http://www.gnu.org/licenses/lgpl.html), in which case the provisions
# of the LGPL License are applicable instead of those above.
#
# If you wish to allow use of your version of this file only under the
# terms of the LGPL License and not to allow others to use your version
# of this file under the MPL, indicate your decision by deleting the
# provisions above and replace them with the notice and other provisions
# required by the LGPL License.  If you do not delete the provisions
# above, a recipient may use your version of this file under either the
# MPL or the LGPL License.

from lepl.support.lib import fmt
from lepl.support.context import NamespaceMixin
from lepl.matchers.support import BaseMatcher
from lepl.lexer.operators import TOKENS, TokenNamespace
from lepl.core.parser import tagged
from lepl.stream.core import s_empty, s_debug, s_stream, s_fmt, s_factory, \
    s_max, s_new_max, s_id, s_global_kargs, s_delta, s_len, \
    s_cache_level
from lepl.lexer.support import RuntimeLexerError
from lepl.regexp.core import Compiler

# pylint can't detect _kargs etc
# pylint: disable-msg=E1101


class Lexer(NamespaceMixin, BaseMatcher):
    '''
    This takes a set of regular expressions and provides a matcher that
    converts a stream into a stream of tokens, passing the new stream to
    the embedded matcher.

    It is added to the matcher graph by the lexer_rewriter; it is not
    specified explicitly by the user.
    '''

    def __init__(self, matcher, tokens, alphabet, discard,
                 t_regexp=None, s_regexp=None):
        '''
        matcher is the head of the original matcher graph, which will be
        called with a tokenised stream.

        tokens is the set of `Token` instances that define the lexer.

        alphabet is the alphabet for which the regexps are defined.

        discard is the regular expression for spaces (which are silently
        dropped if no token can be matched).

        t_regexp and s_regexp are internally compiled state, used in cloning,
        and should not be provided by non-cloning callers.
        '''
        super(Lexer, self).__init__(TOKENS, TokenNamespace)
        if t_regexp is None:
            unique = {}
            for token in tokens:
                token.compile(alphabet)
                self._debug(fmt('Token: {0}', token))
                # this just reduces the work for the regexp compiler
                unique[token.id_] = token
            t_regexp = Compiler.multiple(alphabet,
                            [(t.id_, t.regexp)
                             for t in unique.values()
                             if t.regexp is not None]).dfa()
        if s_regexp is None and discard is not None:
            s_regexp = Compiler.single(alphabet, discard).dfa()
        self._arg(matcher=matcher)
        self._arg(tokens=tokens)
        self._arg(alphabet=alphabet)
        self._arg(discard=discard)
        self._karg(t_regexp=t_regexp)
        self._karg(s_regexp=s_regexp)

    def token_for_id(self, id_):
        '''
        A utility that checks the known tokens for a given ID. The ID is used
        internally, but is (by default) an unfriendly integer value. Note that
        a lexed stream associates a chunk of input with a list of IDs - more
        than one regexp may be a maximal match (and this is a feature, not a
        bug).
        '''
        for token in self.tokens:
            if token.id_ == id_:
                return token

    def _tokens(self, stream, max):
        '''
        Generate tokens, on demand.
        '''
        try:
            id_ = s_id(stream)
            while not s_empty(stream):
                # avoid conflicts between tokens
                id_ += 1
                try:
                    (terminals, match, next_stream) = \
                        self.t_regexp.match(stream)
                    self._debug(fmt('Token: {0!r} {1!r} {2!s}',
                                    terminals, match, s_debug(stream)))
                    yield (terminals, s_stream(stream, match, max=max, id_=id_))
                except TypeError:
                    (terminals, _size, next_stream) = \
                        self.s_regexp.size_match(stream)
                    self._debug(fmt('Space: {0!r} {1!s}',
                                    terminals, s_debug(stream)))
                stream = next_stream
        except TypeError:
            raise RuntimeLexerError(
                s_fmt(stream,
                      'No token for {rest} at {location} of {text}.'))

    @tagged
    def _match(self, in_stream):
        '''
        Implement matching - pass the token stream to the embedded matcher.
        '''
        (max, clean_stream) = s_new_max(in_stream)
        try:
            length = s_len(in_stream)
        except TypeError:
            length = None
        factory = s_factory(in_stream)
        token_stream = factory.to_token(
            self._tokens(clean_stream, max),
            id=s_id(in_stream), factory=factory,
            max=s_max(in_stream),
            global_kargs=s_global_kargs(in_stream),
            delta=s_delta(in_stream), len=length,
            cache_level=s_cache_level(in_stream)+1)
        in_stream = None
        generator = self.matcher._match(token_stream)
        while True:
            yield (yield generator)
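
As the class docstring notes, Lexer is inserted into the matcher graph by the
lexer_rewriter rather than constructed by hand; users only declare Token
instances. Below is a minimal sketch of that user-facing side, assuming the
usual LEPL Token API; the particular token definitions and grammar are
invented for illustration and are not part of this module.

    from lepl import Token, UnsignedFloat

    # Tokens are declared by the user; the default configuration's
    # lexer_rewriter builds a Lexer from them (plus a discard regexp
    # for whitespace) and feeds the token stream to the grammar below.
    value = Token(UnsignedFloat())
    symbol = Token('[^0-9a-zA-Z \t\r\n]')

    # symbol('+') specialises the symbol token to match only a '+'.
    add = value & symbol('+') & value
    print(add.parse('1 + 2'))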
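The heart of the class is _tokens: repeatedly try the compiled token DFA,
fall back to the discard ("space") regexp when no token matches, and raise
RuntimeLexerError when neither applies. The sketch below reproduces just that
control flow over a plain string using Python's re module; the helper and its
names are hypothetical, and it simplifies the real code, which works on LEPL
streams, uses a single multi-token DFA, and can report several token IDs for
one maximal match.

    import re

    def simple_tokens(text, token_res, discard=r'[ \t\r\n]+'):
        '''Yield (name, lexeme) pairs, preferring the longest token match,
        silently skipping text matched by the discard regexp, and failing
        when neither matches - the same loop shape as Lexer._tokens.'''
        pos = 0
        while pos < len(text):
            best = None
            for name, pattern in token_res:
                m = re.compile(pattern).match(text, pos)
                if m and (best is None or len(m.group()) > len(best[1])):
                    best = (name, m.group())
            if best:                       # a token matched: emit it
                yield best
                pos += len(best[1])
            else:
                m = re.compile(discard).match(text, pos)
                if m:                      # only "space" here: drop it
                    pos = m.end()
                else:                      # nothing matched: hard error
                    raise ValueError('No token at %d: %r' % (pos, text[pos:]))

    print(list(simple_tokens('1 + 2.5',
                             [('number', r'[0-9]+(\.[0-9]+)?'),
                              ('symbol', r'[-+*/]')])))
    # [('number', '1'), ('symbol', '+'), ('number', '2.5')]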