Package lepl :: Package lexer :: Module rewriters
[hide private]
[frames] | [no frames]

Source Code for Module lepl.lexer.rewriters

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Rewrite a matcher graph to include lexing. 
 32  ''' 
 33   
 34  from collections import deque 
 35   
 36  from lepl.core.rewriters import Rewriter 
 37  from lepl.lexer.lexer import Lexer 
 38  from lepl.lexer.support import LexerError 
 39  from lepl.lexer.matchers import BaseToken, NonToken 
 40  from lepl.matchers.matcher import Matcher, is_child 
 41  from lepl.regexp.unicode import UnicodeAlphabet 
 42  from lepl.support.lib import fmt 
 43   
 44   
def find_tokens(matcher):
    '''
    Collect and return the set of Tokens reachable from `matcher`.

    Performs a breadth-first walk of the matcher graph.  Every Token found
    has its contents checked (via `assert_not_token`) to guarantee that
    Tokens are never nested.  If the graph mixes top-level Tokens with
    non-Token matchers a `LexerError` is raised.

    Should we also check that a Token occurs somewhere on every path to a
    leaf node?
    '''
    tokens = set()
    non_tokens = set()
    visited = set()
    pending = deque([matcher])
    while pending:
        node = pending.popleft()
        if node in visited:
            continue
        if is_child(node, NonToken):
            non_tokens.add(node)
        if isinstance(node, BaseToken):
            tokens.add(node)
            # a Token's contents must not themselves contain Tokens
            if node.content:
                assert_not_token(node.content, visited)
        else:
            # only descend into matcher children; other attributes are data
            pending.extend(child for child in node
                           if isinstance(child, Matcher))
        visited.add(node)
    if tokens and non_tokens:
        raise LexerError(
            fmt('The grammar contains a mix of Tokens and non-Token '
                'matchers at the top level. If Tokens are used then '
                'non-token matchers that consume input must only '
                'appear "inside" Tokens. The non-Token matchers '
                'include: {0}.',
                '; '.join(str(n) for n in non_tokens)))
    return tokens
78 79
def assert_not_token(node, visited):
    '''
    Assert that neither this nor any child node is a Token.

    Recurses through the matcher graph rooted at `node`, using `visited`
    to avoid revisiting shared sub-graphs; raises `LexerError` on the
    first Token encountered.
    '''
    # ignore non-matchers and anything already checked
    if not isinstance(node, Matcher) or node in visited:
        return
    visited.add(node)
    if isinstance(node, BaseToken):
        raise LexerError(fmt('Nested token: {0}', node))
    for child in node:
        assert_not_token(child, visited)
91 92
class AddLexer(Rewriter):
    '''
    This is required when using Tokens.  It does the following:
    - Find all tokens in the matcher graph
    - Construct a lexer from the tokens
    - Connect the lexer to the matcher
    - Check that all children have a token parent
      (and optionally add a default token)
    Although possibly not in that order.

    alphabet is the alphabet for which the regular expressions are defined.

    discard is a regular expression that is used to match space (typically)
    if no token can be matched (and which is then discarded)
    '''

    def __init__(self, alphabet=None, discard=None, lexer=None):
        # fall back to Unicode when no alphabet is supplied
        alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
        # use '' to have no discard at all
        discard = '[ \t\r\n]+' if discard is None else discard
        super(AddLexer, self).__init__(Rewriter.LEXER,
            name=fmt('Lexer({0}, {1}, {2})', alphabet, discard, lexer))
        self.alphabet = alphabet
        self.discard = discard
        # default to the standard Lexer class unless a factory was given
        self.lexer = lexer if lexer else Lexer

    def __call__(self, graph):
        '''
        Rewrite `graph` by wrapping it in a lexer built from its tokens;
        return the graph unchanged if it contains no tokens.
        '''
        tokens = find_tokens(graph)
        if not tokens:
            self._info('Lexer rewriter used, but no tokens found.')
            return graph
        self._debug(fmt('Found {0}', [token.id_ for token in tokens]))
        return self.lexer(graph, tokens, self.alphabet, self.discard)
129