| Home | Trees | Indices | Help |
|---|
|
|
# The contents of this file are subject to the Mozilla Public License
# (MPL) Version 1.1 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License
# at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and
# limitations under the License.
#
# The Original Code is LEPL (http://www.acooke.org/lepl)
# The Initial Developer of the Original Code is Andrew Cooke.
# Portions created by the Initial Developer are Copyright (C) 2009-2010
# Andrew Cooke (andrew@acooke.org). All Rights Reserved.
#
# Alternatively, the contents of this file may be used under the terms
# of the LGPL license (the GNU Lesser General Public License,
# http://www.gnu.org/licenses/lgpl.html), in which case the provisions
# of the LGPL License are applicable instead of those above.
#
# If you wish to allow use of your version of this file only under the
# terms of the LGPL License and not to allow others to use your version
# of this file under the MPL, indicate your decision by deleting the
# provisions above and replace them with the notice and other provisions
# required by the LGPL License.  If you do not delete the provisions
# above, a recipient may use your version of this file under either the
# MPL or the LGPL License.

'''
A regexp implementation for unicode strings.
'''

from sys import maxunicode

from lepl.regexp.str import StrAlphabet, ILLEGAL
from lepl.support.lib import chr, lmap, fmt

_WHITESPACE = ('\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680'
               '\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007'
               '\u2008\u2009\u200a\u2028\u2029\u202F\u205F\u3000')
'''http://en.wikipedia.org/wiki/Whitespace_character'''


# NOTE(review): the ``class`` and ``def`` header lines were absent from the
# extracted listing this file was recovered from.  They are reconstructed
# from the names used in the bodies (``super(UnicodeAlphabet, self)``,
# ``char``, ``cls``) and from the otherwise unused ``StrAlphabet`` import --
# confirm against the upstream source.
class UnicodeAlphabet(StrAlphabet):
    '''
    An alphabet for unicode strings.
    '''

    __cached_instance = None

    # pylint: disable-msg=E1002
    # (pylint bug?  this chains back to a new style abc)
    def __init__(self):
        '''
        Configure the parent alphabet to span ``chr(0)`` to
        ``chr(maxunicode)`` and supply two extra matchers: ``escaped``
        (any ``ILLEGAL`` character, or a hex escape introduced by ``x``,
        ``u`` or ``U``) and ``range`` (the ``s`` / ``S`` whitespace
        classes).
        '''
        # Imported here rather than at module level, as in the original
        # (presumably to avoid circular imports -- TODO confirm).
        from lepl.matchers.core import Any
        from lepl.matchers.combine import Or

        max_ = chr(maxunicode)

        def mkhex(char, n):
            '''
            Matcher for the marker `char` followed by hex digits, giving
            the character with that code point.  The marker itself is
            dropped from the result.
            '''
            from lepl.matchers.derived import Drop
            # `[n, ...]` is LEPL repetition syntax; presumably `n` digits
            # joined into a single string -- TODO confirm.
            return Drop(Any(char)) + Any('0123456789abcdefABCDEF')[n, ...] >> \
                        (lambda x: chr(int(x, 16)))

        def mkchr(char, chars, invert=False):
            '''
            Matcher that maps the literal `char` to a `Character` covering
            each character in `chars` (or the inverse of that set when
            `invert` is true).
            '''
            from lepl.matchers.core import Literal
            from lepl.matchers.derived import Map
            from lepl.regexp.core import Character
            # one single-point interval per listed character
            intervals = lmap(lambda x: (x, x), chars)
            if invert:
                # this delays call to invert until after creation of self
                func = lambda _: Character(self.invert(intervals), self)
            else:
                func = lambda _: Character(intervals, self)
            return Map(Literal(char), func)

        # Local name avoids shadowing the builtin; the keyword passed to
        # the parent constructor is still `range`.
        range_matcher = Or(mkchr('s', _WHITESPACE),
                           mkchr('S', _WHITESPACE, invert=True))
        escaped = Any(ILLEGAL) | mkhex('x', 2) | mkhex('u', 4) | mkhex('U', 8)
        super(UnicodeAlphabet, self).__init__(chr(0), max_, escaped=escaped,
                                              range=range_matcher)

    def before(self, char):
        '''
        Must return the character before char in the alphabet.  Never called
        with min (assuming input data are in range).
        '''
        return chr(ord(char)-1)

    def after(self, char):
        '''
        Must return the character after char in the alphabet.  Never called
        with max (assuming input data are in range).
        '''
        return chr(ord(char)+1)

    @classmethod
    def instance(cls):
        '''
        Get an instance of this alphabet (avoids creating new objects).
        '''
        # Created lazily on first request and reused afterwards.
        if cls.__cached_instance is None:
            cls.__cached_instance = UnicodeAlphabet()
        return cls.__cached_instance
| Home | Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Sat Jun 9 21:51:00 2012 | http://epydoc.sourceforge.net |