Package lepl :: Package regexp :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module lepl.regexp.unicode

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  A regexp implementation for unicode strings. 
 32  ''' 
 33   
 34  from sys import maxunicode 
 35   
 36  from lepl.regexp.str import StrAlphabet, ILLEGAL 
 37  from lepl.support.lib import chr, lmap, fmt 
 38   
 39  _WHITESPACE = '\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680' \ 
 40                  '\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007' \ 
 41                  '\u2008\u2009\u200a\u2028\u2029\u202F\u205F\u3000' 
 42  '''http://en.wikipedia.org/wiki/Whitespace_character''' 
class UnicodeAlphabet(StrAlphabet):
    '''
    The alphabet used when matching unicode strings.

    The alphabet runs from ``chr(0)`` to ``chr(maxunicode)``.  Use
    `instance()` to obtain the shared, lazily created object.
    '''

    __cached_instance = None

    # pylint: disable-msg=E1002
    # (pylint bug?  this chains back to a new style abc)
    def __init__(self):
        # These imports are kept local to the constructor, as in the
        # original code (presumably to avoid import cycles - the reason is
        # not visible from this file).
        from lepl.matchers.core import Any, Literal
        from lepl.matchers.combine import Or
        from lepl.matchers.derived import Drop, Map
        from lepl.regexp.core import Character

        highest = chr(maxunicode)

        def mkhex(char, n):
            '''
            Build a matcher for the marker `char` followed by hex digits
            (the count is controlled by `n`); the marker is dropped and
            the digits are converted to the character with that code.
            '''
            def from_hex(digits):
                return chr(int(digits, 16))

            marker = Drop(Any(char))
            digits = Any('0123456789abcdefABCDEF')[n, ...]
            return (marker + digits) >> from_hex

        def mkchr(char, chars, invert=False):
            '''
            Build a matcher for the literal `char` whose result is replaced
            by a Character made from one single-point interval per entry
            in `chars` (or from the inverse of those intervals).
            '''
            intervals = lmap(lambda c: (c, c), chars)
            if invert:
                # self.invert is only called when the matcher's result is
                # mapped, i.e. after this alphabet has been fully created.
                def build(_):
                    return Character(self.invert(intervals), self)
            else:
                def build(_):
                    return Character(intervals, self)
            return Map(Literal(char), build)

        whitespace = mkchr('s', _WHITESPACE)
        not_whitespace = mkchr('S', _WHITESPACE, invert=True)
        char_classes = Or(whitespace, not_whitespace)
        escaped = Any(ILLEGAL) | mkhex('x', 2) | mkhex('u', 4) | mkhex('U', 8)
        super(UnicodeAlphabet, self).__init__(
            chr(0), highest, escaped=escaped, range=char_classes)

    def before(self, char):
        '''
        Return the character that immediately precedes `char` in the
        alphabet.  Never called with the minimum character (assuming the
        input data are in range).
        '''
        code_point = ord(char)
        return chr(code_point - 1)

    def after(self, char):
        '''
        Return the character that immediately follows `char` in the
        alphabet.  Never called with the maximum character (assuming the
        input data are in range).
        '''
        code_point = ord(char)
        return chr(code_point + 1)

    @classmethod
    def instance(cls):
        '''
        Return the shared alphabet, creating it on first use (this avoids
        creating new objects on every request).
        '''
        shared = cls.__cached_instance
        if shared is None:
            shared = UnicodeAlphabet()
            cls.__cached_instance = shared
        return shared

    def __repr__(self):
        return '<Unicode>'