Package lepl :: Package regexp :: Module str
[hide private]
[frames] | no frames]

Source Code for Module lepl.regexp.str

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Some intermediate classes that support parsers for objects that can be 
 32  converted to strings using str(). 
 33  ''' 
 34   
 35  from lepl.matchers.support import coerce_ 
 36  from lepl.regexp.core import Alphabet, Character, Sequence, Choice, Repeat, \ 
 37      Option, _Choice, _Character 
 38  from lepl.support.lib import fmt, str, LogMixin 
 39   
 40   
# Default set of special characters.  StrAlphabet uses this string as the
# default for both its `escaped` and `illegal` arguments.
ILLEGAL = '{}[]*()-?.+\\^$|'
'''
Characters that must be escaped.
'''
class StrParser(LogMixin):
    r'''
    Construct a parser for string based expressions.

    We need a clear policy on backslashes.  To be as backwards compatible as
    possible I am going with:

      0. "Escaping" means prefixing with \.

      1. These characters are special: {, }, [, ], -, \, (, ), *, ?, ., +,
         ^, $, |.

      2. Special characters (ie literal, or unescaped special characters) may
         not have a meaning currently, or may only have a meaning in certain
         contexts.

      3. To use a special character literally, it must be escaped.

      4. If a special character is used without an escape, in a context
         where it doesn't have a meaning, then it is an error.

      5. If a non-special character is escaped, that is also an error.

    This is not the same as the Python convention, but I believe it makes
    automatic escaping of given text easier.
    '''
    # The docstring above is a raw string because it contains literal
    # backslashes; in a normal string '\.' and '\,' are invalid escape
    # sequences and trigger a warning on recent Python versions.

    def __init__(self, alphabet):
        '''
        Store the alphabet that supplies the escape rules, the character
        conversions and the min/max bounds used by the methods below.
        '''
        super(StrParser, self).__init__()
        self.alphabet = alphabet

    def dup(self, x):
        '''
        Create an interval from a single character (both ends are the
        same converted character).
        '''
        return (self.alphabet.from_char(x), self.alphabet.from_char(x))

    def tup(self, x):
        '''
        Create an interval from a tuple (the first two entries of `x` are
        converted and used as the lower and upper ends).
        '''
        return (self.alphabet.from_char(x[0]), self.alphabet.from_char(x[1]))

    def dot(self, _):
        '''
        Create a "complete" interval (alphabet minimum to maximum); the
        argument is ignored.
        '''
        return (self.alphabet.min, self.alphabet.max)

    def invert(self, x):
        '''
        Invert an interval.
        '''
        # Character needed here to ensure intervals passed to invert are ordered
        return self.alphabet.invert(Character(x, self.alphabet))

    def sequence(self, x):
        '''
        Create a sequence from the sub-expressions in `x`.
        '''
        return Sequence(self.alphabet, *x)

    def star(self, x):
        '''
        Repeat a sub-expression (wraps the contents of `x` in a Repeat).
        '''
        return Repeat(self.alphabet, *x)

    def plus(self, x):
        '''
        Repeat a sub-expression, requiring a leading occurrence: the result
        is the sub-expression followed by its `star` repetition.
        '''
        return self.sequence([self.sequence(x), self.star(x)])

    def option(self, x):
        '''
        Make a sub-expression optional.
        '''
        return Option(self.alphabet, *x)

    def choice(self, x):
        '''
        Construct a choice from a list of sub-expressions.
        '''
        return Choice(self.alphabet, *x)

    def char(self, x):
        '''
        Construct a character from an interval (pair).
        '''
        return Character(x, self.alphabet)

    def extend(self, x):
        '''
        Delegate a character extension to the alphabet.
        '''
        return self.alphabet.extension(x)

    def build(self):
        '''
        Construct the parser.

        Returns the `parse_string` method of the assembled grammar, after
        the grammar's configuration has been cleared.
        '''

        # Avoid dependency loops
        from lepl.matchers.derived import Drop, Eos, AnyBut, Upper
        from lepl.matchers.core import Lookahead, Literal, Delayed
        from lepl.matchers.error import make_error
        from lepl.matchers.variables import TraceVariables
        from lepl.support.node import node_throw

        # NOTE(review): the local names below (including `open` and `range`,
        # which shadow builtins inside this method only) are kept exactly as
        # they were in case TraceVariables is sensitive to them - confirm
        # before renaming.
        with TraceVariables(False):

            # these two definitions enforce the conditions above, providing only
            # special characters appear as literals in the grammar
            escaped = Drop(self.alphabet.escape) & self.alphabet.escaped
            raw = ~Lookahead(self.alphabet.escape) & \
                AnyBut(self.alphabet.illegal)
            close = Drop(')')
            extend = (Drop('(*') & Upper()[1:,...] & close) >> self.extend

            single = escaped | raw | extend

            any_ = Literal('.') >> self.dot
            letter = single >> self.dup
            pair = single & Drop('-') & single > self.tup

            interval = pair | letter
            brackets = Drop('[') & interval[1:] & Drop(']')
            inverted = Drop('[^') & interval[1:] & Drop(']') >= self.invert
            char = inverted | brackets | letter | any_ | extend > self.char

            # item is recursive (groups contain sequences that contain
            # items), so it is declared here and defined further down
            item = Delayed()

            open = Drop('(?:')
            range = Drop(self.alphabet.escape) & self.alphabet.range
            seq = (char | item | range)[0:] > self.sequence
            group = open & seq & close
            alts = open & seq[2:, Drop('|')] & close > self.choice
            star = (alts | group | char) & Drop('*') > self.star
            plus = (alts | group | char) & Drop('+') > self.plus
            opt = (alts | group | char) & Drop('?') > self.option
            # a plain '(' that is not the start of '(?:' is reported with
            # an explicit error message
            bad_grp = (Drop('(') & ~Lookahead('?:') & seq & close) \
                ** make_error(
                    "Lepl's own regular expressions do not currently "
                    "support matched groups.\n"
                    "Use '(?:...)' to group expressions without "
                    "matching.")

            item += alts | group | star | plus | opt | bad_grp

            expr = ((char | item)[:] & Drop(Eos())) >> node_throw

        # Empty config here avoids loops if the default config includes
        # references to alphabets
        expr.config.clear()
        return expr.parse_string
203
def make_str_parser(alphabet):
    '''
    Build and return the parser that a StrParser constructs for the
    given alphabet.
    '''
    builder = StrParser(alphabet)
    return builder.build()
210
class StrAlphabet(Alphabet):
    '''
    Support for alphabets.

    Stores the escaping policy (escape prefix, escapable characters, illegal
    characters and an optional range matcher), formats regexp components as
    text and parses text using the parser supplied by `parser_factory`.
    '''

    # pylint: disable-msg=E1002
    # (pylint bug?  this chains back to a new style abc)
    def __init__(self, min_, max_, escape='\\', escaped=ILLEGAL,
                 illegal=ILLEGAL, range=None,
                 parser_factory=make_str_parser):
        '''
        Record the escaping policy, then build the parser.

        The parser is created last because the factory is handed this
        instance after its attributes have been set.
        '''
        from lepl.matchers.core import Any, Never
        super(StrAlphabet, self).__init__(min_, max_)
        self.__escape = escape
        self.__escaped = coerce_(escaped, Any)
        self.__illegal = illegal
        # a missing (falsy) range is replaced by Never()
        if range:
            self.__range = range
        else:
            self.__range = Never()
        self._parser = parser_factory(self)

    @property
    def escape(self):
        '''The escape prefix given to the constructor.'''
        return self.__escape

    @property
    def escaped(self):
        '''The `escaped` constructor argument after coerce_(..., Any).'''
        return self.__escaped

    @property
    def illegal(self):
        '''The characters that may not appear unescaped.'''
        return self.__illegal

    @property
    def range(self):
        '''The range matcher, or Never() when none was supplied.'''
        return self.__range

    def _escape_char(self, char):
        '''
        Return the text of `char`, prefixed with the escape when an escape
        is defined and the text is among the illegal characters.
        '''
        text = str(char)
        if self.escape is None or text not in self.illegal:
            return text
        return self.escape + text

    def _no_parens(self, children):
        '''
        Returns True if no parens are needed around this when formatting:
        there is exactly one child and it is either a character or holds a
        single choice.
        '''
        if len(children) != 1:
            return False
        only = children[0]
        if isinstance(only, _Character):
            return True
        return len(only) == 1 and isinstance(only[0], _Choice)

    def fmt_intervals(self, intervals):
        '''
        Generate a string representation of a list of intervals.

        Unicode chars are hidden because of some strange error that occurs
        with Python2.6 on the command line.

        This is in StrAlphabet, but for ASCII makes no difference.  Having it
        here helps LineAwareAlphabet work (the whole idea of subclassing
        alphabets etc is not so great).
        '''
        def pretty(c):
            # escaped text and code points 32-127 are shown directly;
            # everything else becomes a numeric escape of suitable width
            shown = self._escape_char(c)
            if len(shown) > 1 or 32 <= ord(shown) <= 127:
                return str(shown)
            code = ord(c)
            if code < 0x100:
                template = '\\x{0:02x}'
            elif code < 0x10000:
                template = '\\u{0:04x}'
            else:
                template = '\\U{0:08x}'
            return fmt(template, code)

        # a lone interval may have a shorter form (single char or '.')
        if len(intervals) == 1:
            first = intervals[0]
            low, high = first[0], first[1]
            if low == high:
                return self._escape_char(low)
            if low == self.min and high == self.max:
                return '.'

        # pylint: disable-msg=C0103
        # (sorry.  but i use this (a, b) convention throughout the regexp lib)
        ranges = [pretty(a) if a == b
                  else fmt('{0!s}-{1!s}', pretty(a), pretty(b))
                  for (a, b) in intervals]
        return fmt('[{0}]', self.join(ranges))

    def fmt_sequence(self, children):
        '''
        Generate a string representation of a sequence.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        rendered = (str(child) for child in children)
        return self.join(rendered)

    def fmt_repeat(self, children):
        '''
        Generate a string representation of a repetition.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        body = self.fmt_sequence(children)
        if not self._no_parens(children):
            return fmt('(?:{0})*', body)
        return body + '*'

    def fmt_choice(self, children):
        '''
        Generate a string representation of a choice.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        alternatives = '|'.join(str(child) for child in children)
        return fmt('(?:{0})', alternatives)

    def fmt_option(self, children):
        '''
        Generate a string representation of an option.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        body = self.fmt_sequence(children)
        if not self._no_parens(children):
            return fmt('(?:{0})?', body)
        return body + '?'

    def fmt_label(self, label, child):
        '''
        Generate a string representation of labelled options.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        template = '(?P<{0}>{1})'
        return fmt(template, label, child)

    def join(self, chars):
        '''
        Join characters together.
        '''
        separator = ''
        return separator.join(chars)

    def from_char(self, char):
        '''
        This must convert a single character (here it is returned as-is).
        '''
        return char

    def parse(self, regexp):
        '''
        Generate a Sequence from the given text.
        '''
        parser = self._parser
        return parser(regexp)
367