| Home | Trees | Indices | Help |
|---|
|
|
1
2 # The contents of this file are subject to the Mozilla Public License
3 # (MPL) Version 1.1 (the "License"); you may not use this file except
4 # in compliance with the License. You may obtain a copy of the License
5 # at http://www.mozilla.org/MPL/
6 #
7 # Software distributed under the License is distributed on an "AS IS"
8 # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
9 # the License for the specific language governing rights and
10 # limitations under the License.
11 #
12 # The Original Code is LEPL (http://www.acooke.org/lepl)
13 # The Initial Developer of the Original Code is Andrew Cooke.
14 # Portions created by the Initial Developer are Copyright (C) 2009-2010
15 # Andrew Cooke (andrew@acooke.org). All Rights Reserved.
16 #
17 # Alternatively, the contents of this file may be used under the terms
18 # of the LGPL license (the GNU Lesser General Public License,
19 # http://www.gnu.org/licenses/lgpl.html), in which case the provisions
20 # of the LGPL License are applicable instead of those above.
21 #
22 # If you wish to allow use of your version of this file only under the
23 # terms of the LGPL License and not to allow others to use your version
24 # of this file under the MPL, indicate your decision by deleting the
25 # provisions above and replace them with the notice and other provisions
26 # required by the LGPL License. If you do not delete the provisions
27 # above, a recipient may use your version of this file under either the
28 # MPL or the LGPL License.
29
30 '''
31 Some intermediate classes that support parsers for objects that can be
32 converted to strings using str().
33 '''
34
35 from lepl.matchers.support import coerce_
36 from lepl.regexp.core import Alphabet, Character, Sequence, Choice, Repeat, \
37 Option, _Choice, _Character
38 from lepl.support.lib import fmt, str, LogMixin
39
40
41 ILLEGAL = '{}[]*()-?.+\\^$|'
42 '''
43 Characters that must be escaped.
44 '''
48 '''
49 Construct a parser for string based expressions.
50
51 We need a clear policy on backslashes. To be as backwards compatible as
52 possible I am going with:
53
54 0. "Escaping" means prefixing with \.
55
56 1. These characters are special: {, }, [, ], -, \, (, ), *, ?, ., +,
57 ^, $, |.
58
59 2. Special characters (i.e. literal, or unescaped, special characters) may
60 not have a meaning currently, or may only have a meaning in certain
61 contexts.
62
63 3. To use a special character literally, it must be escaped.
64
65 4. If a special character is used without an escape, in a context
66 where it doesn't have a meaning, then it is an error.
67
68 5. If a non-special character is escaped, that is also an error.
69
70 This is not the same as the Python convention, but I believe it makes
71 automatic escaping of given text easier.
72 '''
73
77
79 '''
80 Create an interval from a single character.
81 '''
82 return (self.alphabet.from_char(x), self.alphabet.from_char(x))
83
85 '''
86 Create an interval from a tuple.
87 '''
88 return (self.alphabet.from_char(x[0]), self.alphabet.from_char(x[1]))
89
95
97 '''
98 Invert an interval.
99 '''
100 # Character needed here to ensure intervals passed to invert are ordered
101 return self.alphabet.invert(Character(x, self.alphabet))
102
108
114
116 '''
117 Repeat a sub-expression one or more times (the sub-expression followed by its star).
118 '''
119 return self.sequence([self.sequence(x), self.star(x)])
120
126
128 '''
129 Construct a choice from a list of sub-expressions.
130 '''
131 return Choice(self.alphabet, *x)
132
134 '''
135 Construct a character from an interval (pair).
136 '''
137 return Character(x, self.alphabet)
138
140 '''
141 Delegate a character extension to the alphabet.
142 '''
143 return self.alphabet.extension(x)
144
146 '''
147 Construct the parser.
148 '''
149
150 # Avoid dependency loops
151 from lepl.matchers.derived import Drop, Eos, AnyBut, Upper
152 from lepl.matchers.core import Any, Lookahead, Literal, Delayed
153 from lepl.matchers.error import make_error
154 from lepl.matchers.variables import TraceVariables
155 from lepl.support.node import node_throw
156
157 with TraceVariables(False):
158
159 # these two definitions enforce the conditions above, providing only
160 # special characters appear as literals in the grammar
161 escaped = Drop(self.alphabet.escape) & self.alphabet.escaped
162 raw = ~Lookahead(self.alphabet.escape) & \
163 AnyBut(self.alphabet.illegal)
164 close = Drop(')')
165 extend = (Drop('(*') & Upper()[1:,...] & close) >> self.extend
166
167 single = escaped | raw | extend
168
169 any_ = Literal('.') >> self.dot
170 letter = single >> self.dup
171 pair = single & Drop('-') & single > self.tup
172
173 interval = pair | letter
174 brackets = Drop('[') & interval[1:] & Drop(']')
175 inverted = Drop('[^') & interval[1:] & Drop(']') >= self.invert
176 char = inverted | brackets | letter | any_ | extend > self.char
177
178 item = Delayed()
179
180 open = Drop('(?:')
181 range = Drop(self.alphabet.escape) & self.alphabet.range
182 seq = (char | item | range)[0:] > self.sequence
183 group = open & seq & close
184 alts = open & seq[2:, Drop('|')] & close > self.choice
185 star = (alts | group | char) & Drop('*') > self.star
186 plus = (alts | group | char) & Drop('+') > self.plus
187 opt = (alts | group | char) & Drop('?') > self.option
188 bad_grp = (Drop('(') & ~Lookahead('?:') & seq & close) \
189 ** make_error(
190 "Lepl's own regular expressions do not currently "
191 "support matched groups.\n"
192 "Use '(?:...)' to group expressions without "
193 "matching.")
194
195 item += alts | group | star | plus | opt | bad_grp
196
197 expr = ((char | item)[:] & Drop(Eos())) >> node_throw
198
199 # Empty config here avoids loops if the default config includes
200 # references to alphabets
201 expr.config.clear()
202 return expr.parse_string
203
210
213 '''
214 Support for alphabets.
215 '''
216
217 # pylint: disable-msg=E1002
218 # (pylint bug? this chains back to a new style abc)
219 - def __init__(self, min_, max_, escape='\\', escaped=ILLEGAL,
220 illegal=ILLEGAL, range=None,
221 parser_factory=make_str_parser):
222 from lepl.matchers.core import Any, Never
223 super(StrAlphabet, self).__init__(min_, max_)
224 self.__escape = escape
225 self.__escaped = coerce_(escaped, Any)
226 self.__illegal = illegal
227 self.__range = range if range else Never()
228 self._parser = parser_factory(self)
229
230 @property
233
234 @property
237
238 @property
241
242 @property
245
247 '''
248 Escape a character if necessary.
249 '''
250 if self.escape is not None and str(char) in self.illegal:
251 return self.escape + str(char)
252 else:
253 return str(char)
254
256 '''
257 Returns True if no parens are needed around this when formatting.
258 '''
259 return len(children) == 1 and \
260 (isinstance(children[0], _Character) or
261 len(children[0]) == 1 and isinstance(children[0][0], _Choice))
262
264 '''
265 Hide unicode chars because of some strange error that occurs with
266 Python2.6 on the command line.
267
268 This is in StrAlphabet, but for ASCII makes no difference. Having it
269 here helps LineAwareAlphabet work (the whole idea of subclassing
270 alphabets etc is not so great).
271 '''
272 def pretty(c):
273 x = self._escape_char(c)
274 if len(x) > 1 or 32 <= ord(x) <= 127:
275 return str(x)
276 elif ord(c) < 0x100:
277 return fmt('\\x{0:02x}', ord(c))
278 elif ord(c) < 0x10000:
279 return fmt('\\u{0:04x}', ord(c))
280 else:
281 return fmt('\\U{0:08x}', ord(c))
282 ranges = []
283 if len(intervals) == 1:
284 if intervals[0][0] == intervals[0][1]:
285 return self._escape_char(intervals[0][0])
286 elif intervals[0][0] == self.min and intervals[0][1] == self.max:
287 return '.'
288 # pylint: disable-msg=C0103
289 # (sorry. but i use this (a, b) convention throughout the regexp lib)
290 for (a, b) in intervals:
291 if a == b:
292 ranges.append(pretty(a))
293 else:
294 ranges.append(fmt('{0!s}-{1!s}', pretty(a), pretty(b)))
295 return fmt('[{0}]', self.join(ranges))
296
298 '''
299 Generate a string representation of a sequence.
300
301 This must fully describe the data in the children (it is used to
302 hash the data).
303 '''
304 return self.join(str(c) for c in children)
305
307 '''
308 Generate a string representation of a repetition.
309
310 This must fully describe the data in the children (it is used to
311 hash the data).
312 '''
313 string = self.fmt_sequence(children)
314 if self._no_parens(children):
315 return string + '*'
316 else:
317 return fmt('(?:{0})*', string)
318
320 '''
321 Generate a string representation of a choice.
322
323 This must fully describe the data in the children (it is used to
324 hash the data).
325 '''
326 return fmt('(?:{0})', '|'.join(str(child) for child in children))
327
329 '''
330 Generate a string representation of an option.
331
332 This must fully describe the data in the children (it is used to
333 hash the data).
334 '''
335 string = self.fmt_sequence(children)
336 if self._no_parens(children):
337 return string + '?'
338 else:
339 return fmt('(?:{0})?', string)
340
349
355
361
367
| Home | Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Sat Jun 9 21:51:00 2012 | http://epydoc.sourceforge.net |