Package lepl :: Package regexp :: Module rewriters
[hide private]
[frames] | [no frames]

Source Code for Module lepl.regexp.rewriters

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Rewrite the tree of matchers from the bottom up (as far as possible) 
 32  using regular expressions.  This is complicated by a number of things. 
 33   
 34  First, intermediate parts of regular expressions are not matchers, so we need  
 35  to keep them inside a special container type that we can detect and convert to 
 36  a regular expression when needed (since at some point we cannot continue with 
 37  regular expressions). 
 38   
 39  Second, we sometimes do not know if our regular expression can be used until we  
 40  have moved higher up the matcher tree.  For example, And() might convert all 
 41  sub-expressions to a sequence if its parent is an Apply(add).  So we may 
 42  need to store several alternatives, along with a way of selecting the correct 
 43  alternative. 
 44   
 45  So cloning a node may give either a matcher or a container.  The container 
 46  will provide both a matcher and an intermediate regular expression.  The logic 
 47  for handling odd dependencies is more difficult to implement in a general 
 48  way, because it is not clear what all cases may be.  For now, therefore, 
 49  we use a simple state machine approach using a tag (which is almost always 
 50  None).   
 51  ''' 
 52   
 53  from logging import getLogger 
 54  from operator import __add__ 
 55   
 56  from lepl.matchers.core import Regexp 
 57  from lepl.matchers.matcher import Matcher, matcher_map 
 58  from lepl.matchers.support import FunctionWrapper, SequenceWrapper, \ 
 59      TrampolineWrapper, TransformableTrampolineWrapper 
 60  from lepl.regexp.core import Choice, Sequence, Repeat, Empty, Option 
 61  from lepl.regexp.matchers import NfaRegexp, DfaRegexp 
 62  from lepl.regexp.interval import Character 
 63  from lepl.regexp.unicode import UnicodeAlphabet 
 64  from lepl.core.rewriters import clone, Rewriter, clone_matcher 
 65  from lepl.support.lib import fmt, str, basestring 
 66  from lepl.matchers.combine import DepthNoTrampoline, AndNoTrampoline 
 67  from lepl.matchers.error import Error 
68 69 70 -class RegexpContainer(object):
71 ''' 72 The container referred to above, which carries a regular expression and 73 an alternative matcher "up the tree" during rewriting. 74 ''' 75 76 log = getLogger('lepl.regexp.rewriters.RegexpContainer') 77
78 - def __init__(self, matcher, regexp, use, add_reqd=False):
79 self.matcher = matcher # current best matcher (regexp or not) 80 self.regexp = regexp # the current regexp 81 self.use = use # is the regexp a win? 82 self.add_reqd = add_reqd # we need "add" to combine values (from And)?
83
84 - def __str__(self):
85 return ','.join([str(self.matcher.__class__), str(self.regexp), 86 str(self.use), str(self.add_reqd)])
87 88 @classmethod
89 - def to_regexps(cls, use, possibles, have_add=False):
90 ''' 91 Convert to regular expressions. 92 93 `have_add` indicaes whether the caller can supply an "add". 94 None - caller doesn't care what lower code needed. 95 True - caller has add, and caller should need that. 96 False - caller doesn't have add, and caller should not need it. 97 ''' 98 regexps = [] 99 for possible in possibles: 100 if isinstance(possible, RegexpContainer): 101 cls.log.debug(fmt('unpacking: {0!s}', possible)) 102 if have_add is None or possible.add_reqd == have_add: 103 regexps.append(possible.regexp) 104 # this flag indicates that it's "worth" using the regexp 105 # so we "inherit" 106 use = use or possible.use 107 else: 108 raise Unsuitable('Add inconsistent.') 109 else: 110 cls.log.debug(fmt('cannot unpack: {0!s}', 111 possible.__class__)) 112 raise Unsuitable('Not a container.') 113 return (use, regexps)
114 115 @staticmethod
116 - def to_matcher(possible):
117 ''' 118 Convert to a matcher. 119 ''' 120 if isinstance(possible, RegexpContainer): 121 return possible.matcher 122 else: 123 return possible
124 125 @classmethod
126 - def build(cls, node, regexp, alphabet, regexp_type, use, 127 add_reqd=False, wrapper=None):
128 ''' 129 Construct a container or matcher. 130 ''' 131 if use and not add_reqd: 132 matcher = single(alphabet, node, regexp, regexp_type, wrapper) 133 # if matcher is a Transformable with a Transformation other than 134 # the standard empty_adapter then we must stop 135 if len(matcher.wrapper.functions) > 1: 136 cls.log.debug(fmt('Force matcher: {0}', matcher.wrapper)) 137 return matcher 138 else: 139 # not good enough to have a regexp as default, so either force 140 # the original matcher if it has transforms, or keep going in the 141 # hope we can get more complex later 142 matcher = node 143 if hasattr(matcher, 'wrapper') and matcher.wrapper: 144 return matcher 145 return RegexpContainer(matcher, regexp, use, add_reqd)
146
def single(alphabet, node, regexp, regexp_type, wrapper=None):
    '''
    Create a matcher for the given regular expression.
    '''
    # avoid dependency loops
    from lepl.matchers.transform import TransformationWrapper
    result = regexp_type(regexp, alphabet)
    result = result.compose(TransformationWrapper(empty_adapter))
    if wrapper is None and hasattr(node, 'wrapper'):
        # inherit the node's own wrapper when none was supplied
        wrapper = node.wrapper
    elif wrapper and not isinstance(wrapper, TransformationWrapper):
        wrapper = TransformationWrapper(wrapper)
    if wrapper:
        # drop any empty_adapter already present so it is not applied twice
        wrapper.functions = [function for function in wrapper.functions
                             if function != empty_adapter]
        result = result.compose(wrapper)
    return result
165
def empty_adapter(_stream, matcher):
    '''
    There is a fundamental mismatch between regular expressions and the
    recursive descent parser on how empty matchers are handled.  The main
    parser uses empty lists; regexp uses an empty string.  This is a hack
    that converts from one to the other.  I do not see a better solution.
    '''
    (results, stream_out) = matcher()
    # a lone empty string means "matched nothing" in regexp terms
    return ([] if results == [''] else results, stream_out)
177
class Unsuitable(Exception):
    '''
    Exception thrown when a sub-node does not contain a suitable matcher.
    '''
184
def make_clone(alphabet_, old_clone, regexp_type, use_from_start):
    '''
    Factory that generates a clone suitable for rewriting recursive descent
    to regular expressions.
    '''
    
    # clone functions below take the "standard" clone and the node, and then
    # reproduce the normal argument list of the matcher being cloned.
    # they should return either a container or a matcher.
    
    # Avoid dependency loops
    from lepl.matchers.derived import add
    from lepl.matchers.combine import And, Or, DepthFirst
    from lepl.matchers.core import Any, Literal
    from lepl.matchers.transform import Transform
    
    log = getLogger('lepl.regexp.rewriters.make_clone')
    
    def clone_any(use, original, restrict=None):
        '''
        We can always convert Any() to a regular expression; the only question
        is whether we have an open range or not.
        '''
        if restrict is None:
            # open range: span the entire alphabet
            char = Character([(alphabet_.min, alphabet_.max)], alphabet_)
        else:
            # one (c, c) interval per allowed character
            char = Character(((char, char) for char in restrict), alphabet_)
        log.debug(fmt('Any: cloned {0}', char))
        regexp = Sequence(alphabet_, char)
        return RegexpContainer.build(original, regexp, alphabet_, 
                                     regexp_type, use)
    
    def clone_or(use, original, *matchers):
        '''
        We can convert an Or only if all the sub-matchers have possible
        regular expressions.
        '''
        # raises Unsuitable (caught in clone_) if any sub-matcher failed
        (use, regexps) = \
            RegexpContainer.to_regexps(use, matchers, have_add=False)
        regexp = Choice(alphabet_, *regexps)
        log.debug(fmt('Or: cloned {0}', regexp))
        return RegexpContainer.build(original, regexp, alphabet_, 
                                     regexp_type, use)
    
    def clone_and(use, original, *matchers):
        '''
        We can convert an And only if all the sub-matchers have possible
        regular expressions, and even then we must tag the result unless
        an add transform is present.
        '''
        if hasattr(original, 'wrapper'):
            wrapper = original.wrapper.functions
        else:
            wrapper = None
        add_reqd = True
        if wrapper:
            # only an initial "add" transform is compatible; it joins the
            # sub-results into the single string a regexp would produce
            if wrapper[0] is add:
                wrapper = wrapper[1:]
                add_reqd = False
            else:
                raise Unsuitable
        try:
            # combine all
            (use, regexps) = \
                RegexpContainer.to_regexps(use, matchers, have_add=None)
            # if we have regexp sub-expressions, join them
            regexp = Sequence(alphabet_, *regexps)
            log.debug(fmt('And: cloning {0}', regexp))
            return RegexpContainer.build(original, regexp, alphabet_, 
                                         regexp_type, use, add_reqd=add_reqd,
                                         wrapper=wrapper)
        except Unsuitable:
            # combine contiguous matchers where possible
            if add_reqd:
                raise
            def unpack(matcher):
                '''
                Return (matcher, regexp-or-None) for a single sub-matcher.
                '''
                original = RegexpContainer.to_matcher(matcher)
                try:
                    return (original, 
                            RegexpContainer.to_regexps(use, [matcher], 
                                                       have_add=None)[1][0])
                except Unsuitable:
                    return (original, None)
            output = []
            (regexps, originals) = ([], [])
            for (matcher, regexp) in [unpack(matcher) for matcher in matchers]:
                if regexp:
                    regexps.append(regexp)
                    originals.append(matcher)
                else:
                    # flush the pending run of regexp-able matchers before
                    # appending the unconvertible one
                    if len(regexps) > 1:
                        # combine regexps
                        output.append(
                            regexp_type(Sequence(alphabet_, *regexps), 
                                        alphabet_))
                    else:
                        # a single regexp is not worth converting; keep the
                        # original matcher(s)
                        output.extend(originals)
                    output.append(matcher)
                    (regexps, originals) = ([], [])
            # flush any trailing run
            if len(regexps) > 1:
                output.append(
                    regexp_type(Sequence(alphabet_, *regexps), alphabet_))
            else:
                output.extend(originals)
            merged = And(*output)
            # original.wrapper exists here: add_reqd is False only when a
            # wrapper starting with "add" was found above
            return merged.compose(original.wrapper)
    
    def clone_transform(use, original, matcher, wrapper):
        '''
        We can assume that wrapper is a transformation.  Add joins into
        a sequence.
        '''
        if original.wrapper:
            if original.wrapper.functions[0] is add:
                have_add = True
                wrapper = original.wrapper.functions[1:]
            else:
                have_add = False
                wrapper = original.wrapper.functions
        else:
            # punt to next level
            return matcher
        (use, [regexp]) = \
            RegexpContainer.to_regexps(use, [matcher], have_add=have_add)
        log.debug(fmt('Transform: cloning {0}', regexp))
        return RegexpContainer.build(original, regexp, alphabet_, 
                                     regexp_type, use, 
                                     add_reqd=False, wrapper=wrapper)
    
    def clone_literal(use, original, text):
        '''
        Literal values are easy to transform.
        '''
        chars = [Character([(c, c)], alphabet_) for c in text]
        regexp = Sequence(alphabet_, *chars)
        log.debug(fmt('Literal: cloned {0}', regexp))
        return RegexpContainer.build(original, regexp, alphabet_, 
                                     regexp_type, use)
    
    def clone_regexp(use, original, pattern, alphabet=None):
        '''
        Regexps values are also easy.
        '''
        try:
            if isinstance(pattern, basestring):
                # re-parse the text into this rewriter's own alphabet
                pattern = Sequence(alphabet_, *alphabet_.parse(pattern))
        except TypeError:
            raise Unsuitable
        except Error: # cannot parse regexp
            raise Unsuitable
        return RegexpContainer.build(original, pattern, alphabet_, 
                                     regexp_type, use)
    
    def clone_dfs(use, original, first, start, stop, rest=None, reduce=None,
                  generator_manager_queue_len=None):
        '''
        This forces use=True as it is likely that a regexp is a gain.
        '''
        if stop is not None and start > stop:
            raise Unsuitable
        # only the default reduce ([], __add__) matches regexp semantics
        if reduce and not (isinstance(reduce, tuple) 
                           and len(reduce) == 2
                           and reduce[0] == []
                           and reduce[1] == __add__):
            raise Unsuitable
        if generator_manager_queue_len:
            # this should only be set when running
            raise Unsuitable
        add_reqd = stop is None or stop > 1
        wrapper = False
        if hasattr(original, 'wrapper') and original.wrapper:
            if original.wrapper.functions[0] is add:
                add_reqd = False
                wrapper = original.wrapper.functions[1:]
            else:
                raise Unsuitable
        rest = first if rest is None else rest
        (use, [first, rest]) = \
            RegexpContainer.to_regexps(True, [first, rest], have_add=None)
        seq = []
        if first != rest:
            seq.append(first.clone())
        # pad with "rest" until the minimum repeat count is reached
        while len(seq) < start:
            seq.append(rest.clone())
        addzero = len(seq) > start # first was exceptional and start=0
        if stop:
            if stop > start:
                # use nested form to avoid inefficient nfa
                extras = Option(alphabet_, rest.clone())
                for _i in range(stop - start - 1):
                    extras = Option(alphabet_, 
                                    Sequence(alphabet_, rest.clone(), extras))
                seq.append(extras)
        else:
            # unbounded: Kleene-style repeat
            seq.append(Repeat(alphabet_, rest.clone()))
        regexp = Sequence(alphabet_, *seq)
        if addzero:
            # allow the empty match that start=0 would permit
            regexp = Choice(alphabet_, regexp, Empty(alphabet_))
        log.debug(fmt('DFS: cloned {0}', regexp))
        return RegexpContainer.build(original, regexp, alphabet_, 
                                     regexp_type, use, add_reqd=add_reqd,
                                     wrapper=wrapper)
    
    def clone_wrapper(use, original, *args, **kargs):
        '''
        Dispatch a factory-built matcher to the clone function registered
        for its factory, if any; otherwise keep the original.
        '''
        factory = original.factory
        if factory in map_:
            log.debug(fmt('Found {0}', factory))
            return map_[factory](use, original, *args, **kargs)
        else:
            log.debug(fmt('No clone for {0}, {1}', factory, map_.keys()))
            return original
    
    # dispatch table from matcher type (or factory) to clone function
    map_ = matcher_map({Any: clone_any, 
                        Or: clone_or, 
                        And: clone_and,
                        AndNoTrampoline: clone_and,
                        Transform: clone_transform,
                        Literal: clone_literal,
                        Regexp: clone_regexp,
                        NfaRegexp: clone_regexp,
                        DfaRegexp: clone_regexp,
                        DepthFirst: clone_dfs,
                        DepthNoTrampoline: clone_dfs,
                        FunctionWrapper: clone_wrapper,
                        SequenceWrapper: clone_wrapper,
                        TrampolineWrapper: clone_wrapper,
                        TransformableTrampolineWrapper: clone_wrapper})
    
    def clone_(i, j, node, args, kargs):
        '''
        Do the cloning, dispatching by type to the methods above.
        '''
        # unwrap containers so the standard clone sees plain matchers
        original_args = [RegexpContainer.to_matcher(arg) for arg in args]
        original_kargs = dict((name, RegexpContainer.to_matcher(kargs[name]))
                              for name in kargs)
        original = old_clone(i, j, node, original_args, original_kargs)
        type_ = type(node)
        if type_ in map_:
            # pylint: disable-msg=W0142
            try:
                return map_[type_](use_from_start, original, *args, **kargs)
            except Unsuitable:
                # fall back to the plain clone when conversion fails
                pass
        return original
    
    return clone_
class CompileRegexp(Rewriter):
    '''
    A rewriter that uses the given alphabet and matcher to compile simple
    matchers.
    
    The "use" parameter controls when regular expressions are substituted.
    If true, they are always used.  If false, they are used only if they
    are part of a tree that includes repetition.  The latter case generally
    gives more efficient parsers because it avoids converting already
    efficient literal matchers to regular expressions.
    '''
    
    def __init__(self, alphabet=None, use=True, matcher=NfaRegexp):
        alphabet = UnicodeAlphabet.instance() if alphabet is None else alphabet
        description = fmt('CompileRegexp({0}, {1}, {2})', 
                          alphabet, use, matcher)
        super(CompileRegexp, self).__init__(Rewriter.COMPILE_REGEXP, 
                                            description)
        self.alphabet = alphabet
        self.use = use
        self.matcher = matcher
        
    def __call__(self, graph):
        # rewrite bottom-up, then unwrap any container left at the root
        new_clone = make_clone(self.alphabet, clone, self.matcher, self.use)
        rewritten = clone_matcher(graph, new_clone)
        return RegexpContainer.to_matcher(rewritten)
460