Package lepl :: Package matchers :: Module derived
[hide private]
[frames] | no frames]

Source Code for Module lepl.matchers.derived

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Matchers that are defined in terms of other matchers (ie the majority). 
 32  ''' 
 33   
 34  from string import whitespace, digits, ascii_letters, \ 
 35      ascii_uppercase, ascii_lowercase, printable, punctuation 
 36  from operator import __add__ 
 37   
 38  from lepl.matchers.combine import And, DepthFirst, BreadthFirst, \ 
 39      OrderByResultCount, Or, Limit 
 40  from lepl.matchers.core import Lookahead, Any, Eof, Literal, Regexp 
 41  from lepl.matchers.operators import BREADTH_FIRST, DEPTH_FIRST, GREEDY, \ 
 42      NON_GREEDY 
 43  from lepl.matchers.support import coerce_ 
 44  from lepl.matchers.transform import TransformationWrapper, Transform, \ 
 45      ApplyArgs, ApplyRaw 
 46  from lepl.regexp.matchers import NfaRegexp, DfaRegexp 
 47  from lepl.support.lib import assert_type, lmap, fmt, basestring, reduce 
 48  from lepl.support.warn import warn_on_use 
49 50 51 # pylint: disable-msg=C0103 52 # (consistent interface) 53 # pylint: disable-msg=W0142 54 # (*args) 55 # pylint: disable-msg=W0105 56 # (string comments) 57 # pylint: disable-msg=W0141 58 # (map) 59 60 -def Repeat(matcher, start=0, stop=None, limit=None, algorithm=DEPTH_FIRST, 61 separator=None, add_=False, reduce=None):
62 ''' 63 This is called by the [] operator. It repeats the given matcher between 64 `start` and `stop` number of times (inclusive). 65 66 If `limit` is given it is an upper limit on the number of different 67 results returned on backtracking. 68 69 `algorithm` selects the repeat algorithm to use. 70 71 If `separator` is given then each repetition is separated by that matcher. 72 73 If `add_` is true then the results are joined with `Add` (once all 74 results are obtained). 75 76 If `reduce` is given it should be a pair (zero, join) where 77 `join(results, next)` is used to accumulate results and `zero` is the 78 initial value of `results`. This is implemented via `Reduce`. 79 80 `reduce` and `add_` cannot be given together. 81 ''' 82 first = coerce_(matcher) 83 if separator is None: 84 rest = first 85 else: 86 rest = And(coerce_(separator, Regexp), first) 87 if start is None: 88 start = 0 89 # allow duck typing (mutable values - IntVar etc) 90 # assert_type('The start index for Repeat or [...]', start, int) 91 # assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True) 92 # assert_type('The limit value (step index) for Repeat or [...]', limit, int, none_ok=True) 93 # assert_type('The algorithm (step index) for Repeat or [...]', algorithm, str) 94 # if start < 0: 95 # raise ValueError('Repeat or [...] cannot have a negative start.') 96 # if stop is not None and stop < start: 97 # raise ValueError('Repeat or [...] must have a stop ' 98 # 'value greater than or equal to the start.') 99 # if 'dbgn'.find(algorithm) == -1: 100 # raise ValueError('Repeat or [...] must have a step (algorithm) ' 101 # 'of d, b, g or n.') 102 if add_ and reduce: 103 raise ValueError('Repeat cannot apply both add_ and reduce') 104 elif add_: 105 process = Add 106 elif reduce: 107 process = lambda r: Reduce(r, reduce[0], reduce[1]) 108 else: 109 process = Identity 110 matcher = {DEPTH_FIRST: 111 process(DepthFirst(first=first, start=start, stop=stop, rest=rest)), 112 BREADTH_FIRST: 113 process(BreadthFirst(first=first, start=start, stop=stop, rest=rest)), 114 GREEDY: 115 process(OrderByResultCount( 116 BreadthFirst(first=first, start=start, stop=stop, rest=rest))), 117 NON_GREEDY: 118 process(OrderByResultCount( 119 BreadthFirst(first=first, start=start, stop=stop, rest=rest), 120 False)) 121 }[algorithm] 122 if limit is not None: 123 matcher = Limit(matcher, count=limit) 124 return matcher
125
126 127 -def Apply(matcher, function, raw=False, args=False):
128 ''' 129 Apply an arbitrary function to the results of the matcher 130 (**>**, **>=**). 131 132 Apply can be used via the standard operators by placing ``>`` 133 (or ``>=`` to set ``raw=True``, or ``*`` to set ``args=True``) 134 to the right of a matcher. 135 136 If the function is a `TransformationWrapper` it is used directly. 137 Otherwise a `TransformationWrapper` is constructed via the `raw` and 138 `args` parameters, as described below. 139 140 **Note:** The creation of named pairs (when a string argument is 141 used) behaves more like a mapping than a single function invocation. 142 If several values are present, several pairs will be created. 143 144 **Note:** There is an asymmetry in the default values of *raw* 145 and *args*. If the identity function is used with the default settings 146 then a list of results is passed as a single argument (``args=False``). 147 That is then returned (by the identity) as a list, which is wrapped 148 in an additional list (``raw=False``), giving an extra level of 149 grouping. This is necessary because Python's ``list()`` is an 150 identity for lists, but we want it to add an extra level of grouping 151 so that nested S-expressions are easy to generate. 152 153 See also `Map`. 154 155 :Parameters: 156 157 matcher 158 The matcher whose results will be modified. 159 160 function 161 The modification to apply. 162 163 If a `Transformation`, this is used directly. 164 165 If a string is given, named pairs will be created (and raw and args 166 ignored). 167 168 Otherwise the function should expect a list of results (unless 169 ``args=True`` in which case the list is supplied as ``*args``) 170 and can return any value (unless ``raw=True`` in which case it should 171 return a list). 172 173 raw 174 If True the results are used directly. Otherwise they are wrapped in 175 a list. The default is False --- a list is added. This is set to 176 true if the target function is an `ApplyRaw` instance. 177 178 args 179 If True, the results are passed to the function as separate 180 arguments (Python's '*args' behaviour). The default is False --- 181 the results are passed inside a list). This is set to true if the 182 target function is an `ApplyArgs` instance. 183 ''' 184 raw = raw or (type(function) is type and issubclass(function, ApplyRaw)) 185 args = args or (type(function) is type 186 and issubclass(function, ApplyArgs)) 187 if isinstance(function, TransformationWrapper): 188 apply = function 189 else: 190 if isinstance(function, basestring): 191 function = lambda results, f=function: \ 192 lmap(lambda x: (f, x), results) 193 raw = True 194 args = False 195 if args: 196 if raw: 197 function = lambda results, f=function: f(*results) 198 else: 199 function = lambda results, f=function: [f(*results)] 200 else: 201 if not raw: 202 function = lambda results, f=function: [f(results)] 203 def apply(stream_in, matcher): 204 (results, stream_out) = matcher() 205 return (function(results), stream_out)
206 return Transform(matcher, apply) 207
208 209 -def args(function):
210 ''' 211 A decorator that has the same effect as ApplyArgs for functions/methods. 212 ''' 213 def wrapper(args_): 214 ''' 215 Apply args as *args. 216 ''' 217 return function(*args_)
218 return wrapper 219
220 221 -def KApply(matcher, function, raw=False):
222 ''' 223 Apply an arbitrary function to named arguments (**\****). 224 The function should typically expect and return a list. 225 It can be used indirectly by placing ``**=`` to the right of the matcher. 226 227 The function will be applied with the following keyword arguments: 228 229 stream_out 230 The stream returned from the matcher. 231 232 results 233 A list of the results returned. 234 235 :Parameters: 236 237 matcher 238 The matcher whose results will be modified. 239 240 function 241 The modification to apply. 242 243 raw 244 If false (the default), the final return value from the function 245 will be placed in a list and returned in a pair together with the 246 new stream returned from the matcher (ie the function returns a 247 single new result). 248 249 If true, the final return value from the function is used directly 250 and so should match the ``([results], stream)`` type expected by 251 other matchers. 252 ''' 253 def fun(stream_in, matcher): 254 ''' 255 Apply args as **kargs. 256 ''' 257 (results, stream_out) = matcher() 258 kargs = {'results': results, 259 'stream_in': stream_in, 260 'stream_out': stream_out} 261 if raw: 262 return function(**kargs) 263 else: 264 return ([function(**kargs)], stream_out)
265 return Transform(matcher, fun) 266
267 268 -def AnyBut(exclude=None):
269 ''' 270 Match any character except those specified (or, if a matcher is used as 271 the exclude, if the matcher fails). 272 273 The argument should be a list of tokens (or a string of suitable 274 characters) to exclude, or a matcher. If omitted all tokens are accepted. 275 ''' 276 return And(~Lookahead(coerce_(exclude, Any)), Any())
277
278 279 -def Optional(matcher):
280 ''' 281 Match zero or one instances of a matcher (**[0:1]**). 282 ''' 283 return Repeat(coerce_(matcher), stop=1)
284 #return Or(coerce_(matcher), Empty())
285 286 287 -def Star(matcher):
288 ''' 289 Match zero or more instances of a matcher (**[0:]**) 290 ''' 291 return Repeat(coerce_(matcher))
292 293 294 ZeroOrMore = Star 295 ''' 296 Match zero or more instances of a matcher (**[0:]**) 297 '''
298 299 300 -def Plus(matcher):
301 ''' 302 Match one or more instances of a matcher (**[1:]**) 303 ''' 304 return Repeat(coerce_(matcher), start=1)
305 306 307 OneOrMore = Plus 308 ''' 309 Match one or more instances of a matcher (**[1:]**) 310 '''
311 312 313 -def Map(matcher, function):
314 ''' 315 Apply an arbitrary function to each of the tokens in the result of the 316 matcher (**>>**). If the function is a name, named pairs are created 317 instead. It can be used indirectly by placing ``>>`` to the right of the 318 matcher. 319 320 See also `Apply`. 321 ''' 322 # list() necessary so we can use '+' on result 323 if isinstance(function, basestring): 324 return Apply(matcher, lambda l: list(map(lambda x: (function, x), l)), 325 raw=True) 326 else: 327 return Apply(matcher, lambda l: list(map(function, l)), raw=True)
328
329 330 -def Reduce(matcher, zero, join=__add__):
331 ''' 332 Combine the results from the matcher using `reduce(join, results, zero)`. 333 Unlike `Add` this will return a value (`zero`) when there are no matches. 334 ''' 335 def reduce_(_stream, matcher): 336 (results, stream_out) = matcher() 337 return ([reduce(join, results, zero)], stream_out)
338 return Apply(matcher, TransformationWrapper(reduce_)) 339
340 341 -def add(_stream, matcher):
342 ''' 343 The transformation used in `Add` - we carefully use "+" in as generic 344 a manner as possible. 345 ''' 346 (results, stream_out) = matcher() 347 if results: 348 result = results[0] 349 for extra in results[1:]: 350 try: 351 result = result + extra 352 except TypeError: 353 raise TypeError( 354 fmt('An attempt was made to add two results ' 355 'that do not have consistent types: {0!r} + {1!r}', 356 result, extra)) 357 result = [result] 358 else: 359 result = [] 360 return (result, stream_out)
361
362 363 -def Add(matcher):
364 ''' 365 Join tokens in the result using the "+" operator (**+**). 366 This joins strings and merges lists. 367 Unlike `Reduce` this will have no effect of there are no matches. 368 ''' 369 return Apply(matcher, TransformationWrapper(add))
370
371 372 -def Join(*matchers):
373 ''' 374 Combine many matchers together with Add(And(...)). 375 It can be used indirectly by placing ``+`` between matchers. 376 ''' 377 return Add(And(*matchers))
378
379 380 -def Drop(matcher):
381 ''' 382 Do the match, but return nothing (**~**). The ~ prefix is equivalent. 383 ''' 384 return Apply(matcher, lambda l: [], raw=True)
385
386 387 -def Substitute(matcher, value):
388 ''' 389 Replace each return value with that given. 390 ''' 391 return Map(matcher, lambda x: value)
392
393 394 -def Name(matcher, name):
395 ''' 396 Name the result of matching (**> name**) 397 398 This replaces each value in the match with a tuple whose first value is 399 the given name and whose second value is the matched token. The Node 400 class recognises this form and associates such values with named attributes. 401 ''' 402 return Map(matcher, name)
403 404 405 Eos = Eof 406 '''Match the end of a stream. Returns nothing.'''
407 408 409 -def Identity(matcher):
410 '''Functions identically to the matcher given as an argument.''' 411 return coerce_(matcher)
412
413 414 -def Newline():
415 '''Match newline (Unix) or carriage return newline (Windows)''' 416 return Or(Literal('\n'), Literal('\r\n'))
417
418 419 -def Space(space=' \t'):
420 '''Match a single space (by default space or tab).''' 421 return Any(space)
422
423 424 -def Whitespace(space=whitespace):
425 ''' 426 Match a single space (by default from string.whitespace, 427 which includes newlines). 428 ''' 429 return Any(space)
430
431 432 -def Digit():
433 '''Match any single digit.''' 434 return Any(digits)
435
436 437 -def Letter():
438 '''Match any ASCII letter (A-Z, a-z).''' 439 return Any(ascii_letters)
440
441 442 -def Upper():
443 '''Match any ASCII uppercase letter (A-Z).''' 444 return Any(ascii_uppercase)
445
446 447 -def Lower():
448 '''Match any ASCII lowercase letter (A-Z).''' 449 return Any(ascii_lowercase)
450
451 452 -def Printable():
453 '''Match any printable character (string.printable).''' 454 return Any(printable)
455
456 457 -def Punctuation():
458 '''Match any punctuation character (string.punctuation).''' 459 return Any(punctuation)
460
461 462 -def UnsignedInteger():
463 '''Match a simple sequence of digits.''' 464 return Repeat(Digit(), start=1, add_=True)
465
466 467 -def SignedInteger():
468 '''Match a sequence of digits with an optional initial sign.''' 469 return Add(And(Optional(Any('+-')), UnsignedInteger()))
470 471 472 Integer = SignedInteger 473 ''' 474 The default integer is a signed one. 475 '''
476 477 478 -def UnsignedReal(decimal='.'):
479 ''' 480 Match a sequence of digits that may include a decimal point. This 481 will match both integer and float values. 482 ''' 483 return Or(Join(Optional(UnsignedInteger()), 484 Any(decimal), UnsignedInteger()), 485 Join(UnsignedInteger(), Optional(Any(decimal))))
486
487 488 -def SignedReal(decimal='.'):
489 ''' 490 Match a signed sequence of digits that may include a decimal point. 491 This will match both integer and float values. 492 ''' 493 return Join(Optional(Any('+-')), UnsignedReal(decimal))
494
495 496 -def UnsignedEReal(decimal='.', exponent='eE'):
497 ''' 498 Match an `UnsignedReal` followed by an optional exponent 499 (e+02 etc). This will match both integer and float values. 500 ''' 501 return Join(UnsignedReal(decimal), 502 Optional(And(Any(exponent), SignedInteger())))
503
504 505 -def SignedEReal(decimal='.', exponent='eE'):
506 ''' 507 Match a `SignedReal` followed by an optional exponent 508 (e+02 etc). This will match both integer and float values. 509 ''' 510 if decimal == '.' and exponent == 'eE': 511 # hack to faster direct implementation for now 512 return NfaRegexp(r'[\+\-]?(?:[0-9]*\.[0-9]+|[0-9]+\.|[0-9]+)(?:[eE][\+\-]?[0-9]+)?') 513 else: 514 return Join(SignedReal(decimal), 515 Optional(Join(Any(exponent), SignedInteger())))
516 517 518 Real = SignedEReal 519 ''' 520 The default float is signed with exponents. 521 ''' 522 523 _FLOAT_WARN = '''WARNING: The definition of the Float matchers changed in Lepl 4.4.0 524 (you may want to use Real instead).'''
525 526 @warn_on_use(_FLOAT_WARN) 527 -def UnsignedFloat(decimal='.'):
528 ''' 529 Match a sequence of digits that must include a decimal point. This 530 will match real values that are not integers. 531 ''' 532 return Or(Join(Optional(UnsignedInteger()), 533 Any(decimal), UnsignedInteger()), 534 Join(UnsignedInteger(), Any(decimal)))
535
536 537 @warn_on_use(_FLOAT_WARN) 538 -def SignedFloat(decimal='.'):
539 ''' 540 Match a signed sequence of digits that must include a decimal point. This 541 will match real values that are not integers. 542 ''' 543 return Join(Optional(Any('+-')), UnsignedFloat(decimal))
544
545 546 @warn_on_use(_FLOAT_WARN) 547 -def UnsignedEFloat(decimal='.', exponent='eE'):
548 ''' 549 As `UnsignedEReal`, but must contain a decimal or exponent. This 550 will match real values that are not integers. 551 ''' 552 return Or(Join(UnsignedReal(decimal), Any(exponent), SignedInteger()), 553 UnsignedFloat(decimal))
554
555 556 @warn_on_use(_FLOAT_WARN) 557 -def SignedEFloat(decimal='.', exponent='eE'):
558 ''' 559 As `SignedEReal`, but must contain a decimal or exponent. This 560 will match real values that are not integers. 561 ''' 562 if decimal == '.' and exponent == 'eE': 563 # hack to faster direct implementation for now 564 return NfaRegexp(r'[\+\-]?(?:[0-9]*\.[0-9]+(?:[eE][\+\-]?[0-9]+)?|[0-9]+\.(?:[eE][\+\-]?[0-9]+)?|[0-9]+[eE][\+\-]?[0-9]+)') 565 else: 566 return Or(Join(SignedReal(decimal), Any(exponent), SignedInteger()), 567 SignedFloat(decimal))
568 569 570 Float = SignedEFloat 571 ''' 572 The default float matcher accepts signed values with optional exponents. 573 This will not match integers - see `Real` for that. 574 '''
575 576 577 -def Word(chars=NfaRegexp('[^%s]' % whitespace), body=None):
578 ''' 579 Match a sequence of non-space characters, joining them together. 580 581 chars and body, if given as strings, define possible characters to use 582 for the first and rest of the characters in the word, respectively. 583 If body is not given, then chars is used for the entire word. 584 They can also specify matchers, which typically should match only a 585 single character. 586 587 So ``Word(Upper(), Lower())`` would match names that being with an upper 588 case letter, for example, while ``Word(AnyBut(Space()))`` (the default) 589 matches any sequence of non-space characters. 590 ''' 591 chars = coerce_(chars, Any) 592 body = chars if body is None else coerce_(body, Any) 593 return Add(And(chars, Star(body)))
594
595 596 -def DropEmpty(matcher):
597 ''' 598 Drop results if they are empty (ie if they are ``False`` in Python). 599 600 This will drop empty strings and lists. It will also drop 601 `Node` instances if they are empty (since the length is then 602 zero). 603 ''' 604 def drop(results): 605 ''' 606 Only drop "False" values. 607 ''' 608 return [result for result in results if result]
609 return Apply(matcher, drop, raw=True) 610
611 612 -def Literals(*matchers):
613 ''' 614 A series of literals, joined with `Or`. 615 ''' 616 # I considered implementing this by extending Literal() itself, but 617 # that would have meant putting "Or-like" functionality in Literal, 618 # and I felt it better to keep the base matchers reasonably orthogonal. 619 return Or(*lmap(Literal, matchers))
620
621 622 -def String(quote='"', escape='\\', empty='', join=__add__):
623 ''' 624 Match a string with quotes that can be escaped. This will match across 625 newlines (see `SingleLineString` for an alternative). 626 627 More generally, a string is a grouping of results. Setting `empty` and 628 `join` correctly will allow this matcher to work with a variety of types. 629 ''' 630 q = Literal(quote) 631 content = AnyBut(q) 632 if escape: 633 content = Or(And(Drop(escape), q), content) 634 content = Repeat(content, reduce=(empty, join)) 635 return And(Drop(q), content, Drop(q))
636
637 638 -def SingleLineString(quote='"', escape='\\', exclude='\n', empty='', join=__add__):
639 ''' 640 Like `String`, but will not match across multiple lines. 641 ''' 642 q = Literal(quote) 643 content = AnyBut(Or(q, Any(exclude))) 644 if escape: 645 content = Or(content, And(Drop(escape), q)) 646 content = Repeat(content, reduce=(empty, join)) 647 return And(Drop(q), content, Drop(q))
648
649 650 -def SkipString(quote='"', escape='\\', ignore='\n', empty='', join=__add__):
651 ''' 652 Like `String`, matching across multiple lines, but will silently 653 drop newlines. 654 ''' 655 q = Literal(quote) 656 content = AnyBut(Or(q, Any(ignore))) 657 if escape: 658 content = Or(content, And(Drop(escape), q)) 659 content = Or(content, Drop(Any(ignore))) 660 content = Repeat(content, reduce=(empty, join)) 661 return And(Drop(q), content, Drop(q))
662
663 664 -def SkipTo(matcher, include=True):
665 ''' 666 Consume everything up to (and including, if include is True, as it is by 667 default) the matcher. Returns all the skipped data, joined. 668 ''' 669 if include: 670 return Add(And(Star(AnyBut(matcher)), matcher)) 671 else: 672 return And(Add(Star(AnyBut(matcher))), Lookahead(matcher))
673