1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 Rewriters modify the graph of matchers before it is used to generate a
32 parser.
33 '''
34
35 from lepl.matchers.memo import LMemo, RMemo
36 from lepl.support.graph import Visitor, preorder, loops, order, NONTREE, \
37 dfs_edges, LEAF
38 from lepl.matchers.combine import DepthFirst, DepthNoTrampoline, \
39 BreadthFirst, BreadthNoTrampoline, And, AndNoTrampoline, \
40 Or, OrNoTrampoline
41 from lepl.matchers.core import Delayed, Lookahead
42 from lepl.matchers.derived import add
43 from lepl.matchers.matcher import Matcher, is_child, FactoryMatcher, \
44 matcher_type, MatcherTypeException, canonical_matcher_type
45 from lepl.matchers.support import NoTrampoline, Transformable
46 from lepl.support.lib import lmap, fmt, LogMixin, empty, count
50 '''
51 base class for rewriters, supporting a fixed ordering.
52 '''
53
54
55 (SET_ARGUMENTS,
56 COMPOSE_TRANSFORMS,
57 FLATTEN,
58 COMPILE_REGEXP,
59 OPTIMIZE_OR,
60 LEXER,
61 DIRECT_EVALUATION,
62
63
64
65
66
67 MEMOIZE,
68 TRACE_VARIABLES,
69 FULL_FIRST_MATCH) = range(10, 110, 10)
70
71 - def __init__(self, order_, name=None, exclusive=True):
76
78 if not isinstance(other, self.__class__):
79 return False
80 return self.exclusive or self is other
81
83 return not self.__eq__(other)
84
86 if self.exclusive:
87 return hash(self.__class__)
88 else:
89 return super(Rewriter, self).__hash__()
90
92 if not isinstance(other, Rewriter):
93 return True
94 elif self.exclusive or self.order != other.order:
95 return self.order < other.order
96 else:
97 return hash(self) < hash(other)
98
100 return not self.__lt__(other)
101
103 if not isinstance(other, Rewriter):
104 return True
105 elif self.exclusive or self.order != other.order:
106 return self.order > other.order
107 else:
108 return hash(self) > hash(other)
109
111 return not self.__gt__(other)
112
115
118
119
120 -def clone(i, j, node, args, kargs):
128
140
143 '''
144 Return `[(head, reversed), ...]` where each tuple describes a tree of
145 matchers without loops. The first head is the root node. The reversed
146 list contains nodes ordered children-first (except for `Delayed()`
147 instances, whose children are other `head` elements).
148
149 This allows us to clone a DAG of matchers in the same way as it was first
150 created - by creating linear trees and then connecting the `Delayed()`
151 instances.
152
153 The postorder ordering is used to match the ordering in the more general
154 iteration over matchers based on the graph support classes and helps
155 keep things consistent (there was a strange issue where the `.tree()`
156 display of a cloned graph differed from the original that, I think, was
157 due to a different ordering).
158 '''
159 linear = []
160 pending = [node]
161 heads = set()
162 while pending:
163 node = pending.pop()
164 if node not in heads:
165 stack = []
166 def append(child):
167 if isinstance(child, Matcher):
168 if isinstance(child, Delayed):
169 child.assert_matcher()
170 pending.append(child.matcher)
171 stack.append((child, empty()))
172 else:
173 stack.append((child, iter(child)))
174 heads.add(node)
175 append(node)
176 def postorder():
177 while stack:
178 (node, children) = stack[-1]
179 try:
180 append(next(children))
181 except StopIteration:
182 yield stack.pop()[0]
183 linear.append((node, list(postorder())))
184 return linear
185
186
187 -def clone_tree(i, head, reversed, mapping, delayed, clone, duplicate=False):
188 '''
189 Clone a tree of matchers. This clones all the matchers in a linearised
190 set, except for the `Delayed()` instances, which are re-created without
191 their contents (these are set later, to connect the trees into the
192 final matcher DAG).
193
194 `i` is the index of the tree (0 for the first tree, which cannot be part
195 of a loop itself). It is passed to the clone function.
196
197 `head` is the root of the tree.
198
199 `reversed` are the tree nodes in postorder
200
201 `mapping` is a map from old to new node of all the nodes created. For
202 `Delayed()` instances, if `duplicate=True`, then the new node is just
203 one of possibly many copies.
204
205 `clone` is the function used to create a new node instance.
206
207 `duplicate` controls how `Delayed()` instances are handled. If true then
208 a new instance is created for each one. This does not preserve the
209 graph, but is used by memoisation.
210 '''
211 def rewrite(value):
212 try:
213 if value in mapping:
214 return mapping[value]
215 except TypeError:
216 pass
217 return value
218 n = len(reversed)
219 for (j, node) in zip(count(n, -1), reversed):
220 if isinstance(node, Delayed):
221 if duplicate or node not in mapping:
222 mapping[node] = clone(i, -j, node, (), {})
223 delayed.append((node, mapping[node]))
224 else:
225 if node not in mapping:
226 (args, kargs) = node._constructor_args()
227 args = lmap(rewrite, args)
228 kargs = dict((name, rewrite(value)) for (name, value) in kargs.items())
229 copy = clone(i, j, node, args, kargs)
230 mapping[node] = copy
231
234 '''
235 This used to be implemented using the graph support classes
236 (`ConstructorWalker()` etc). But the left-recursive handling was
237 unreliable and that was too opaque to debug easily. It's possible this
238 code could now be moved back to that approach, as not everything
239 here is used (the `j` index turned out not to be useful, for example).
240 But this approach is easier to understand and I am not 100% sure that
241 the code is correct, so I may need to continue working on this.
242
243 `node` is the root of the matcher graph.
244
245 `clone` is a function used to create new instances.
246
247 `duplicate` controls how `Delayed()` instances are handled. If true then
248 a new instance is created for each one. This does not preserve the
249 graph, but is used by memoisation.
250 '''
251 from lepl.regexp.rewriters import RegexpContainer
252 trees = linearise_matcher(node)
253 all_nodes = {}
254 all_delayed = []
255 for (i, (head, reversed)) in enumerate(trees):
256 clone_tree(i, head, reversed, all_nodes, all_delayed, clone,
257 duplicate=duplicate)
258 for (delayed, clone) in all_delayed:
259
260
261 clone.__iadd__(RegexpContainer.to_matcher(all_nodes[delayed.matcher]))
262 return RegexpContainer.to_matcher(all_nodes[node])
263
264
265 -def post_clone(function):
266 '''
267 Generate a clone function that applies the given function to the newly
268 constructed node, except for Delayed instances (which are effectively
269 proxies and so have no functionality of their own) (so, when used with
270 `DelayedClone`, effectively performs a map on the graph).
271 '''
272 def new_clone(i, j, node, args, kargs):
273 '''
274 Apply function as well as clone.
275 '''
276 copy = clone(i, j, node, args, kargs)
277
278
279 if not isinstance(node, Delayed):
280 copy = function(copy)
281 return copy
282 return new_clone
283
286 '''
287 A rewriter that flattens `And` and `Or` lists.
288 '''
289
292
294 def new_clone(i, j, node, old_args, kargs):
295 '''
296 The flattening cloner.
297 '''
298 new_args = []
299 type_ = matcher_type(node, fail=False)
300 if type_ in map(canonical_matcher_type, [And, Or]):
301 for arg in old_args:
302 if matcher_type(arg, fail=False) is type_ and \
303 (not hasattr(arg, 'wrapper') or
304 ((not arg.wrapper and not node.wrapper) or \
305 (arg.wrapper.functions == node.wrapper.functions
306 and node.wrapper.functions == [add]))):
307 new_args.extend(arg.matchers)
308 else:
309 new_args.append(arg)
310 if not new_args:
311 new_args = old_args
312 return clone(i, j, node, new_args, kargs)
313 return clone_matcher(graph, new_clone)
314
340
343 '''
344 A rewriter needed for TraceVariables which adds the trace_variables
345 attribute to untransformable matchers that need a transform.
346 '''
347
350
352 from lepl.matchers.transform import Transform
353 def new_clone(i, j, node, args, kargs):
354 '''
355 The joining cloner.
356 '''
357
358
359 copy = clone(i, j, node, args, kargs)
360 if hasattr(node, 'trace_variables') and node.trace_variables:
361 return Transform(copy, node.trace_variables)
362 else:
363 return copy
364 return clone_matcher(graph, new_clone)
365
368 '''
369 A rewriter that adds RMemo to all nodes in the matcher graph.
370 '''
371
374
377
380 '''
381 A rewriter that adds LMemo to all nodes in the matcher graph.
382 '''
383
387
389 def new_clone(i, j, node, args, kargs):
390 copy = clone(i, j, node, args, kargs)
391 return self.memoize(i, j, self.d, copy, LMemo)
392 return clone_matcher(graph, new_clone, duplicate=True)
393
394 @staticmethod
396 if j > 0:
397 def open(depth, length):
398 return False
399 curtail = open
400 elif d:
401 def fixed(depth, length):
402 return depth > i * d
403 curtail = fixed
404 else:
405 def slen(depth, length):
406 return depth > i * length
407 curtail = slen
408 return memo(copy, curtail)
409
412 '''
413 Apply two different memoizers, one to left recursive loops and the
414 other elsewhere (either can be omitted).
415
416 `conservative` refers to the algorithm used to detect loops:
417 `None` will use the left memoizer on all nodes except the initial tree
418 `True` will detect all possible loops (should be very similar to `None`)
419 `False` detects only left-most loops and may miss some loops.
420
421 `d` is a parameter that controls the depth to which repeated left-recursion
422 may occur. If `None` then the length of the remaining input is used.
423 If set, parsers are more efficient, but less likely to match input
424 correctly.
425 '''
426
427 - def __init__(self, conservative=None, left=None, right=None, d=0):
434
436 dangerous = set()
437 for head in order(graph, NONTREE, Matcher):
438 for loop in either_loops(head, self.conservative):
439 for node in loop:
440 dangerous.add(node)
441 def new_clone(i, j, node, args, kargs):
442 '''
443 Clone with the appropriate memoizer
444 (cannot use post_clone as need to test original)
445 '''
446 copy = clone(i, j, node, args, kargs)
447 if (self.conservative is None and i) or node in dangerous:
448 if self.left:
449 return LeftMemoize.memoize(i, j, self.d, copy, self.left)
450 else:
451 return copy
452 else:
453 if self.right:
454 return self.right(copy)
455 else:
456 return copy
457 return clone_matcher(graph, new_clone, duplicate=True)
458
461 '''
462 Return (an estimate of) all left-recursive loops from the given node.
463
464 We cannot know for certain whether a loop is left recursive because we
465 don't know exactly which parsers will consume data. But we can estimate
466 by assuming that all matchers eventually (ie via their children) consume
467 something. We can also improve that slightly by ignoring `Lookahead`.
468
469 So we estimate left-recursive loops as paths that start and end at
470 the given node, and which are first children of intermediate nodes
471 unless the node is `Or`, or the preceding matcher is a
472 `Lookahead`.
473
474 Each loop is a list that starts and ends with the given node.
475 '''
476 stack = [[node]]
477 known = set([node])
478 while stack:
479 ancestors = stack.pop()
480 parent = ancestors[-1]
481 if isinstance(parent, Matcher):
482 for child in parent:
483 family = list(ancestors) + [child]
484 if child is node:
485 yield family
486 else:
487 try:
488 if child not in known:
489 stack.append(family)
490 known.add(child)
491
492 except TypeError:
493 pass
494 if not is_child(parent, Or, fail=False) and \
495 not is_child(child, Lookahead, fail=False):
496 break
497
500 '''
501 Select between the conservative and liberal loop detection algorithms.
502 '''
503 if conservative:
504 return loops(node, Matcher)
505 else:
506 return left_loops(node)
507
510 '''
511 A rewriter that re-arranges `Or` matcher contents for left--recursive
512 loops.
513
514 When a left-recursive rule is used, it is much more efficient if it
515 appears last in an `Or` statement, since that forces the alternates
516 (which correspond to the terminating case in a recursive function)
517 to be tested before the LMemo limit is reached.
518
519 This rewriting may change the order in which different results for
520 an ambiguous grammar are returned.
521
522 `conservative` refers to the algorithm used to detect loops; False
523 may classify some left--recursive loops as right--recursive.
524 '''
525
529
550
553 '''
554 Add/replace named arguments while cloning.
555
556 This rewriter is not exclusive - several different instances canb be
557 defined in parallel.
558 '''
559
560 - def __init__(self, type_, **extra_kargs):
561 super(SetArguments, self).__init__(Rewriter.SET_ARGUMENTS,
562 fmt('SetArguments({0}, {1})', type_, extra_kargs), False)
563 self.type = type_
564 self.extra_kargs = extra_kargs
565
567 def new_clone(i, j, node, args, kargs):
568 '''
569 As clone, but add in any extra kargs if the node is an instance
570 of the given type.
571 '''
572 if isinstance(node, self.type):
573 for key in self.extra_kargs:
574 kargs[key] = self.extra_kargs[key]
575 return clone(i, j, node, args, kargs)
576 return clone_matcher(graph, new_clone)
577
580 '''
581 Replace given matchers if all Matcher arguments are subclasses of
582 `NoTrampolineWrapper`
583
584 `spec` is a map from original matcher type to the replacement.
585 '''
586
596
598 def new_clone(i, j, node, args, kargs):
599 type_, ok = None, False
600 for parent in self.spec:
601 if is_child(node, parent):
602 type_ = self.spec[parent]
603 if type_:
604 ok = True
605 for arg in args:
606 if isinstance(arg, Matcher) and not \
607 isinstance(arg, NoTrampoline):
608 ok = False
609 for name in kargs:
610 arg = kargs[name]
611 if isinstance(arg, Matcher) and not \
612 isinstance(arg, NoTrampoline):
613 ok = False
614 if not ok:
615 type_ = type(node)
616 try:
617 copy = type_(*args, **kargs)
618 copy_standard_attributes(node, copy)
619 return copy
620 except TypeError as err:
621 raise TypeError(fmt('Error cloning {0} with ({1}, {2}): {3}',
622 type_, args, kargs, err))
623 return clone_matcher(graph, new_clone)
624
627 '''
628 If the parser fails, raise an error at the maxiumum depth.
629
630 `eos` controls whether or not the entire input must be consumed for the
631 parse to be considered a success.
632 '''
633
638
642
645 '''
646 Provide statistics and access by type to nodes.
647 '''
648
650 self.loops = 0
651 self.leaves = 0
652 self.total = 0
653 self.others = 0
654 self.duplicates = 0
655 self.unhashable = 0
656 self.types = {}
657 self.__known = set()
658 if matcher is not None:
659 self.add_all(matcher)
660
661 - def add(self, type_, node):
662 '''
663 Add a node of a given type.
664 '''
665 try:
666 node_type = matcher_type(node)
667 except MatcherTypeException:
668 node_type = type(node)
669 if type_ & LEAF:
670 self.leaves += 1
671 if type_ & NONTREE and is_child(node_type, Matcher, fail=False):
672 self.loops += 1
673 try:
674 if node not in self.__known:
675 self.__known.add(node)
676 if node_type not in self.types:
677 self.types[node_type] = set()
678 self.types[node_type].add(node)
679 if is_child(node_type, Matcher):
680 self.total += 1
681 else:
682 self.others += 1
683 else:
684 self.duplicates += 1
685 except:
686 self.unhashable += 1
687
694
696 counts = fmt('total: {total:3d}\n'
697 'leaves: {leaves:3d}\n'
698 'loops: {loops:3d}\n'
699 'duplicates: {duplicates:3d}\n'
700 'others: {others:3d}\n'
701 'unhashable: {unhashable:3d}\n', **self.__dict__)
702 keys = list(self.types.keys())
703 keys.sort(key=repr)
704 types = '\n'.join([fmt('{0:40s}: {1:3d}', key, len(self.types[key]))
705 for key in keys])
706 return counts + types
707
709 '''
710 Quick and dirty equality
711 '''
712 return str(self) == str(other)
713
716 '''
717 Avoid using graph code (so we can check that...)
718 '''
719
721 self.total = 0
722 self.leaves = 0
723 self.duplicates = 0
724 self.types = {}
725
726 known = set()
727 stack = [node]
728 while stack:
729 node = stack.pop()
730 if node in known:
731 self.duplicates += 1
732 else:
733 known.add(node)
734 self.total += 1
735 type_ = type(node)
736 if type_ not in self.types:
737 self.types[type_] = 0
738 self.types[type_] += 1
739 children = [child for child in node if isinstance(child, Matcher)]
740 if not children:
741 self.leaves += 1
742 else:
743 stack.extend(children)
744
746 counts = fmt('total: {total:3d}\n'
747 'leaves: {leaves:3d}\n'
748 'duplicates: {duplicates:3d}\n', **self.__dict__)
749 keys = list(self.types.keys())
750 keys.sort(key=repr)
751 types = '\n'.join([fmt('{0:40s}: {1:3d}', key, self.types[key])
752 for key in keys])
753 return counts + types
754
756 '''
757 Quick and dirty equality
758 '''
759 return str(self) == str(other)
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837