1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 A simple regular expression engine in pure Python.
32
33 This takes a set of regular expressions (only the most basic functionality is
34 supported) and generates a finite state machines that can match them against
35 a stream of values.
36
37 Although simple (and slow compared to a C version), it has some advantages
38 from being implemented in Python.
39
40 * It can use a variety of alphabets - it is not restricted to strings.
41 It could, for example, match lists of integers, or sequences of tokens.
42
43 * The NFA implementation can yield intermediate matches.
44
45 * It is extensible.
46
47 The classes here form a layered series of representations for regular
48 expressions.
49
50 The first layer contains the classes `Sequence`, `Option`, `Repeat`, `Choice`,
51 `Labelled` and `Regexp`. These are a high level representation that will
52 typically be constructed by an alphabet-specific parser.
53
54 The second layer encodes the regular expression as a non-deterministic
55 finite automaton (NFA) with empty transitions. This is a "natural"
56 representation of the transitions that can be generated by the first layer.
57
58 The third layer encodes the regular expression as a deterministic finite
59 automaton (DFA). This is a more "machine friendly" representation that allows
60 for more efficient matching. It is generated by transforming a representation
61 from the second layer.
62 '''
63
64 from abc import ABCMeta, abstractmethod
65 from collections import deque
66 from itertools import chain
67
68 from lepl.stream.core import s_next, s_empty, s_line
69 from lepl.support.graph import ConstructorWalker, clone, Clone
70 from lepl.support.node import Node
71 from lepl.regexp.interval import Character, TaggedFragments, IntervalMap,\
72 _Character
73 from lepl.support.lib import fmt, basestring, str, LogMixin
74
75
76
77
78
79 _Alphabet = ABCMeta('_Alphabet', (object, ), {})
80
81
82
83
84 -class Alphabet(LogMixin, _Alphabet):
85
86 '''
87 Regular expressions are generalised over alphabets, which describe the set
88 of acceptable characters.
89
90 The characters in an alphabet must have an order, which is defined by
91 `__lt__` on the character instances themselves (equality and inequality
92 are also assumed to be defined). In addition, `before(c)` and `after(c)`
93 below should give the previous and subsequent characters in the ordering
94 and `min` and `max` should give the two most extreme characters.
95
96 Internally, within the routines here, ranges of characters are used.
97 These are encoded as pairs of values `(a, b)` which are inclusive.
98 Each pair is called an "interval".
99
100 Alphabets include additional methods used for display and may also have
101 methods specific to a given instance (typically named with an initial
102 underscore).
103 '''
104
106 self.__min = min_
107 self.__max = max_
108 super(Alphabet, self).__init__()
109
110 @property
112 '''
113 The "smallest" character.
114 '''
115 return self.__min
116
117 @property
119 '''
120 The "largest" character.
121 '''
122 return self.__max
123
124 @abstractmethod
126 '''
127 Must return the character before c in the alphabet. Never called with
128 min (assuming input data are in range).
129 '''
130
131 @abstractmethod
133 '''
134 Must return the character after c in the alphabet. Never called with
135 max (assuming input data are in range).
136 '''
137
138 @abstractmethod
140 '''
141 This must fully describe the data in the intervals (it is used to
142 hash the data).
143 '''
144
146 '''
147 Return a list of intervals that describes the complement of the given
148 interval. Note - the input interval must be ordered (and the result
149 will be ordered too).
150 '''
151 if not intervals:
152 return [(self.min, self.max)]
153 inverted = []
154 (a, last) = intervals[0]
155 if a != self.min:
156 inverted.append((self.min, self.before(a)))
157 for (a, b) in intervals[1:]:
158 inverted.append((self.after(last), self.before(a)))
159 last = b
160 if last != self.max:
161 inverted.append((self.after(last), self.max))
162 self._debug(fmt('invert {0} -> {1}', intervals, inverted))
163 return inverted
164
166 '''
167 This is called for extensions for the form (*NAME) where NAME is any
168 sequence of capitals. It should return a character range. Further
169 uses of (*...) are still to be decided.
170 '''
171 raise RegexpError(fmt('Extension {0!r} not supported by {1!s}',
172 text, self.__class__))
173
174 @abstractmethod
176 '''
177 This must fully describe the data in the children (it is used to
178 hash the data).
179 '''
180
181 @abstractmethod
183 '''
184 This must fully describe the data in the children (it is used to
185 hash the data).
186 '''
187
188 @abstractmethod
190 '''
191 This must fully describe the data in the children (it is used to
192 hash the data).
193 '''
194
195 @abstractmethod
197 '''
198 This must fully describe the data in the children (it is used to
199 hash the data).
200 '''
201
202 @abstractmethod
204 '''
205 This must fully describe the data in the child (it is used to
206 hash the data).
207 '''
208
209 @abstractmethod
210 - def join(self, chars):
211 '''
212 Join a list of characters into a string (or the equivalent).
213 '''
214
215 @abstractmethod
216 - def parse(self, regexp):
217 '''
218 Generate a Sequence from the given regexp text.
219 '''
220
223 '''
224 A sequence of Characters, etc.
225 '''
226
227 - def __init__(self, alphabet, *children):
237
239 '''
240 Construct a string representation of self.
241 '''
242 return self.alphabet.fmt_sequence(self)
243
246
248 return hash(self.__str)
249
250 - def build(self, graph, before, after):
263
264 @staticmethod
272
275
276
277
278 RegexpGraphNode = ABCMeta('RegexpGraphNode', (object, ), {})
279 RegexpGraphNode.register(_Character)
280 RegexpGraphNode.register(Sequence)
281
282
283 -class Option(Sequence):
284 '''
285 An optional sequence of Characters (or sequences).
286 '''
287
289 '''
290 Construct a string representation of self.
291 '''
292 return self.alphabet.fmt_option(self)
293
294 - def build(self, graph, before, after):
300
301
302 -class Empty(Sequence):
303 '''
304 Matches an empty sequence.
305 '''
306
308 '''
309 Construct a string representation of self.
310 '''
311 return self.alphabet.fmt_sequence([])
312
313 - def build(self, graph, before, after):
318
321 '''
322 A sequence of Characters (or sequences) that can repeat 0 or more times.
323 '''
324
326 '''
327 Construct a string representation of self.
328 '''
329 return self.alphabet.fmt_repeat(self)
330
331 - def build(self, graph, before, after):
340
341
342 -def Choice(alphabet, *children):
343 '''
344 Encase a choice in a sequence so that it can be treated in the same
345 way as other components (if we don't do this, then passing it to another
346 sequence will split it into constituents and then sequence them).
347 '''
348 return Sequence(alphabet, _Choice(alphabet, *children))
349
352 '''
353 A set of alternative Characters (or sequences).
354 '''
355
357 '''
358 Construct a string representation of self.
359 '''
360 return self.alphabet.fmt_choice(self)
361
362 - def build(self, graph, before, after):
363 '''
364 Connect in parallel from start to end, but add extra nodes so that
365 the sequence is tried in order (because evaluation tries empty
366 transitions last) and that loops don't return to start.
367 '''
368 if self:
369 last = self[-1]
370 for child in self:
371
372 if child is not last:
373 node = graph.new_node()
374 child.build(graph, before, after)
375 if child is not last:
376 graph.connect(before, node)
377 before = node
378
379
380
381
382
383
384
385
386
387
388
389
390
391 -class Labelled(Sequence):
392 '''
393 A labelled sequence. Within our limited implementation these are
394 restricted to (1) being children of Regexp and (2) not being followed
395 by any other sequence. Their termination defines terminal nodes.
396 '''
397
398 - def __init__(self, alphabet, label, *children):
401
408
409 - def build(self, graph, before, after):
418
421 '''
422 An error associated with (problems in) the regexp implementation.
423 '''
424 pass
425
428 '''
429 Compile an expression.
430 '''
431
432 - def __init__(self, expression, alphabet):
436
446
448 '''
449 Generate a DFA-based matcher (faster than NFA, but returns only a
450 single, greedy match).
451 '''
452 self._debug(fmt('compiling to dfa: {0}', self))
453 ngraph = NfaGraph(self.alphabet)
454 self.expression.build(ngraph, ngraph.new_node(), ngraph.new_node())
455 self._debug(fmt('nfa graph: {0}', ngraph))
456 dgraph = NfaToDfa(ngraph, self.alphabet).dfa
457 self._debug(fmt('dfa graph: {0}', dgraph))
458 return DfaPattern(dgraph, self.alphabet)
459
461 '''
462 Generate a matcher that wraps the standard "re" package.
463 '''
464 return RePattern(str(self), self.alphabet)
465
467 '''
468 Show the expression itself.
469 '''
470 return str(self.expression)
471
472 @staticmethod
485
486 @staticmethod
487 - def single(alphabet, regexp, label='label'):
493
494 @staticmethod
504
508 '''
509 Describes a collection of connected nodes.
510 '''
511
513 super(BaseGraph, self).__init__()
514 self._alphabet = alphabet
515 self._next_node = 0
516 self._transitions = {}
517 self._terminals = {}
518
520 '''
521 Indicate that the node terminates with the given labels.
522 '''
523 assert node < self._next_node
524 if node not in self._terminals:
525 self._terminals[node] = set()
526 self._terminals[node].update(labels)
527
529 '''
530 Get a new node.
531 '''
532 node = self._next_node
533 self._next_node += 1
534 return node
535
536 - def connect(self, src, dest, edge):
537 '''
538 Define a connection between src and dest, with an edge
539 value (a character).
540 '''
541 assert src < self._next_node
542 assert dest < self._next_node
543 if src not in self._transitions:
544 self._transitions[src] = []
545 self._transitions[src].append((dest, edge))
546
548 '''
549 An iterator over all nodes.
550 '''
551 return iter(range(self._next_node))
552
554 '''
555 An iterator over all non-empty transitions from src - returns
556 (dest, edge) pairs.
557 '''
558 return iter(self._transitions.get(src, []))
559
561 '''
562 An iterator over the terminals for the give node.
563 '''
564 return iter(self._terminals.get(node, []))
565
567 return self._next_node
568
571 '''
572 Describes a NFA with epsilon (empty) transitions.
573 '''
574
578
586
587 - def connect(self, src, dest, edge=None):
588 '''
589 Define a connection between src and dest, with an optional edge
590 value (a character).
591 '''
592 if edge:
593 super(NfaGraph, self).connect(src, dest, edge)
594 else:
595 assert src < self._next_node
596 assert dest < self._next_node
597 assert src not in self._terminals, 'Source is terminal'
598 self._empty_transitions[src].add(dest)
599
601 '''
602 An iterator over all empty transitions from src.
603 '''
604 return iter(self._empty_transitions.get(src, []))
605
621
632
634 '''
635 Example:
636 0: 3, 4; 1: 2; 2(Tk1); 3: [{u'\x00'}-`b-{u'\U0010ffff'}]->3, 1;
637 4: {$}->5, 7; 5: 6; 6($); 7: {^}->10; 8: 9; 9(^); 10: 11;
638 11: [ ]->11, 8
639
640 Node 0 leads to 3 and 4 (both empty)
641 Node 1 leads to 2 (empty)
642 Node 2 is terminal, labelled with "Tk1"
643 Node 3 loops back to 3 for a character in the given range, or to 1
644 etc.
645 '''
646 lines = []
647 for node in self:
648 edges = []
649 for (dest, edge) in self.transitions(node):
650 edges.append(fmt('{0}->{1}', edge, dest))
651 for dest in self.empty_transitions(node):
652 edges.append(str(dest))
653 label = '' if self.terminal(node) is None \
654 else fmt('({0})', self.terminal(node))
655 if edges:
656 lines.append(
657 fmt('{0}{1}: {2}', node, label, ', '.join(edges)))
658 else:
659 lines.append(fmt('{0}{1}', node, label))
660 return '; '.join(lines)
661
664 '''
665 Given a graph this constructs a transition table and an associated
666 matcher. The matcher attempts to find longest matches but does not
667 guarantee termination (if a possible empty match is repeated).
668
669 Note that the matcher returns a triple, including label. This is not
670 the same interface as the matchers used in recursive descent parsing.
671
672 Evaluation order for transition:
673 - Transition with character, if defined
674 - Empty transition to largest numbered node
675 these ensure we do deepest match first.
676 '''
677
684
686 '''
687 Rewrite the graph as a transition table, with appropriate ordering.
688 '''
689 for src in self.__graph:
690
691
692 fragments = TaggedFragments(self.__alphabet)
693 for (dest, char) in self.__graph.transitions(src):
694 fragments.append(char, (dest, self.__graph.terminal(dest)))
695 map_ = IntervalMap()
696 for (interval, dts) in fragments:
697 map_[interval] = dts
698
699 empties = [(dest, self.__graph.terminal(dest))
700
701
702 for dest in sorted(self.__graph.empty_transitions(src))]
703 self.__table[src] = (map_, empties)
704
705 - def match(self, stream):
706 '''
707 Use the table to match a stream.
708
709 The stack holds the current state, which is consumed from left to
710 right. An entry on the stack contains:
711
712 - map_ - a map from character to [(dest state, terminals)]
713
714 - matched - the [(dest state, terminals)] generated by the map for
715 a given character
716
717 - empties - empty transitions for this state
718
719 - match - the current match, as a list of tokens consumed from the
720 stream
721
722 - stream - the current stream
723 '''
724
725 stack = deque()
726 (map_, empties) = self.__table[0]
727 stack.append((map_, None, empties, [], stream))
728 while stack:
729
730 (map_, matched, empties, match, stream) = stack.pop()
731 if not map_ and not matched and not empties:
732
733 pass
734 elif map_:
735
736 stack.append((None, None, empties, match, stream))
737
738 if not s_empty(stream):
739 (value, next_stream) = s_next(stream)
740 try:
741 matched = map_[value]
742 if matched:
743 stack.append((None, matched, None,
744 match + [value], next_stream))
745 except IndexError:
746 pass
747 elif matched:
748 (dest, terminal) = matched[-1]
749
750 if len(matched) > 1:
751 stack.append((map_, matched[:-1], empties, match, stream))
752
753 (map_, empties) = self.__table[dest]
754 stack.append((map_, None, empties, match, stream))
755 if terminal:
756 yield (terminal, self.__alphabet.join(match), stream)
757 else:
758
759 (dest, terminal) = empties[-1]
760
761 if len(empties) > 1:
762 stack.append((map_, matched, empties[:-1], match, stream))
763
764 (map_, empties) = self.__table[dest]
765 stack.append((map_, None, empties, match, stream))
766 if terminal:
767 yield (terminal, self.__alphabet.join(match), stream)
768
771
774 '''
775 Describes a DFA where each node is a collection of NFA nodes.
776 '''
777
782
783 - def node(self, nfa_nodes):
784 '''
785 Add a node, defined as a set of nfa nodes. If the set already exists,
786 (False, old node) is returned, with the existing DFA node.
787 Otherwise (True, new node) is returned.
788 '''
789 new = nfa_nodes not in self._nfa_to_dfa
790 if new:
791 dfa_node = self.new_node()
792 self._nfa_to_dfa[nfa_nodes] = dfa_node
793 self._dfa_to_nfa[dfa_node] = nfa_nodes
794 return (new, self._nfa_to_dfa[nfa_nodes])
795
797 '''
798 An iterator over NFA nodes associated with the given DFA node.
799 '''
800 return iter(self._dfa_to_nfa[node])
801
803 lines = []
804 for node in self:
805 edges = []
806 for (dest, edge) in self.transitions(node):
807 edges.append(fmt('{0}->{1}', edge, dest))
808 nodes = [n for n in self.nfa_nodes(node)]
809 edges = ' ' + ','.join(edges) if edges else ''
810 labels = list(self.terminals(node))
811 labels = fmt('({0})', ','.join(str(label) for label in labels)) \
812 if labels else ''
813 lines.append(fmt('{0}{1}: {2}{3}', node, labels, nodes, edges))
814 return '; '.join(lines)
815
820 '''
821 Convert a NFA graph to a DFA graph (uses the usual superset approach but
822 does combination of states in a way that seems to fit better with the idea
823 of character ranges).
824 '''
825
832
834 '''
835 This is the driver for the "usual" superset algorithm - we find
836 all states that correspond to a DFA state, then look at transitions
837 to other states. This repeats until we have covered all the
838 different combinations of NFA states that could occur.
839 '''
840 stack = deque()
841
842 (nfa_nodes, terminals) = self.__nfa.connected([0])
843 (_, src) = self.dfa.node(nfa_nodes)
844 stack.append((src, nfa_nodes, terminals))
845
846 while stack:
847 (src, nfa_nodes, terminals) = stack.pop()
848 self.dfa.terminate(src, terminals)
849 fragments = self.__fragment_transitions(nfa_nodes)
850 groups = self.__group_fragments(fragments)
851 self.__add_groups(src, groups, stack)
852
854 '''
855 From the given nodes we can accumulate the destination nodes and
856 terminals associated with each transition (edge/character).
857 At the same time we separate the character matches into non-overlapping
858 fragments.
859 '''
860 fragments = TaggedFragments(self.__alphabet)
861 for nfa_node in nfa_nodes:
862 for (dest, edge) in self.__nfa.transitions(nfa_node):
863 (nodes, terminals) = self.__nfa.connected([dest])
864 fragments.append(edge, (nodes, list(terminals)))
865 return fragments
866
867 @staticmethod
869 '''
870 For each fragment, we for the complete set of possible destinations
871 and associated terminals. Since it is possible that more than one
872 fragment will lead to the same set of target nodes we group all
873 related fragments together.
874 '''
875
876 groups = {}
877 for (interval, nts) in fragments:
878 if len(nts) > 1:
879
880 (nodes, terminals) = (set(), set())
881 for (n, t) in nts:
882 nodes.update(n)
883 terminals.update(t)
884 nodes = frozenset(nodes)
885 else:
886 (nodes, terminals) = nts[0]
887 if nodes not in groups:
888 groups[nodes] = (set(), set())
889 groups[nodes][0].add(interval)
890 groups[nodes][1].update(terminals)
891 return groups
892
905
908 '''
909 Create a lookup table for a DFA and a matcher to evaluate it.
910 '''
911
919
921 '''
922 Construct a transition table.
923 '''
924 for src in self.__graph:
925 row = IntervalMap()
926 for (dest, char) in self.__graph.transitions(src):
927
928 labels = tuple(self.__graph.terminals(dest))
929 for interval in char:
930 row[interval] = (dest, labels)
931 self.__table[src] = row
932
933 - def match(self, stream_in):
934 '''
935 Match against the stream.
936 '''
937 try:
938 (terminals, size, _) = self.size_match(stream_in)
939 (value, stream_out) = s_next(stream_in, count=size)
940 return (terminals, value, stream_out)
941 except TypeError:
942
943 return None
944
946 '''
947 Match against the stream, but return the length of the match.
948 '''
949 state = 0
950 size = 0
951 longest = (self.__empty_labels, 0, stream) \
952 if self.__empty_labels else None
953 (line, _) = s_line(stream, True)
954 while size < len(line):
955 future = self.__table[state][line[size]]
956 if future is None:
957 break
958
959 (state, terminals) = future
960 size += 1
961
962
963 if terminals:
964 try:
965 (_, next_stream) = s_next(stream, count=size)
966 longest = (terminals, size, next_stream)
967 except StopIteration:
968 pass
969 return longest
970
973