Package lepl :: Package regexp :: Module core
[hide private]
[frames] | [no frames]

Source Code for Module lepl.regexp.core

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  A simple regular expression engine in pure Python. 
 32   
 33  This takes a set of regular expressions (only the most basic functionality is 
  34  supported) and generates finite state machines that can match them against 
 35  a stream of values. 
 36   
 37  Although simple (and slow compared to a C version), it has some advantages  
 38  from being implemented in Python. 
 39   
 40   * It can use a variety of alphabets - it is not restricted to strings.   
 41     It could, for example, match lists of integers, or sequences of tokens. 
 42   
 43   * The NFA implementation can yield intermediate matches. 
 44   
 45   * It is extensible. 
 46   
 47  The classes here form a layered series of representations for regular  
 48  expressions. 
 49   
 50  The first layer contains the classes `Sequence`, `Option`, `Repeat`, `Choice`, 
 51  `Labelled` and `Regexp`.  These are a high level representation that will 
 52  typically be constructed by an alphabet-specific parser. 
 53   
 54  The second layer encodes the regular expression as a non-deterministic  
 55  finite automaton (NFA) with empty transitions.  This is a "natural"  
 56  representation of the transitions that can be generated by the first layer. 
 57   
 58  The third layer encodes the regular expression as a deterministic finite 
 59  automaton (DFA).  This is a more "machine friendly" representation that allows  
 60  for more efficient matching.  It is generated by transforming a representation 
 61  from the second layer. 
 62  ''' 
 63   
 64  from abc import ABCMeta, abstractmethod 
 65  from collections import deque 
 66  from itertools import chain 
 67   
 68  from lepl.stream.core import s_next, s_empty, s_line 
 69  from lepl.support.graph import ConstructorWalker, clone, Clone 
 70  from lepl.support.node import Node 
 71  from lepl.regexp.interval import Character, TaggedFragments, IntervalMap,\ 
 72      _Character 
 73  from lepl.support.lib import fmt, basestring, str, LogMixin 
 74   
 75   
# pylint: disable-msg=C0103
# Python 2.6 has no ``class Alphabet(metaclass=ABCMeta)`` syntax, so the
# abstract base is built by calling the metaclass directly (equivalent to
# the commented declaration below on Python 3):
#class Alphabet(metaclass=ABCMeta):
_Alphabet = ABCMeta('_Alphabet', (object, ), {})
# pylint: disable-msg=E1002
# pylint can't find ABCs
class Alphabet(LogMixin, _Alphabet):
    '''
    Base class for the alphabets over which regular expressions are
    generalised; an alphabet describes the set of acceptable characters.

    The characters in an alphabet must have an order, defined by `__lt__`
    on the character instances themselves (equality and inequality are
    also assumed to be defined).  `before(c)` and `after(c)` give the
    previous and subsequent characters in that ordering, and `min` and
    `max` give the two most extreme characters.

    Internally, ranges of characters are used.  These are encoded as pairs
    of values `(a, b)` which are inclusive; each pair is called an
    "interval".

    Alphabets include additional methods used for display and may also
    have methods specific to a given instance (typically named with an
    initial underscore).
    '''

    def __init__(self, min_, max_):
        # extremes stored privately, exposed read-only via properties below
        self.__min = min_
        self.__max = max_
        super(Alphabet, self).__init__()

    @property
    def min(self):
        '''
        The "smallest" character.
        '''
        return self.__min

    @property
    def max(self):
        '''
        The "largest" character.
        '''
        return self.__max

    @abstractmethod
    def before(self, c):
        '''
        Must return the character before c in the alphabet.  Never called
        with min (assuming input data are in range).
        '''

    @abstractmethod
    def after(self, c):
        '''
        Must return the character after c in the alphabet.  Never called
        with max (assuming input data are in range).
        '''

    @abstractmethod
    def fmt_intervals(self, intervals):
        '''
        This must fully describe the data in the intervals (it is used to
        hash the data).
        '''

    def invert(self, intervals):
        '''
        Return a list of intervals that describes the complement of the
        given intervals.  Note - the input intervals must be ordered (and
        the result will be ordered too).
        '''
        if not intervals:
            # nothing excluded, so the complement is the whole alphabet
            return [(self.min, self.max)]
        complement = []
        (low, prev_high) = intervals[0]
        if low != self.min:
            complement.append((self.min, self.before(low)))
        # every gap between consecutive input intervals joins the result
        for (low, high) in intervals[1:]:
            complement.append((self.after(prev_high), self.before(low)))
            prev_high = high
        if prev_high != self.max:
            complement.append((self.after(prev_high), self.max))
        self._debug(fmt('invert {0} -> {1}', intervals, complement))
        return complement

    def extension(self, text):
        '''
        This is called for extensions of the form (*NAME) where NAME is any
        sequence of capitals.  It should return a character range.  Further
        uses of (*...) are still to be decided.
        '''
        raise RegexpError(fmt('Extension {0!r} not supported by {1!s}',
                              text, self.__class__))

    @abstractmethod
    def fmt_sequence(self, children):
        '''
        This must fully describe the data in the children (it is used to
        hash the data).
        '''

    @abstractmethod
    def fmt_repeat(self, children):
        '''
        This must fully describe the data in the children (it is used to
        hash the data).
        '''

    @abstractmethod
    def fmt_choice(self, children):
        '''
        This must fully describe the data in the children (it is used to
        hash the data).
        '''

    @abstractmethod
    def fmt_option(self, children):
        '''
        This must fully describe the data in the children (it is used to
        hash the data).
        '''

    @abstractmethod
    def fmt_label(self, label, child):
        '''
        This must fully describe the data in the child (it is used to
        hash the data).
        '''

    @abstractmethod
    def join(self, chars):
        '''
        Join a list of characters into a string (or the equivalent).
        '''

    @abstractmethod
    def parse(self, regexp):
        '''
        Generate a Sequence from the given regexp text.
        '''
class Sequence(Node):
    '''
    A sequence of Characters, etc.

    Children are held by the `Node` superclass; the string form (which is
    also used for hashing) is computed once, at construction time, via the
    alphabet's formatting methods.
    '''
    def __init__(self, alphabet, *children):
        # the alphabet drives formatting (and hence hashing) below
        assert isinstance(alphabet, Alphabet)
        # pylint: disable-msg=W0142
        # the per-child RegexpGraphNode assertion is intentionally disabled
        # so that error nodes can be embedded among the children
        super(Sequence, self).__init__(*children)
        self.alphabet = alphabet
        self.state = None
        # cache the string form once; it is reused by __str__ and __hash__
        self.__str = self._build_str()
237
    def _build_str(self):
        '''
        Construct a string representation of self via the alphabet
        (subclasses override this to select different formatting).
        '''
        return self.alphabet.fmt_sequence(self)
243
    def __str__(self):
        # return the representation cached at construction time
        return self.__str
246
    def __hash__(self):
        # hash via the cached string form, which fully describes the data
        return hash(self.__str)
249
250 - def build(self, graph, before, after):
251 ''' 252 Connect in sequence. 253 ''' 254 if self: 255 src = before 256 last = self[-1] 257 for child in self: 258 dest = after if child is last else graph.new_node() 259 child.build(graph, src, dest) 260 src = dest 261 else: 262 graph.connect(before, after)
    @staticmethod
    def __clone(node, args, kargs):
        '''
        Cloning helper: prepend the alphabet to the constructor arguments
        of any node that carries one; nodes without an `alphabet`
        attribute are copied unchanged.
        '''
        try:
            args = list(args)
            args.insert(0, node.alphabet)
        except AttributeError:
            # not an alphabet-carrying node - clone as-is
            pass
        return clone(node, args, kargs)
272
273 - def clone(self):
275 276 277 #class RegexpGraphNode(metaclass=ABCMeta): 278 RegexpGraphNode = ABCMeta('RegexpGraphNode', (object, ), {}) 279 RegexpGraphNode.register(_Character) 280 RegexpGraphNode.register(Sequence)
class Option(Sequence):
    '''
    An optional sequence of Characters (or sequences).
    '''

    def _build_str(self):
        '''
        Format via the alphabet's "option" syntax.
        '''
        return self.alphabet.fmt_option(self)

    def build(self, graph, before, after):
        '''
        Wire in the underlying sequence, plus a direct (empty) transition
        from start to end so that the whole sequence can be skipped.
        '''
        super(Option, self).build(graph, before, after)
        graph.connect(before, after)
300
class Empty(Sequence):
    '''
    Matches an empty sequence.
    '''

    def _build_str(self):
        '''
        Format as an empty sequence (children are not shown).
        '''
        return self.alphabet.fmt_sequence([])

    def build(self, graph, before, after):
        '''
        Ignore any children; connect start directly to end with an empty
        transition.
        '''
        graph.connect(before, after)
318
class Repeat(Sequence):
    '''
    A sequence of Characters (or sequences) that can repeat 0 or more
    times.
    '''

    def _build_str(self):
        '''
        Format via the alphabet's "repeat" syntax.
        '''
        return self.alphabet.fmt_repeat(self)

    def build(self, graph, before, after):
        '''
        Build the body as a loop through a single hub node, with empty
        transitions in from `before` and out to `after` (so zero passes
        through the body are possible).
        '''
        hub = graph.new_node()
        graph.connect(before, hub)
        super(Repeat, self).build(graph, hub, hub)
        graph.connect(hub, after)
340
def Choice(alphabet, *children):
    '''
    Encase a choice in a sequence so that it can be treated in the same
    way as other components (if we don't do this, then passing it to
    another sequence would split it into its constituents and sequence
    them).
    '''
    return Sequence(alphabet, _Choice(alphabet, *children))
349
class _Choice(Sequence):
    '''
    A set of alternative Characters (or sequences).
    '''

    def _build_str(self):
        '''
        Format via the alphabet's "choice" syntax.
        '''
        return self.alphabet.fmt_choice(self)

    def build(self, graph, before, after):
        '''
        Wire the alternatives in parallel from start to end, but with
        extra intermediate nodes so that the alternatives are tried in
        order (evaluation tries empty transitions last) and so that loops
        don't return to the start.
        '''
        if not self:
            return
        final = self[-1]
        for alternative in self:
            # allocate the detour node BEFORE building the alternative, so
            # the numbering makes the empty transitions fire in order
            if alternative is not final:
                detour = graph.new_node()
            alternative.build(graph, before, after)
            if alternative is not final:
                graph.connect(before, detour)
                before = detour
378
class Labelled(Sequence):
    '''
    A labelled sequence.  Within our limited implementation these are
    restricted to (1) being children of Regexp and (2) not being followed
    by any other sequence.  Their termination defines terminal nodes.
    '''

    def __init__(self, alphabet, label, *children):
        # label must be set before the superclass constructor runs, since
        # that calls _build_str(), which reads it
        self.label = label
        super(Labelled, self).__init__(alphabet, *children)

    def _build_str(self):
        '''
        Format as a labelled sequence, via the alphabet.
        '''
        return self.alphabet.fmt_label(self.label,
                                       self.alphabet.fmt_sequence(self))

    def build(self, graph, before, after):
        '''
        Build as a plain sequence, but with an extra final empty
        transition to force any loops to complete before the terminal
        node, which is tagged with this sequence's label.
        '''
        tail = graph.new_node()
        super(Labelled, self).build(graph, before, tail)
        graph.connect(tail, after)
        graph.terminate(tail, [self.label])
418
class RegexpError(Exception):
    '''
    An error associated with (problems in) the regexp implementation.
    '''
425
class Compiler(LogMixin):
    '''
    Compile an expression to a matcher.
    '''

    def __init__(self, expression, alphabet):
        super(Compiler, self).__init__()
        self.expression = expression
        self.alphabet = alphabet

    def nfa(self):
        '''
        Generate a NFA-based matcher.
        '''
        self._debug(fmt('compiling to nfa: {0}', self))
        graph = NfaGraph(self.alphabet)
        start = graph.new_node()
        end = graph.new_node()
        self.expression.build(graph, start, end)
        self._debug(fmt('nfa graph: {0}', graph))
        return NfaPattern(graph, self.alphabet)

    def dfa(self):
        '''
        Generate a DFA-based matcher (faster than NFA, but returns only a
        single, greedy match).
        '''
        self._debug(fmt('compiling to dfa: {0}', self))
        nfa_graph = NfaGraph(self.alphabet)
        start = nfa_graph.new_node()
        end = nfa_graph.new_node()
        self.expression.build(nfa_graph, start, end)
        self._debug(fmt('nfa graph: {0}', nfa_graph))
        dfa_graph = NfaToDfa(nfa_graph, self.alphabet).dfa
        self._debug(fmt('dfa graph: {0}', dfa_graph))
        return DfaPattern(dfa_graph, self.alphabet)

    def re(self):
        '''
        Generate a matcher that wraps the standard "re" package.
        '''
        return RePattern(str(self), self.alphabet)

    def __str__(self):
        '''
        Show the expression itself.
        '''
        return str(self.expression)

    @staticmethod
    def _coerce(regexp, alphabet):
        '''
        Coerce a string to a parsed regexp; anything else is assumed to be
        parsed already and is passed through unchanged.
        '''
        if not isinstance(regexp, basestring):
            return regexp
        parsed = alphabet.parse(regexp)
        if not parsed:
            raise RegexpError(fmt('Cannot parse regexp {0!r} using {1}',
                                  regexp, alphabet))
        return parsed

    @staticmethod
    def single(alphabet, regexp, label='label'):
        '''
        Generate an instance for a single expression or sequence.
        '''
        return Compiler(Labelled(alphabet, label,
                                 *Compiler._coerce(regexp, alphabet)),
                        alphabet)

    @staticmethod
    def multiple(alphabet, regexps):
        '''
        Generate an instance for several (label, regexp) pairs.
        '''
        labelled = [Labelled(alphabet, label,
                             *Compiler._coerce(regexp, alphabet))
                    for (label, regexp) in regexps]
        return Compiler(Choice(alphabet, *labelled), alphabet)
504
class BaseGraph(object):
    '''
    Describes a collection of connected nodes.

    Nodes are plain integers, allocated sequentially; transitions and
    terminal labels are held in dicts keyed by node.
    '''

    def __init__(self, alphabet):
        super(BaseGraph, self).__init__()
        self._alphabet = alphabet
        self._next_node = 0
        # source node -> [(destination, edge), ...]
        self._transitions = {}
        # node -> set of labels
        self._terminals = {}

    def terminate(self, node, labels):
        '''
        Mark the node as terminating with the given labels.
        '''
        assert node < self._next_node
        self._terminals.setdefault(node, set()).update(labels)

    def new_node(self):
        '''
        Allocate and return a fresh node.
        '''
        fresh = self._next_node
        self._next_node += 1
        return fresh

    def connect(self, src, dest, edge):
        '''
        Record a transition from src to dest, labelled with an edge value
        (a character).
        '''
        assert src < self._next_node
        assert dest < self._next_node
        self._transitions.setdefault(src, []).append((dest, edge))

    def __iter__(self):
        '''
        An iterator over all nodes.
        '''
        return iter(range(self._next_node))

    def transitions(self, src):
        '''
        An iterator over all non-empty transitions from src - returns
        (dest, edge) pairs.
        '''
        return iter(self._transitions.get(src, []))

    def terminals(self, node):
        '''
        An iterator over the terminal labels of the given node.
        '''
        return iter(self._terminals.get(node, []))

    def __len__(self):
        return self._next_node
568
class NfaGraph(BaseGraph):
    '''
    Describes a NFA with epsilon (empty) transitions.
    '''

    def __init__(self, alphabet):
        super(NfaGraph, self).__init__(alphabet)
        # source node -> set of destinations reachable via empty transitions
        self._empty_transitions = {}

    def new_node(self):
        '''
        Allocate a fresh (unconnected) node with an empty set of epsilon
        transitions.
        '''
        fresh = super(NfaGraph, self).new_node()
        self._empty_transitions[fresh] = set()
        return fresh

    def connect(self, src, dest, edge=None):
        '''
        Record a transition from src to dest; without an edge value (a
        character) it is recorded as an empty (epsilon) transition.
        '''
        if edge:
            super(NfaGraph, self).connect(src, dest, edge)
            return
        assert src < self._next_node
        assert dest < self._next_node
        assert src not in self._terminals, 'Source is terminal'
        self._empty_transitions[src].add(dest)

    def empty_transitions(self, src):
        '''
        An iterator over all empty transitions from src.
        '''
        return iter(self._empty_transitions.get(src, []))

    def connected(self, nodes):
        '''
        Return (as a frozenset) all nodes reachable from the given nodes
        via empty transitions, together with an iterator over their
        terminal labels.
        '''
        reached = set()
        pending = deque(nodes)
        while pending:
            node = pending.pop()
            reached.add(node)
            for dest in self.empty_transitions(node):
                if dest not in reached:
                    reached.add(dest)
                    pending.append(dest)
        return (frozenset(reached),
                chain(*[self.terminals(node) for node in reached]))

    def terminal(self, node):
        '''
        The single terminal of the node (None if it has none).
        '''
        labels = list(self.terminals(node))
        if not labels:
            return None
        assert len(labels) == 1, 'Multiple terminals in NFA'
        return labels[0]

    def __str__(self):
        '''
        Example:
          0: 3, 4; 1: 2; 2(Tk1); 3: [a-z]->3, 1

        Node 0 has empty transitions to 3 and 4; node 2 is terminal,
        labelled "Tk1"; node 3 loops back to itself for a character in the
        given range, or moves (empty) to 1.
        '''
        descriptions = []
        for node in self:
            edges = [fmt('{0}->{1}', edge, dest)
                     for (dest, edge) in self.transitions(node)]
            edges.extend(str(dest) for dest in self.empty_transitions(node))
            terminal = self.terminal(node)
            label = '' if terminal is None else fmt('({0})', terminal)
            if edges:
                descriptions.append(
                    fmt('{0}{1}: {2}', node, label, ', '.join(edges)))
            else:
                descriptions.append(fmt('{0}{1}', node, label))
        return '; '.join(descriptions)
661
class NfaPattern(LogMixin):
    '''
    Given a graph this constructs a transition table and an associated
    matcher.  The matcher attempts to find longest matches but does not
    guarantee termination (if a possible empty match is repeated).

    Note that the matcher returns a triple, including label.  This is not
    the same interface as the matchers used in recursive descent parsing.

    Evaluation order for transitions:
    - transition with character, if defined
    - empty transition to the largest numbered node
    these ensure we do the deepest match first.
    '''

    def __init__(self, graph, alphabet):
        super(NfaPattern, self).__init__()
        self.__graph = graph
        self.__alphabet = alphabet
        self.__table = {}
        self.__build_table()

    def __build_table(self):
        '''
        Rewrite the graph as a transition table, with appropriate ordering.
        '''
        for src in self.__graph:
            # construct an interval map of possible destinations and
            # terminals, given a character
            fragments = TaggedFragments(self.__alphabet)
            for (dest, char) in self.__graph.transitions(src):
                fragments.append(char, (dest, self.__graph.terminal(dest)))
            map_ = IntervalMap()
            for (interval, dts) in fragments:
                map_[interval] = dts
            # collect empty transitions
            empties = [(dest, self.__graph.terminal(dest))
                       # ordering here is the reverse of what is required,
                       # which is ok because we use empties[-1] below
                       for dest in sorted(self.__graph.empty_transitions(src))]
            self.__table[src] = (map_, empties)

    def match(self, stream):
        '''
        Use the table to match a stream.

        The stack holds the current state, which is consumed from left to
        right.  An entry on the stack contains:

        - map_ - a map from character to [(dest state, terminals)]

        - matched - the [(dest state, terminals)] generated by the map for
          a given character

        - empties - empty transitions for this state

        - match - the current match, as a list of tokens consumed from the
          stream

        - stream - the current stream
        '''
        #self._debug(str(self.__table))
        stack = deque()
        (map_, empties) = self.__table[0]
        stack.append((map_, None, empties, [], stream))
        while stack:
            #self._debug(str(stack))
            (map_, matched, empties, match, stream) = stack.pop()
            if not map_ and not matched and not empties:
                # if we have no more transitions, drop this entry
                pass
            elif map_:
                # re-add the empties with the old match, tried later
                stack.append((None, None, empties, match, stream))
                # and try matching a character first
                if not s_empty(stream):
                    (value, next_stream) = s_next(stream)
                    try:
                        matched = map_[value]
                        if matched:
                            stack.append((None, matched, None,
                                          match + [value], next_stream))
                    except IndexError:
                        # character not covered by the map - no transition
                        pass
            elif matched:
                # expand the highest-priority character destination
                (dest, terminal) = matched[-1]
                # add back the reduced matched list
                if len(matched) > 1: # avoid discard iteration
                    stack.append((map_, matched[:-1], empties, match, stream))
                # and expand this destination
                (map_, empties) = self.__table[dest]
                stack.append((map_, None, empties, match, stream))
                if terminal:
                    yield (terminal, self.__alphabet.join(match), stream)
            else:
                # we must have an empty transition; take the largest node
                (dest, terminal) = empties[-1]
                # add back the reduced empties list
                if len(empties) > 1: # avoid discard iteration
                    stack.append((map_, matched, empties[:-1], match, stream))
                # and expand this destination
                (map_, empties) = self.__table[dest]
                stack.append((map_, None, empties, match, stream))
                if terminal:
                    yield (terminal, self.__alphabet.join(match), stream)

    def __repr__(self):
        return '<NFA>'
771
class DfaGraph(BaseGraph):
    '''
    Describes a DFA where each node is a collection of NFA nodes.
    '''

    def __init__(self, alphabet):
        super(DfaGraph, self).__init__(alphabet)
        # dfa node -> set of nfa nodes
        self._dfa_to_nfa = {}
        # set of nfa nodes -> dfa node
        self._nfa_to_dfa = {}

    def node(self, nfa_nodes):
        '''
        Intern a DFA node for a set of NFA nodes.  If the set already
        exists, (False, old node) is returned; otherwise a node is
        allocated and (True, new node) is returned.
        '''
        try:
            return (False, self._nfa_to_dfa[nfa_nodes])
        except KeyError:
            fresh = self.new_node()
            self._nfa_to_dfa[nfa_nodes] = fresh
            self._dfa_to_nfa[fresh] = nfa_nodes
            return (True, fresh)

    def nfa_nodes(self, node):
        '''
        An iterator over NFA nodes associated with the given DFA node.
        '''
        return iter(self._dfa_to_nfa[node])

    def __str__(self):
        descriptions = []
        for node in self:
            edges = [fmt('{0}->{1}', edge, dest)
                     for (dest, edge) in self.transitions(node)]
            nfa = [n for n in self.nfa_nodes(node)]
            edges = ' ' + ','.join(edges) if edges else ''
            labels = list(self.terminals(node))
            labels = fmt('({0})', ','.join(str(label) for label in labels)) \
                if labels else ''
            descriptions.append(
                fmt('{0}{1}: {2}{3}', node, labels, nfa, edges))
        return '; '.join(descriptions)
815
# pylint: disable-msg=R0903
# this is complex enough
class NfaToDfa(object):
    '''
    Convert a NFA graph to a DFA graph (uses the usual superset approach
    but does combination of states in a way that seems to fit better with
    the idea of character ranges).
    '''

    def __init__(self, nfa, alphabet):
        super(NfaToDfa, self).__init__()
        self.__nfa = nfa
        self.__alphabet = alphabet
        # the result is exposed as the `dfa` attribute
        self.dfa = DfaGraph(alphabet)
        self.__build_graph()

    def __build_graph(self):
        '''
        This is the driver for the "usual" superset algorithm - we find
        all NFA states that correspond to a DFA state, then look at
        transitions to other states.  This repeats until we have covered
        all the different combinations of NFA states that could occur.
        '''
        stack = deque() # (dfa node, set(nfa nodes), set(terminals))
        # start with the initial node (NFA node 0 plus its epsilon closure)
        (nfa_nodes, terminals) = self.__nfa.connected([0])
        (_, src) = self.dfa.node(nfa_nodes)
        stack.append((src, nfa_nodes, terminals))
        # continue until all states are covered
        while stack:
            (src, nfa_nodes, terminals) = stack.pop()
            self.dfa.terminate(src, terminals)
            fragments = self.__fragment_transitions(nfa_nodes)
            groups = self.__group_fragments(fragments)
            self.__add_groups(src, groups, stack)

    def __fragment_transitions(self, nfa_nodes):
        '''
        From the given nodes we can accumulate the destination nodes and
        terminals associated with each transition (edge/character).  At
        the same time we separate the character matches into
        non-overlapping fragments.
        '''
        fragments = TaggedFragments(self.__alphabet)
        for nfa_node in nfa_nodes:
            for (dest, edge) in self.__nfa.transitions(nfa_node):
                # expand each destination through its epsilon closure
                (nodes, terminals) = self.__nfa.connected([dest])
                fragments.append(edge, (nodes, list(terminals)))
        return fragments

    @staticmethod
    def __group_fragments(fragments):
        '''
        For each fragment, we form the complete set of possible
        destinations and associated terminals.  Since more than one
        fragment may lead to the same set of target nodes, all related
        fragments are grouped together.
        '''
        # map from set(nfa nodes) to (set(intervals), set(terminals))
        groups = {}
        for (interval, nts) in fragments:
            if len(nts) > 1:
                # collect all nodes and terminals for the fragment
                (nodes, terminals) = (set(), set())
                for (n, t) in nts:
                    nodes.update(n)
                    terminals.update(t)
                # frozenset so the node set can key the groups dict
                nodes = frozenset(nodes)
            else:
                (nodes, terminals) = nts[0]
            if nodes not in groups:
                groups[nodes] = (set(), set())
            groups[nodes][0].add(interval)
            groups[nodes][1].update(terminals)
        return groups

    def __add_groups(self, src, groups, stack):
        '''
        The target NFA node sets identified above are now interned as DFA
        nodes; newly created nodes are pushed for further exploration.
        '''
        for nfa_nodes in groups:
            (intervals, terminals) = groups[nfa_nodes]
            char = Character(intervals, self.__alphabet)
            (new, dest) = self.dfa.node(nfa_nodes)
            self.dfa.connect(src, dest, char)
            if new:
                stack.append((dest, nfa_nodes, terminals))
905
class DfaPattern(LogMixin):
    '''
    Create a lookup table for a DFA and a matcher to evaluate it.
    '''

    def __init__(self, graph, alphabet):
        super(DfaPattern, self).__init__()
        self.__graph = graph
        self.__alphabet = alphabet
        self.__table = [None] * len(graph)
        # labels for a zero-length match (initial state already terminal)
        self.__empty_labels = list(graph.terminals(0))
        self.__build_table()

    def __build_table(self):
        '''
        Flatten the graph into one IntervalMap per state, taking a
        character to (next state, terminal labels).
        '''
        for state in self.__graph:
            lookup = IntervalMap()
            for (target, char) in self.__graph.transitions(state):
                # tuple rather than list to allow hashing of tokens
                labels = tuple(self.__graph.terminals(target))
                for interval in char:
                    lookup[interval] = (target, labels)
            self.__table[state] = lookup

    def match(self, stream_in):
        '''
        Match against the stream, returning (terminals, matched value,
        remaining stream), or None if there is no match.
        '''
        try:
            (terminals, size, _) = self.size_match(stream_in)
            (value, stream_out) = s_next(stream_in, count=size)
            return (terminals, value, stream_out)
        except TypeError:
            # size_match returned None - nothing to unpack
            return None

    def size_match(self, stream):
        '''
        Match against the stream, returning (terminals, length of match,
        stream after match) for the longest match, or None.
        '''
        state = 0
        size = 0
        longest = (self.__empty_labels, 0, stream) \
            if self.__empty_labels else None
        (line, _) = s_line(stream, True)
        limit = len(line)
        while size < limit:
            step = self.__table[state][line[size]]
            if step is None:
                break
            # advance to the next state
            (state, terminals) = step
            size += 1
            # matches are strictly increasing in length, so recording the
            # latest terminal state is enough (no expensive copies)
            if terminals:
                try:
                    (_, next_stream) = s_next(stream, count=size)
                    longest = (terminals, size, next_stream)
                except StopIteration:
                    pass
        return longest

    def __repr__(self):
        return '<DFA>'
973