1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 The main configuration object and various standard configurations.
32 '''
33
34
35
36
37 from collections import namedtuple
38
39 from lepl.core.parser import make_raw_parser, make_single, make_multiple
40 from lepl.stream.factory import DEFAULT_STREAM_FACTORY
41
42
43 Configuration = namedtuple('Configuration',
44 'rewriters monitors stream_factory stream_kargs')
45 '''Carrier for configuration.'''
49 '''
50 Error raised for problems with configuration.
51 '''
52 pass
53
56 '''
57 Accumulate configuration through chained methods.
58 '''
59
61
62 self.__started = False
63
64
65
66 self.__changed = True
67 self.__rewriters = set()
68 self.__monitors = []
69 self.__stream_factory = DEFAULT_STREAM_FACTORY
70 self.__alphabet = None
71 self.__stream_kargs = {}
72
73
74
75 self.matcher = matcher
76
78 '''
79 Set default values on demand to avoid dependency loop.
80 '''
81 if not self.__started:
82 self.__started = True
83 self.default()
84
85
86
87
89 '''
90 Add a rewriter that will be applied to the matcher graph when the
91 parser is generated.
92 '''
93 self.__start()
94 self.clear_cache()
95
96
97 if rewriter in self.__rewriters:
98 self.__rewriters.remove(rewriter)
99 self.__rewriters.add(rewriter)
100 return self
101
103 '''
104 Remove a rewriter from the current configuration.
105 '''
106 self.__start()
107 self.clear_cache()
108 self.__rewriters = set(r for r in self.__rewriters
109 if r is not rewriter)
110 return self
111
113 '''
114 Remove all rewriters of a given type from the current configuration.
115 '''
116 self.__start()
117 self.clear_cache()
118 if type_:
119 self.__rewriters = set(r for r in self.__rewriters
120 if not isinstance(r, type_))
121 else:
122 self.__rewriters = set()
123 return self
124
126 '''
127 Add a monitor to the current configuration. Monitors are called
128 from within the trampolining process and can be used to track
129 evaluation, control resource use, etc.
130 '''
131 self.__start()
132 self.clear_cache()
133 self.__monitors.append(monitor)
134 return self
135
137 '''
138 Remove all monitors from the current configuration.
139 '''
140 self.__start()
141 self.clear_cache()
142 self.__monitors = []
143 return self
144
146 '''
147 Specify the stream factory. This is used to generate the input stream
148 for the parser.
149 '''
150 self.__start()
151 self.clear_cache()
152 self.__stream_factory = stream_factory
153 return self
154
156 '''
157 Add a value for passing to the stream factory.
158 '''
159 for name in kargs:
160 self.__stream_kargs[name] = kargs[name]
161 return self
162
164 '''
165 Remove all values passed to the stream factory.
166 '''
167 self.__stream_kargs = {}
168
169 @property
171 '''
172 The current configuration (rewriters, monitors, stream_factory).
173 '''
174 self.__start()
175 self.__changed = False
176 rewriters = list(self.__rewriters)
177 rewriters.sort()
178 return Configuration(rewriters, list(self.__monitors),
179 self.__stream_factory, dict(self.__stream_kargs))
180
182 '''
183 Get the alphabet used.
184
185 Typically this is Unicode, which is the default. It is needed for
186 the generation of regular expressions.
187 '''
188 from lepl.regexp.unicode import UnicodeAlphabet
189 if not self.__alphabet:
190 self.__alphabet = UnicodeAlphabet.instance()
191 return self.__alphabet
192
194 '''
195 Set the alphabet used. It is needed for the generation of regular
196 expressions, for example (but the default, for Unicode, is usually
197 sufficient).
198 '''
199 if alphabet:
200 if self.__alphabet:
201 if self.__alphabet != alphabet:
202 raise ConfigurationError(
203 'Alphabet has changed during configuration '
204 '(perhaps the default was already used?)')
205 else:
206 self.__alphabet = alphabet
207 self.__start()
208 self.clear_cache()
209
210 @property
212 '''
213 Has the config been changed by the user since it was last returned
214 via `configuration`? if not, any previously generated parser can be
215 reused.
216 '''
217 return self.__changed
218
220 '''
221 Force calculation of a new parser.
222 '''
223 self.__changed = True
224
225
226
234
241
260
262 '''
263 Raise an error if the first match fails. If `eos` is True then this
264 requires that the entire input is matched, otherwise it only requires
265 that the matcher succeed. The exception includes information about
266 the deepest read to the stream (which is a good indication of where
267 any error occurs).
268
269 This is part of the default configuration. It can be removed with
270 `no_full_first_match()`.
271 '''
272 from lepl.core.rewriters import FullFirstMatch
273 return self.add_rewriter(FullFirstMatch(eos))
274
281
283 '''
284 Combined nested `And()` and `Or()` matchers. This does not change
285 the parser semantics, but improves efficiency.
286
287 This is part of the default configuration. It can be removed with
288 `no_flatten`.
289 '''
290 from lepl.core.rewriters import Flatten
291 return self.add_rewriter(Flatten())
292
299
311
325
327 '''
328 Compile simple matchers to re (C library) regular expressions.
329 This improves efficiency but may change the parser semantics slightly
330 (DFA regular expressions do not provide backtracking / alternative
331 matches).
332 '''
333 from lepl.matchers.core import Regexp
334 from lepl.regexp.rewriters import CompileRegexp
335 def regexp_wrapper(regexp, _alphabet):
336 '''
337 Adapt the Regexp matcher to the form needed (forcing Unicode).
338 '''
339 return Regexp(str(regexp))
340 self.alphabet(alphabet)
341 return self.add_rewriter(
342 CompileRegexp(self.__get_alphabet(), force, regexp_wrapper))
343
350
352 '''
353 Rearrange arguments to `Or()` so that left-recursive matchers are
354 tested last. This improves efficiency, but may alter the parser
355 semantics (the ordering of multiple results with ambiguous grammars
356 may change).
357
358 `conservative` refers to the algorithm used to detect loops; False
359 may classify some left--recursive loops as right--recursive.
360 '''
361 from lepl.core.rewriters import OptimizeOr
362 return self.add_rewriter(OptimizeOr(conservative))
363
370
371 - def lexer(self, alphabet=None, discard=None, lexer=None):
384
391
393 '''
394 Combine simple matchers so that they are evaluated without
395 trampolining. This improves efficiency (particularly because it
396 reduces the number of matchers that can be memoized).
397
398 This is part of the default configuration. It can be removed with
399 `no_direct_eval`.
400 '''
401 from lepl.core.rewriters import DirectEvaluation
402 return self.add_rewriter(DirectEvaluation(spec))
403
410
421
428
430 '''
431 This configuration attempts to detect which memoizer is most effective
432 for each matcher. As such it is a general "fix" for left-recursive
433 grammars and is suggested in the warning shown when the right-only
434 memoizer detects left recursion.
435
436 Lepl does not guarantee that all left-recursive grammars are handled
437 correctly. The corrections applied may be incomplete and can be
438 inefficient. It is always better to re-write a grammar to avoid
439 left-recursion. One way to improve efficiency, at the cost of less
440 accurate matching, is to specify a non-zero ``d`` parameter - this
441 is the maximum iteration depth that will be used (by default, when
442 ``d`` is zero, it is the length of the remaining input, which can
443 be very large).
444
445 '''
446 from lepl.core.rewriters import AutoMemoize
447 from lepl.matchers.memo import LMemo, RMemo
448 self.no_memoize()
449 return self.add_rewriter(
450 AutoMemoize(conservative=conservative, left=LMemo,
451 right=RMemo if full else None, d=d))
452
454 '''
455 Add memoization that may detect and stabilise left-recursion. This
456 makes the parser more robust (so it can handle more grammars) but
457 also more complex (and probably slower).
458
459 ``config.auto_memoize()`` will also add memoization, but will select
460 left/right memoization depending on the path through the parser.
461
462 Lepl does not guarantee that all left-recursive grammars are handled
463 correctly. The corrections applied may be incomplete and can be
464 inefficient. It is always better to re-write a grammar to avoid
465 left-recursion. One way to improve efficiency, at the cost of less
466 accurate matching, is to specify a non-zero ``d`` parameter - this
467 is the maximum iteration depth that will be used (by default, when
468 ``d`` is zero, it is the length of the remaining input, which can
469 be very large).
470 '''
471 from lepl.core.rewriters import LeftMemoize
472 self.no_memoize()
473 return self.add_rewriter(LeftMemoize(d))
474
476 '''
477 Add memoization that can make some complex parsers (with a lot of
478 backtracking) more efficient. This also detects left-recursive
479 grammars and displays a suitable warning.
480
481 This is included in the default configuration. For simple grammars
482 it may make things slower; it can be disabled by
483 ``config.no_memoize()``.
484 '''
485 from lepl.core.rewriters import RightMemoize
486 self.no_memoize()
487 return self.add_rewriter(RightMemoize())
488
499
500 - def lines(self, discard=None, tabsize=8,
501 block_policy=None, block_start=None):
502 '''
503 Configure "offside parsing". This enables lexing and adds extra
504 tokens to mark the start and end of lines. If block_policy is
505 specified then the line start token will also include spaces
506 which can be used by the ``Block()`` and ``BLine()`` matchers
507 to do offside (whitespace-aware) parsing.
508
509 `discard` is the regular expression to use to identify spaces
510 between tokens (by default, spaces and tabs).
511
512 The remaining parameters are used only if at least one of
513 `block_policy` and `block_start` is given.
514
515 `block_policy` decides how indentation if calculated.
516 See `explicit` etc in lepl.lexer.blocks.matchers.
517
518 `block_start` is the initial indentation (by default, zero). If set
519 to lepl.lexer.lines.matchers.NO_BLOCKS indentation will not
520 be checked (useful for tests).
521
522 `tabsize` is used only if `block_policy` is given. It is the number
523 of spaces used to replace a leading tab (no replacement if None).
524 '''
525 from lepl.lexer.lines.lexer import make_offside_lexer
526 from lepl.lexer.lines.matchers import Block, DEFAULT_POLICY, LineStart
527 from lepl.lexer.lines.monitor import block_monitor
528 blocks = block_policy is not None or block_start is not None
529 if blocks:
530 if block_start is None:
531 block_start = 0
532 if block_policy is None:
533 block_policy = DEFAULT_POLICY
534 self.add_monitor(block_monitor(block_start))
535 self.set_arguments(Block, policy=block_policy)
536 else:
537 self.set_arguments(LineStart, indent=False)
538 self.lexer(self.__get_alphabet(), discard,
539 make_offside_lexer(tabsize, blocks))
540 return self
541
542
543
544
546 '''
547 Add a monitor to trace results using `TraceStack()`.
548
549 This is not used by default as it has a cost at runtime.
550 '''
551 from lepl.core.trace import TraceStack
552 return self.add_monitor(TraceStack(enabled))
553
555 '''
556 Add a monitor to correctly insert the transforms needed when using
557 the `TraceVariables()` context:
558
559 with TraceVariables():
560 ...
561
562 This is used by default as it has no runtime cost (once the parser
563 is created).
564 '''
565 from lepl.core.rewriters import TraceVariables
566 return self.add_rewriter(TraceVariables())
567
569 '''
570 Reduce memory use (at the expense of backtracking).
571
572 This will:
573 - Add a monitor to manage resources. See `GeneratorManager()`.
574 - Disable direct evaluation (more trampolining gives more scope
575 for removing generators)
576 - Disable the full first match error (which requires a copy of the
577 input for the error message)
578 - Disable memoisation (which would keep input in memory)
579
580 This reduces memory usage, but makes the parser less reliable.
581 Usually a value like 100 (the default) for the queue length will make
582 memory use insignificant and still give a useful first parse.
583
584 Note that, although the parser will use less memory, it may run
585 more slowly (as extra work needs to be done to "clean out" the
586 stored values).
587 '''
588 from lepl.core.manager import GeneratorManager
589 self.add_monitor(GeneratorManager(queue_len))
590 self.no_direct_eval()
591 self.no_memoize()
592 self.no_full_first_match()
593 self.cache_level(-9)
594 return self
595
597 '''
598 Control when the stream can be cached internally (this is used for
599 debugging and error messages) - streams are cached for debugging when
600 the value is greater than zero. The value is incremented each time
601 a new stream is constructed (eg when constructing tokens).
602
603 A value of 1 implies that a stream would be always cached. A value of
604 0 might be used when iterating over a file with the lexer - the
605 iteration is not cached, but individual tokens will be.
606 '''
607 self.add_stream_kargs(cache_level=level)
608
609 - def record_deepest(self, n_before=6, n_results_after=2, n_done_after=2):
616
617
618
620 '''
621 Delete any earlier configuration and disable the default (so no
622 rewriters or monitors are used).
623 '''
624 self.__started = True
625 self.clear_cache()
626 self.__rewriters = set()
627 self.__monitors = []
628 self.__stream_factory = DEFAULT_STREAM_FACTORY
629 self.__alphabet = None
630 return self
631
648
651 '''
652 Methods to configure and generate a parser or matcher.
653 '''
654
660
677
678
680 '''
681 Get a function that will parse the contents of a file, returning a
682 sequence of (results, stream) pairs. The data will be read as
683 required (using an iterator), so the file must remain open during
684 parsing. To avoid this, read all data into a string and parse that.
685 '''
686 return self._raw_parser('file')
687
689 '''
690 Get a function that will parse the contents of an iterable
691 (eg. a generator), returning a sequence of (results, stream) pairs.
692 The data will be read as required.
693 '''
694 return self._raw_parser('iterable')
695
697 '''
698 Get a function that will parse the contents of a list returning a
699 sequence of (results, stream) pairs.
700 '''
701 return self._raw_parser('list')
702
704 '''
705 Get a function that will parse the contents of a string returning a
706 sequence of (results, stream) pairs.
707 '''
708 return self._raw_parser('string')
709
711 '''
712 Get a function that will parse the contents of a generic sequence
713 (with [] and len()) returning a sequence of (results, stream) pairs.
714 '''
715 return self._raw_parser('sequence')
716
718 '''
719 Get a function that will parse input, returning a sequence of
720 (results, stream) pairs.
721 The type of stream is inferred from the input to the parser.
722 '''
723 return self._raw_parser()
724
725
727 '''
728 Parse the contents of a file, returning a sequence of
729 (results, stream) pairs. The data will be read as required
730 (using an iterator), so the file must remain open during parsing.
731 To avoid this, read all data into a string and parse that.
732 '''
733 return self.get_match_file()(file_, **kargs)
734
736 '''
737 Parse the contents of an iterable (eg. a generator), returning
738 a sequence of (results, stream) pairs. The data will be read as
739 required.
740 '''
741 return self.get_match_iterable()(iterable, **kargs)
742
744 '''
745 Parse the contents of a list returning a sequence of (results, stream)
746 pairs.
747 '''
748 return self.get_match_list()(list_, **kargs)
749
751 '''
752 Parse the contents of a string, returning a sequence of
753 (results, stream) pairs.
754 '''
755 return self.get_match_string()(string, **kargs)
756
758 '''
759 Parse the contents of a generic sequence (with [] and len())
760 returning a sequence of (results, stream) pairs.
761 '''
762 return self.get_match_sequence()(sequence, **kargs)
763
764 - def match(self, input_, **kargs):
765 '''
766 Parse input, returning a sequence of (results, stream) pairs.
767 The type of stream is inferred from the input.
768 '''
769 return self.get_match()(input_, **kargs)
770
771
773 '''
774 Get a function that will parse the contents of a file, returning a
775 single match. The data will be read as required (using an iterator),
776 so the file must remain open during parsing. To avoid this, read
777 all data into a string and parse that.
778 '''
779 return make_single(self.get_match_file())
780
782 '''
783 Get a function that will parse the contents of an iterable
784 (eg. a generator), returning a single match. The data will be read
785 as required.
786 '''
787 return make_single(self.get_match_iterable())
788
790 '''
791 Get a function that will parse the contents of a list returning a
792 single match.
793 '''
794 return make_single(self.get_match_list())
795
797 '''
798 Get a function that will parse the contents of a string returning a
799 single match.
800 '''
801 return make_single(self.get_match_string())
802
804 '''
805 Get a function that will parse the contents of a generic sequence
806 (with [] and len()) returning a single match.
807 '''
808 return make_single(self.get_match_sequence())
809
811 '''
812 Get a function that will parse input, returning a single match.
813 The type of stream is inferred from the input to the parser.
814 '''
815 return make_single(self.get_match())
816
817
819 '''
820 Parse the contents of a file, returning a single match. The data
821 will be read as required (using an iterator), so the file must
822 remain open during parsing. To avoid this, read all data into a
823 string and parse that.
824 '''
825 return self.get_parse_file()(file_, **kargs)
826
828 '''
829 Parse the contents of an iterable (eg. a generator), returning
830 a single match. The data will be read as required.
831 '''
832 return self.get_parse_iterable()(iterable, **kargs)
833
835 '''
836 Parse the contents of a list returning a single match.
837 '''
838 return self.get_parse_list()(list_, **kargs)
839
841 '''
842 Parse the contents of a string, returning a single match.
843 '''
844 return self.get_parse_string()(string, **kargs)
845
847 '''
848 Pparse the contents of a generic sequence (with [] and len())
849 returning a single match.
850 '''
851 return self.get_parse_sequence()(sequence, **kargs)
852
853 - def parse(self, input_, **kargs):
854 '''
855 Parse the input, returning a single match. The type of stream is
856 inferred from the input.
857 '''
858 return self.get_parse()(input_, **kargs)
859
860
862 '''
863 Get a function that will parse the contents of a file, returning a
864 sequence of matches. The data will be read as required (using an
865 iterator), so the file must remain open during parsing. To avoid
866 this, read all data into a string and parse that.
867 '''
868 return make_multiple(self.get_match_file())
869
871 '''
872 Get a function that will parse the contents of an iterable
873 (eg. a generator), returning a sequence of matches. The data will
874 be read as required.
875 '''
876 return make_multiple(self.get_match_iterable())
877
879 '''
880 Get a function that will parse the contents of a list returning a
881 sequence of matches.
882 '''
883 return make_multiple(self.get_match_list())
884
886 '''
887 Get a function that will parse a string, returning a sequence of
888 matches.
889 '''
890 return make_multiple(self.get_match_string())
891
893 '''
894 Get a function that will parse the contents of a generic sequence
895 (with [] and len()) returning a sequence of matches.
896 '''
897 return make_multiple(self.get_match_sequence())
898
900 '''
901 Get a function that will parse input, returning a sequence of
902 matches. The type of stream is inferred from the input to the
903 parser.
904 '''
905 return make_multiple(self.get_match())
906
907
909 '''
910 Parse the contents of a file, returning a sequence of matches.
911 The data will be read as required (using an iterator), so the file
912 must remain open during parsing. To avoid this, read all data
913 into a string and parse that.
914 '''
915 return self.get_parse_file_all()(file_, **kargs)
916
918 '''
919 Parse the contents of an iterable (eg. a generator), returning
920 a sequence of matches. The data will be read as required.
921 '''
922 return self.get_parse_iterable_all()(iterable, **kargs)
923
925 '''
926 Parse the contents of a list returning a sequence of matches.
927 '''
928 return self.get_parse_list_all()(list_, **kargs)
929
931 '''
932 Parse the contents of a string, returning a sequence of matches.
933 '''
934 return self.get_parse_string_all()(string, **kargs)
935
937 '''
938 Parse the contents of a generic sequence (with [] and len())
939 returning a sequence of matches.
940 '''
941 return self.get_parse_sequence_all()(sequence, **kargs)
942
944 '''
945 Parse input, returning a sequence of
946 matches. The type of stream is inferred from the input to the
947 parser.
948 '''
949 return self.get_parse_all()(input_, **kargs)
950