1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 '''
31 Tests for the lepl.regexp.unicode module: the string form of parsed regexps, match results, and DFA graphs built over the Unicode alphabet.
32 '''
33
34 from unittest import TestCase
35
36
37 from lepl import RegexpError, DEFAULT_STREAM_FACTORY
38 from lepl.regexp.core import NfaGraph, NfaToDfa, Compiler
39 from lepl.regexp.unicode import UnicodeAlphabet
40 from lepl.stream.simple import StringHelper
41 from lepl.support.lib import fmt
42
43
44
45
46
47 UNICODE = UnicodeAlphabet.instance()
48
49
52
54 return fmt('(?P<label>{0!s})', text)
55
57
64
66
67 c = _test_parser('a')
68 assert label('a') == str(c), str(c)
69 c = _test_parser('[ac]')
70 assert label('[ac]') == str(c), str(c)
71 c = _test_parser('[a-c]')
72 assert label('[a-c]') == str(c), str(c)
73 c = _test_parser('[a-cp-q]')
74 assert label('[a-cp-q]') == str(c), str(c)
75 c = _test_parser(r'\\')
76 assert label(r'\\') == str(c), str(c)
77 c = _test_parser(r'\-')
78 assert label(r'\-') == str(c), str(c)
79 c = _test_parser(r'[\\-x]')
80 assert label(r'[\\-x]') == str(c), str(c)
81 c = _test_parser('[a-bq,]')
82 assert label('[,a-bq]') == str(c), str(c)
83 c = _test_parser('[a-b,q]')
84 assert label('[,a-bq]') == str(c), str(c)
85 c = _test_parser('[,a-bq]')
86 assert label('[,a-bq]') == str(c), str(c)
87 c = _test_parser('[^a]')
88 assert (r'(?P<label>[\x00-`b-\uffff])' == str(c) or
89 r'(?P<label>[\x00-`b-\U0010ffff])' == str(c)), str(c)
90
92 c = _test_parser('[a-ce-g]')
93 assert label('[a-ce-g]') == str(c), str(c)
94 c = _test_parser('[a-cd-f]')
95 assert label('[a-f]') == str(c), str(c)
96 c = _test_parser('[a-cc-e]')
97 assert label('[a-e]') == str(c), str(c)
98 c = _test_parser('[a-cb-d]')
99 assert label('[a-d]') == str(c), str(c)
100 c = _test_parser('[a-ca-c]')
101 assert label('[a-c]') == str(c), str(c)
102 c = _test_parser('[a-a]')
103 assert label('a') == str(c), str(c)
104 c = _test_parser('[e-ga-c]')
105 assert label('[a-ce-g]') == str(c), str(c)
106 c = _test_parser('[d-fa-c]')
107 assert label('[a-f]') == str(c), str(c)
108 c = _test_parser('[c-ea-c]')
109 assert label('[a-e]') == str(c), str(c)
110 c = _test_parser('[b-da-c]')
111 assert label('[a-d]') == str(c), str(c)
112 c = _test_parser('[a-gc-e]')
113 assert label('[a-g]') == str(c), str(c)
114 c = _test_parser('[c-ea-g]')
115 assert label('[a-g]') == str(c), str(c)
116 c = _test_parser('[a-eg]')
117 assert label('[a-eg]') == str(c), str(c)
118 c = _test_parser('[ga-e]')
119 assert label('[a-eg]') == str(c), str(c)
120
130
140
149
159
161
162 try:
163 _test_parser('(a)')
164 assert False, 'Expected error'
165 except SyntaxError as e:
166 assert 'do not currently support matched groups' in str(e), e
167
175
176
178
186
190
193
196
199
201 self.assert_matches('(?:a|b)*', 'aababbac',
202 ['aababba', 'aababb', 'aabab', 'aaba', 'aab', 'aa', 'a', ''])
203
207
211
213 '''
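214 Matches that contain 'b' ('abc' and 'ab') appear twice in the expected list, because 'b' belongs to both character classes ([ab] and [bc]) and so can be matched through either alternative.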
215 '''
216 self.assert_matches('(?:[ab]|[bc])*', 'abc',
217 ['abc', 'ab', 'abc', 'ab', 'a', ''])
218
220
221 self.assert_matches('a(?:[x-z]|a(?:g|b))*(?:u|v)p',
222 'ayagxabvp', ['ayagxabvp'])
223
224
226
236
238 self.assert_dfa_graph('abc',
239 '0: [0] a->1; 1: [3] b->2; 2: [4] c->3; 3(label): [1, 2]')
240
242 self.assert_dfa_graph('ab*c',
243 '0: [0] a->1; 1: [3, 4, 5] c->2,b->3; 2(label): [1, 2]; 3: [4, 5] c->2,b->3')
244
246 self.assert_dfa_graph('a(?:b|c)',
247 '0: [0] a->1; 1: [3, 4] [b-c]->2; 2(label): [1, 2]')
248
250 self.assert_dfa_graph('a(?:b|cd)*e',
251 '0: [0] a->1; 1: [3, 4, 5, 6] e->2,c->3,b->4; 2(label): [1, 2]; 3: [7] d->4; 4: [4, 5, 6] e->2,c->3,b->4')
252
254 self.assert_dfa_graph('a(?:bcd|bce)',
255 '0: [0] a->1; 1: [3, 4] b->2; 2: [5, 7] c->3; 3: [8, 6] [d-e]->4; 4(label): [1, 2]')
256
258 self.assert_dfa_graph('a(?:bc|b*d)',
259 '0: [0] a->1; 1: [3, 4, 6, 7] b->2,d->3; 2: [5, 6, 7] [c-d]->3,b->4; 3(label): [1, 2]; 4: [6, 7] d->3,b->4')
260
262 self.assert_dfa_graph('a(?:bb|b*c)',
263 '0: [0] a->1; 1: [3, 4, 6, 7] b->2,c->3; 2: [5, 6, 7] c->3,b->4; 3(label): [1, 2]; 4(label): [1, 2, 6, 7] c->3,b->5; 5: [6, 7] c->3,b->5')
264
266 '''
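267 This one's nice - the optional 'a' completely disappears from the DFA: it is already covered by the preceding '.*', so the expected graph has transitions only for 'b' and for "anything except 'b'".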
268 '''
269
270 self.assert_dfa_graph('.*a?b',
271 (r'0: [0, 3, 4, 5] [\x00-ac-\uffff]->1,b->2; 1: [3, 4, 5] [\x00-ac-\uffff]->1,b->2; 2(label): [1, 2, 3, 4, 5] [\x00-ac-\uffff]->1,b->2',
272 r'0: [0, 3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2; 1: [3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2; 2(label): [1, 2, 3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2'))
273
296