Package lepl :: Package regexp :: Package _test :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module lepl.regexp._test.unicode

  1   
  2  # The contents of this file are subject to the Mozilla Public License 
  3  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  4  # in compliance with the License. You may obtain a copy of the License 
  5  # at http://www.mozilla.org/MPL/ 
  6  # 
  7  # Software distributed under the License is distributed on an "AS IS" 
  8  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
  9  # the License for the specific language governing rights and 
 10  # limitations under the License. 
 11  # 
 12  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 13  # The Initial Developer of the Original Code is Andrew Cooke. 
 14  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 15  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 16  # 
 17  # Alternatively, the contents of this file may be used under the terms 
 18  # of the LGPL license (the GNU Lesser General Public License, 
 19  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 20  # of the LGPL License are applicable instead of those above. 
 21  # 
 22  # If you wish to allow use of your version of this file only under the 
 23  # terms of the LGPL License and not to allow others to use your version 
 24  # of this file under the MPL, indicate your decision by deleting the 
 25  # provisions above and replace them with the notice and other provisions 
 26  # required by the LGPL License.  If you do not delete the provisions 
 27  # above, a recipient may use your version of this file under either the 
 28  # MPL or the LGPL License. 
 29   
 30  ''' 
 31  Tests for the lepl.regexp.unicode module. 
 32  ''' 
 33   
 34  from unittest import TestCase 
 35   
 36  #from logging import basicConfig, DEBUG 
 37  from lepl import RegexpError, DEFAULT_STREAM_FACTORY 
 38  from lepl.regexp.core import NfaGraph, NfaToDfa, Compiler 
 39  from lepl.regexp.unicode import UnicodeAlphabet 
 40  from lepl.stream.simple import StringHelper 
 41  from lepl.support.lib import fmt 
 42   
 43  # pylint: disable-msg=C0103, C0111, C0301, R0201, R0904 
 44  # (dude this is just a test) 
 45   
 46   
 47  UNICODE = UnicodeAlphabet.instance() 
 48   
 49   
def _test_parser(regexp):
    '''
    Compile `regexp` with `Compiler.single`, using the module's UNICODE
    alphabet and the fixed label 'label', and return the result.
    '''
    compiled = Compiler.single(UNICODE, regexp, 'label')
    return compiled
52
def label(text):
    '''
    Wrap `text` in the '(?P<label>...)' named group that the tests compare
    against the string form of a compiled regexp.
    '''
    # presumably fmt behaves like str.format -- verify in lepl.support.lib
    template = '(?P<label>{0!s})'
    return fmt(template, text)
55
class CharactersTest(TestCase):
    '''
    Compile regexps over the unicode alphabet and compare their string
    form with the expected labelled text.
    '''

    def _check(self, cases):
        # Each case is (regexp source, text expected inside the label group).
        for (source, expected) in cases:
            compiled = _test_parser(source)
            assert label(expected) == str(compiled), str(compiled)

    def test_unicode_dot(self):
        self._check([
            ('.', '.'),
            ('.\\.', '.\\.'),
        ])

    def test_brackets(self):
        self._check([
            ('a', 'a'),
            ('[ac]', '[ac]'),
            ('[a-c]', '[a-c]'),
            ('[a-cp-q]', '[a-cp-q]'),
            (r'\\', r'\\'),
            (r'\-', r'\-'),
            (r'[\\-x]', r'[\\-x]'),
            ('[a-bq,]', '[,a-bq]'),
            ('[a-b,q]', '[,a-bq]'),
            ('[,a-bq]', '[,a-bq]'),
        ])
        # The negated class may print with either upper bound
        # (presumably this depends on the unicode range of the Python
        # build -- TODO confirm).
        compiled = _test_parser('[^a]')
        accepted = (r'(?P<label>[\x00-`b-\uffff])',
                    r'(?P<label>[\x00-`b-\U0010ffff])')
        assert str(compiled) in accepted, str(compiled)

    def test_merge(self):
        self._check([
            ('[a-ce-g]', '[a-ce-g]'),
            ('[a-cd-f]', '[a-f]'),
            ('[a-cc-e]', '[a-e]'),
            ('[a-cb-d]', '[a-d]'),
            ('[a-ca-c]', '[a-c]'),
            ('[a-a]', 'a'),
            ('[e-ga-c]', '[a-ce-g]'),
            ('[d-fa-c]', '[a-f]'),
            ('[c-ea-c]', '[a-e]'),
            ('[b-da-c]', '[a-d]'),
            ('[a-gc-e]', '[a-g]'),
            ('[c-ea-g]', '[a-g]'),
            ('[a-eg]', '[a-eg]'),
            ('[ga-e]', '[a-eg]'),
        ])

    def test_star(self):
        self._check([
            ('a*', 'a*'),
            ('a(?:bc)*d', 'a(?:bc)*d'),
            ('a(?:bc)*d[e-g]*', 'a(?:bc)*d[e-g]*'),
            ('a[a-cx]*', 'a[a-cx]*'),
        ])

    def test_option(self):
        self._check([
            ('a?', 'a?'),
            ('a(?:bc)?d', 'a(?:bc)?d'),
            ('a(?:bc)?d[e-g]?', 'a(?:bc)?d[e-g]?'),
            ('ab?c', 'ab?c'),
        ])

    def test_choice(self):
        self._check([
            ('(?:a*|b|[c-d])', '(?:a*|b|[c-d])'),
            ('a(?:a|b)*', 'a(?:a|b)*'),
            ('a(?:[a-c]x|axb)*', 'a(?:[a-c]x|axb)*'),
        ])

    def test_bad_escape(self):
        # An escaped '+' compiles; a bare '+' must be rejected.
        self._check([(r'\+', '\\+')])
        try:
            _test_parser('+')
        except RegexpError:
            pass
        else:
            assert False, 'Expected error'

    def test_bad_group(self):
        # A capturing group is rejected with a SyntaxError.
        try:
            _test_parser('(a)')
        except SyntaxError as error:
            assert 'do not currently support matched groups' in str(error), error
        else:
            assert False, 'Expected error'

    def test_escape(self):
        # All three numeric escape forms print as '@'.
        self._check([
            ('\\x40', '@'),
            ('\\u0040', '@'),
            ('\\U00000040', '@'),
        ])
175 176
class NfaTest(TestCase):
    '''
    Run the NFA matcher of a compiled regexp over a string stream and
    compare the sequence of matched texts with the expected list.
    '''

    def assert_matches(self, pattern, text, results):
        '''
        Compile `pattern`, match it against `text` via the NFA and check
        that the matches have the same count as `results` and that the
        second item of each match equals the corresponding entry.
        '''
        matcher = _test_parser(pattern).nfa().match
        stream = DEFAULT_STREAM_FACTORY.from_string(text)
        found = list(matcher(stream))
        assert len(found) == len(results), found
        for (match, expected) in zip(found, results):
            assert match[1] == expected, match[1] + ' != ' + expected

    def test_simple(self):
        expected = ['ab']
        self.assert_matches('ab', 'abc', expected)

    def test_star(self):
        expected = ['aaab']
        self.assert_matches('a*b', 'aaabc', expected)

    def test_plus(self):
        expected = ['abc', 'ab', 'a']
        self.assert_matches('[a-z]+', 'abc', expected)

    def test_choice(self):
        expected = ['a']
        self.assert_matches('(?:a|b)', 'ac', expected)

    def test_star_choice(self):
        expected = ['aababba', 'aababb', 'aabab', 'aaba', 'aab', 'aa', 'a', '']
        self.assert_matches('(?:a|b)*', 'aababbac', expected)

    def test_multiple_choice(self):
        expected = ['ab', 'abb']
        self.assert_matches('(?:a|ab)b', 'abb', expected)

    def test_range(self):
        cases = [
            ('[abc]*', 'bbcx', ['bbc', 'bb', 'b', '']),
            ('[A-Z][a-z]*', 'Abc', ['Abc', 'Ab', 'A']),
        ]
        for (pattern, text, expected) in cases:
            self.assert_matches(pattern, text, expected)

    def test_range_overlap(self):
        '''
        Matches with 'b' are duplicated, since it appears in both ranges.
        '''
        expected = ['abc', 'ab', 'abc', 'ab', 'a', '']
        self.assert_matches('(?:[ab]|[bc])*', 'abc', expected)

    def test_complex(self):
        expected = ['ayagxabvp']
        self.assert_matches('a(?:[x-z]|a(?:g|b))*(?:u|v)p',
                            'ayagxabvp', expected)
223 224
class DfaGraphTest(TestCase):
    '''
    Build the DFA graph for a regexp and compare its string form with an
    expected description.
    '''

    def assert_dfa_graph(self, regexp, desc):
        '''
        Compile `regexp`, build an NFA graph from its expression, convert
        that to a DFA and check the DFA's string form.

        `desc` is either the exact expected string or a tuple/list of
        acceptable strings (used when the output may take more than one
        form).
        '''
        r = _test_parser(regexp)
        nfa = NfaGraph(UNICODE)
        r.expression.build(nfa, nfa.new_node(), nfa.new_node())
        dfa = NfaToDfa(nfa, UNICODE).dfa
        actual = str(dfa)
        if isinstance(desc, (tuple, list)):
            assert actual in desc, actual
        else:
            # Exact comparison: `in` on a plain string is a substring test
            # and would accept a partial graph description.
            assert actual == desc, actual

    def test_dfa_no_empty(self):
        self.assert_dfa_graph('abc',
            '0: [0] a->1; 1: [3] b->2; 2: [4] c->3; 3(label): [1, 2]')

    def test_dfa_simple_repeat(self):
        self.assert_dfa_graph('ab*c',
            '0: [0] a->1; 1: [3, 4, 5] c->2,b->3; 2(label): [1, 2]; 3: [4, 5] c->2,b->3')

    def test_dfa_simple_choice(self):
        self.assert_dfa_graph('a(?:b|c)',
            '0: [0] a->1; 1: [3, 4] [b-c]->2; 2(label): [1, 2]')

    def test_dfa_repeated_choice(self):
        self.assert_dfa_graph('a(?:b|cd)*e',
            '0: [0] a->1; 1: [3, 4, 5, 6] e->2,c->3,b->4; 2(label): [1, 2]; 3: [7] d->4; 4: [4, 5, 6] e->2,c->3,b->4')

    # NOTE(review): the header of this test was missing from the extracted
    # source; the method name is reconstructed -- confirm against upstream.
    def test_dfa_overlapped_choice(self):
        self.assert_dfa_graph('a(?:bcd|bce)',
            '0: [0] a->1; 1: [3, 4] b->2; 2: [5, 7] c->3; 3: [8, 6] [d-e]->4; 4(label): [1, 2]')

    # NOTE(review): the header of this test was missing from the extracted
    # source; the method name is reconstructed -- confirm against upstream.
    def test_dfa_conflicting_choice(self):
        self.assert_dfa_graph('a(?:bc|b*d)',
            '0: [0] a->1; 1: [3, 4, 6, 7] b->2,d->3; 2: [5, 6, 7] [c-d]->3,b->4; 3(label): [1, 2]; 4: [6, 7] d->3,b->4')

    # NOTE(review): the header of this test was missing from the extracted
    # source; the method name is reconstructed -- confirm against upstream.
    def test_dfa_conflicting_choice_2(self):
        self.assert_dfa_graph('a(?:bb|b*c)',
            '0: [0] a->1; 1: [3, 4, 6, 7] b->2,c->3; 2: [5, 6, 7] c->3,b->4; 3(label): [1, 2]; 4(label): [1, 2, 6, 7] c->3,b->5; 5: [6, 7] c->3,b->5')

    def test_dfa_dot_option(self):
        '''
        This one's nice - the 'a' completely disappears.
        '''
        # Two acceptable forms, differing only in the upper bound of the
        # character range (\uffff or \U0010ffff).
        self.assert_dfa_graph('.*a?b',
            (r'0: [0, 3, 4, 5] [\x00-ac-\uffff]->1,b->2; 1: [3, 4, 5] [\x00-ac-\uffff]->1,b->2; 2(label): [1, 2, 3, 4, 5] [\x00-ac-\uffff]->1,b->2',
             r'0: [0, 3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2; 1: [3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2; 2(label): [1, 2, 3, 4, 5] [\x00-ac-\U0010ffff]->1,b->2'))
273
class DfaTest(TestCase):
    '''
    Run the DFA matcher of a compiled regexp against a string and compare
    the matched text with the expected value.
    '''

    def assert_dfa(self, regexp, text, results):
        '''
        Compile `regexp`, match it via the DFA against `text` (wrapped in
        a StringHelper, paired with 0) and check that the second item of
        the result equals `results`.
        '''
        match = _test_parser(regexp).dfa().match
        outcome = match((0, StringHelper(text)))
        assert outcome[1] == results, outcome

    def test_simple(self):
        expected = 'abc'
        self.assert_dfa('abc', 'abcd', expected)

    def test_dot_option(self):
        expected = 'aaab'
        self.assert_dfa('.*a?b', 'aaabc', expected)

    def test_empty(self):
        # An empty match is expected both on non-matching and empty input.
        for text in ('bc', ''):
            self.assert_dfa('a*', text, '')

    def test_conflicting_choice(self):
        pattern = 'a(?:bc|b*d)'
        for (text, expected) in (('abde', 'abd'), ('abce', 'abc')):
            self.assert_dfa(pattern, text, expected)

    def test_space_star(self):
        expected = ' '
        self.assert_dfa(' *', ' a', expected)
296