Package lepl :: Package regexp :: Module interval
[hide private]
[frames] | no frames]

Source Code for Module lepl.regexp.interval

  1  from lepl.support.node import Node 
  2   
  3  # The contents of this file are subject to the Mozilla Public License 
  4  # (MPL) Version 1.1 (the "License"); you may not use this file except 
  5  # in compliance with the License. You may obtain a copy of the License 
  6  # at http://www.mozilla.org/MPL/ 
  7  # 
  8  # Software distributed under the License is distributed on an "AS IS" 
  9  # basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
 10  # the License for the specific language governing rights and 
 11  # limitations under the License. 
 12  # 
 13  # The Original Code is LEPL (http://www.acooke.org/lepl) 
 14  # The Initial Developer of the Original Code is Andrew Cooke. 
 15  # Portions created by the Initial Developer are Copyright (C) 2009-2010 
 16  # Andrew Cooke (andrew@acooke.org). All Rights Reserved. 
 17  # 
 18  # Alternatively, the contents of this file may be used under the terms 
 19  # of the LGPL license (the GNU Lesser General Public License, 
 20  # http://www.gnu.org/licenses/lgpl.html), in which case the provisions 
 21  # of the LGPL License are applicable instead of those above. 
 22  # 
 23  # If you wish to allow use of your version of this file only under the 
 24  # terms of the LGPL License and not to allow others to use your version 
 25  # of this file under the MPL, indicate your decision by deleting the 
 26  # provisions above and replace them with the notice and other provisions 
 27  # required by the LGPL License.  If you do not delete the provisions 
 28  # above, a recipient may use your version of this file under either the 
 29  # MPL or the LGPL License. 
 30   
 31  ''' 
 32  Support for managing sets of intervals (not all used).   
 33  An interval is an open range implemented as a tuple pair.  For example (2,5)  
 34  is an interval that represents the integers 2,3,4 and 5. 
 35  ''' 
 36   
 37  from bisect import bisect_left 
 38  from collections import deque 
 39   
 40   
41 -class Intervals(object):
42 ''' 43 A set of possible values for a character, described as a collection of 44 intervals. Each interval is [a, b] (ie a <= x <= b, where x is a character 45 code). We use open bounds to avoid having to specify an "out of range" 46 value, making it easier to work with a variety of alphabets. 47 48 The intervals are stored in a list, ordered by a, joining overlapping 49 intervals as necessary. 50 ''' 51 52 # pylint: disable-msg=C0103 53 # (use (a,b) variables consistently) 54
55 - def __init__(self, intervals, alphabet):
56 self.alphabet = alphabet 57 self.__intervals = deque() 58 for interval in intervals: 59 self.__append(interval) 60 self.__intervals = list(self.__intervals) 61 self.__str = alphabet.fmt_intervals(self.__intervals) 62 self.__index = [] 63 self.state = None 64 self.__build_index()
65
66 - def append(self, interval):
67 ''' 68 Add an interval to the range. 69 ''' 70 self.__append(interval) 71 self.__build_index()
72
73 - def __build_index(self):
74 ''' 75 Pre-construct the index used for bisection. 76 ''' 77 self.__index = [interval[1] for interval in self.__intervals]
78
79 - def __append(self, interval):
80 ''' 81 Add an interval to the existing intervals. 82 83 This maintains self.__intervals in the normalized form described above. 84 ''' 85 (a1, b1) = interval 86 if b1 < a1: 87 (a1, b1) = (b1, a1) 88 intervals = deque() 89 done = False 90 while self.__intervals: 91 # pylint: disable-msg=E1103 92 # (pylint fails to infer type) 93 (a0, b0) = self.__intervals.popleft() 94 if a0 <= a1: 95 if b0 < a1 and b0 != self.alphabet.before(a1): 96 # old interval starts and ends before new interval 97 # so keep old interval and continue 98 intervals.append((a0, b0)) 99 elif b1 <= b0: 100 # old interval starts before and ends after new interval 101 # so keep old interval, discard new interval and slurp 102 intervals.append((a0, b0)) 103 done = True 104 break 105 else: 106 # old interval starts before new, but partially overlaps 107 # so discard old interval, extend new interval and continue 108 # (since it may overlap more intervals...) 109 (a1, b1) = (a0, b1) 110 else: 111 if b1 < a0 and b1 != self.alphabet.before(a0): 112 # new interval starts and ends before old, so add both 113 # and slurp 114 intervals.append((a1, b1)) 115 intervals.append((a0, b0)) 116 done = True 117 break 118 elif b0 <= b1: 119 # new interval starts before and ends after old interval 120 # so discard old and continue (since it may overlap...) 121 pass 122 else: 123 # new interval starts before old, but partially overlaps, 124 # add extended interval and slurp rest 125 intervals.append((a1, b0)) 126 done = True 127 break 128 if not done: 129 intervals.append((a1, b1)) 130 intervals.extend(self.__intervals) # slurp remaining 131 self.__intervals = intervals
132
133 - def __str__(self):
134 return self.__str
135
136 - def __repr__(self):
137 return self.__str
138
139 - def len(self):
140 ''' 141 The number of intervals in the range. 142 ''' 143 return len(self.__intervals)
144
145 - def __getitem__(self, index):
146 return self.__intervals[index]
147
148 - def __iter__(self):
149 return iter(self.__intervals)
150
151 - def __contains__(self, c):
152 ''' 153 Does the value lie within the intervals? 154 ''' 155 if self.__index: 156 index = bisect_left(self.__index, c) 157 if index < len(self.__intervals): 158 (a, b) = self.__intervals[index] 159 return a <= c <= b 160 return False
161
162 - def __hash__(self):
163 return hash(self.__str)
164
165 - def __eq__(self, other):
166 # pylint: disable-msg=W0212 167 # (test for same class) 168 return isinstance(other, Intervals) and self.__str == other.__str
169 170
171 -class _Character(Node):
172 ''' 173 A set of intervals that is part of a DFA/NFA graph. This is separate 174 from the `Intervals` instance since we need to clone the node, but can 175 keep the intervals (necessary so that node equality works correctly when 176 the same character is used more than once). 177 ''' 178
179 - def __init__(self, intervals):
180 super(_Character, self).__init__(intervals) 181 self.__intervals = intervals
182
183 - def append(self, interval):
184 ''' 185 Add an interval to the range. 186 ''' 187 self.__intervals.append(interval)
188
189 - def __str__(self):
190 return str(self.__intervals)
191
192 - def __repr__(self):
193 return repr(self.__intervals)
194
195 - def len(self):
196 ''' 197 The number of intervals in the range. 198 ''' 199 return len(self.__intervals)
200
201 - def __getitem__(self, index):
202 return self.__intervals[index]
203
204 - def __iter__(self):
205 return iter(self.__intervals)
206
207 - def __contains__(self, c):
208 return c in self.__intervals
209
210 - def build(self, graph, src, dest):
211 ''' 212 Insert within an NFA graph (although at this level, it's not clear it's 213 NFA). 214 ''' 215 graph.connect(src, dest, self)
216 217
218 -def Character(intervals, alphabet):
219 ''' 220 Allow simple construction of Character instances. 221 ''' 222 return _Character(Intervals(intervals, alphabet))
223 224 225 #class Fragments(object): 226 # ''' 227 # Similar to Character, but each additional interval fragments the list 228 # of intervals, instead of creating a new merged interval. For example, 229 # if (3,5) is added to (1,4) and (7,8) then the result will be the 230 # intervals (1,2), (3,4), (5,5) and (7,8) - the interval (3,4) is the 231 # overlap between (1,4) and (3,5). 232 # 233 # Used internally to combine transitions. 234 # ''' 235 # 236 # # pylint: disable-msg=C0103 237 # # (use (a,b) variables consistently) 238 # 239 # def __init__(self, alphabet, characters=None): 240 # self.alphabet = alphabet 241 # self.__intervals = deque() 242 # if characters: 243 # for character in characters: 244 # self.append(character) 245 # 246 # def append(self, character): 247 # ''' 248 # Add a character to the intervals. 249 # ''' 250 # assert type(character) is Character 251 # for interval in character: 252 # self.__append(interval) 253 # 254 # def __append(self, interval): 255 # ''' 256 # Add an interval to the existing intervals. 257 # ''' 258 # (a1, b1) = interval 259 # if b1 < a1: 260 # (a1, b1) = (b1, a1) 261 # intervals = deque() 262 # alphabet = self.alphabet 263 # done = False 264 # while self.__intervals: 265 # (a0, b0) = self.__intervals.popleft() 266 # if a0 <= a1: 267 # if b0 < a1: 268 # # old interval starts and ends before new interval 269 # # so keep old interval and continue 270 # intervals.append((a0, b0)) 271 # elif b1 <= b0: 272 # # old interval starts before or with and ends after or with 273 # # new interval 274 # # so we have one, two or three new intervals 275 # if a0 < a1: 276 # # first part of old 277 # intervals.append((a0, alphabet.before(a1))) 278 # # common to both 279 # intervals.append((a1, b1)) 280 # if b1 < b0: 281 # # last part of old 282 # intervals.append((alphabet.after(b1), b0)) 283 # done = True 284 # break 285 # else: 286 # # old interval starts before new, but partially overlaps 287 # # so split old and continue 288 # # (since it may overlap more intervals...) 289 # if a0 < a1: 290 # # first part of old 291 # intervals.append((a0, alphabet.before(a1))) 292 # # common to both 293 # intervals.append((a1, b0)) 294 # a1 = alphabet.after(b0) 295 # else: 296 # if b1 < a0: 297 # # new interval starts and ends before old 298 # intervals.append((a1, b1)) 299 # intervals.append((a0, b0)) 300 # done = True 301 # break 302 # elif b0 <= b1: 303 # # new interval starts before and ends after or with old 304 # # interval 305 # # so split and continue if extends (since last part may 306 # # overlap...) 307 # # first part of new 308 # intervals.append((a1, alphabet.before(a0))) 309 # # overlap 310 # intervals.append((a0, b0)) 311 # if b1 > b0: 312 # a1 = alphabet.after(b0) 313 # else: 314 # done = True 315 # break 316 # else: 317 # # new interval starts before old, but partially overlaps, 318 # # split and slurp rest 319 # # first part of new 320 # intervals.append((a1, alphabet.before(a0))) 321 # # overlap 322 # intervals.append((a0, b1)) 323 # # last part of old 324 # intervals.append((alphabet.after(b1), b0)) 325 # done = True 326 # break 327 # if not done: 328 # intervals.append((a1, b1)) 329 # intervals.extend(self.__intervals) # slurp remaining 330 # self.__intervals = intervals 331 # 332 # def len(self): 333 # ''' 334 # The number of intervals contained. 335 # ''' 336 # return len(self.__intervals) 337 # 338 # def __getitem__(self, index): 339 # return self.__intervals[index] 340 # 341 # def __iter__(self): 342 # return iter(self.__intervals) 343 344
345 -class IntervalMap(dict):
346 ''' 347 Map from intervals to values. 348 349 Note - this is for open intervals! This means it will not work as 350 expected for continuous variables (which will overlap when two intervals 351 share a single boundary value). In other words, you cannot store 352 (1,2) and (2,3) together because both contain 2. 353 ''' 354 355 # pylint: disable-msg=C0103 356 # (use (a,b) variables consistently) 357
358 - def __init__(self):
359 super(IntervalMap, self).__init__() 360 self.__intervals = [] 361 # None is used as a flag to indicate that a new index is needed 362 self.__index = None
363
364 - def index(self):
365 ''' 366 Build the internal indices. Called automatically when necessary. 367 ''' 368 second = lambda x: x[1] 369 self.__intervals = list(sorted(self.keys(), key=second)) 370 # pylint: disable-msg=W0141 371 self.__index = list(map(second, self.__intervals))
372
373 - def __setitem__(self, interval, value):
374 # these are rather inefficient, but perhaps useful during development 375 # assert None == self[interval[0]], 'Overlap' 376 # assert None == self[interval[1]], 'Overlap' 377 self.__index = None 378 super(IntervalMap, self).__setitem__(interval, value)
379
380 - def __getitem__(self, point):
381 ''' 382 The argument here is a single value, not an interval. 383 ''' 384 if self.__index is None: 385 self.index() 386 if self.__index: 387 try: 388 index = bisect_left(self.__index, point) 389 except TypeError as e: 390 from lepl.regexp.core import RegexpError 391 raise RegexpError( 392 """Input characters are inconsistent with the given alphabet. 393 This error is often triggered by using the default configuration with non-text input; disable with matcher.config.no_compile_to_regexp(). 394 Alternatively, configure with a suitable alphabet.""") 395 if index < len(self.__index): 396 # keep interval for identity on retrieval, just in case 397 (a, b) = interval = self.__intervals[index] 398 if a <= point <= b: 399 return super(IntervalMap, self).__getitem__(interval) 400 return None
401
402 - def __delitem__(self, interval):
403 self.__index = None 404 super(IntervalMap, self).__delitem__(interval)
405
406 - def __contains__(self, interval):
407 (a, b) = interval 408 return self[a] is not None or self[b] is not None
409 410
411 -class TaggedFragments(object):
412 ''' 413 Similar to Fragments, but associates a value with each initial interval; 414 on retrieval returns a list of all values associated with fragment. 415 ''' 416 417 # pylint: disable-msg=C0103 418 # (use (a,b) variables consistently) 419
420 - def __init__(self, alphabet):
421 self.alphabet = alphabet 422 self.__intervals = deque()
423
424 - def append(self, character, value):
425 ''' 426 Add a range and tag. 427 ''' 428 assert type(character) is _Character 429 for interval in character: 430 self.__append(interval, [value])
431
432 - def __append(self, interval, v1):
433 ''' 434 Add an interval to the existing intervals. 435 ''' 436 (a1, b1) = interval 437 if b1 < a1: 438 (a1, b1) = (b1, a1) 439 intervals = deque() 440 alphabet = self.alphabet 441 done = False 442 while self.__intervals: 443 ((a0, b0), v0) = self.__intervals.popleft() 444 if a0 <= a1: 445 if b0 < a1: 446 # old interval starts and ends before new interval 447 # so keep old interval and continue 448 intervals.append(((a0, b0), v0)) 449 elif b1 <= b0: 450 # old interval starts before or with and ends after or with 451 # new interval 452 # so we have one, two or three new intervals 453 if a0 < a1: 454 # first part of old 455 intervals.append(((a0, alphabet.before(a1)), v0)) 456 # common to both 457 intervals.append(((a1, b1), v0+v1)) 458 if b1 < b0: 459 # last part of old 460 intervals.append(((alphabet.after(b1), b0), v0)) 461 done = True 462 break 463 else: 464 # old interval starts before new, but partially overlaps 465 # so split old and continue 466 # (since new may overlap more intervals...) 467 if a0 < a1: 468 # first part of old 469 intervals.append(((a0, alphabet.before(a1)), v0)) 470 # common to both 471 intervals.append(((a1, b0), v0+v1)) 472 a1 = alphabet.after(b0) 473 else: 474 if b1 < a0: 475 # new interval starts and ends before old 476 intervals.append(((a1, b1), v1)) 477 intervals.append(((a0, b0), v0)) 478 done = True 479 break 480 elif b0 <= b1: 481 # new interval starts before and ends after or with old 482 # interval 483 # so split and continue if extends (since last part may 484 # overlap...) 485 # first part of new 486 intervals.append(((a1, alphabet.before(a0)), v1)) 487 # old 488 intervals.append(((a0, b0), v0+v1)) 489 if b1 > b0: 490 a1 = alphabet.after(b0) 491 else: 492 done = True 493 break 494 else: 495 # new interval starts before old, but partially overlaps, 496 # split and slurp rest 497 # first part of new 498 intervals.append(((a1, alphabet.before(a0)), v1)) 499 # overlap 500 intervals.append(((a0, b1), v0+v1)) 501 # last part of old 502 intervals.append(((alphabet.after(b1), b0), v0)) 503 done = True 504 break 505 if not done: 506 intervals.append(((a1, b1), v1)) 507 intervals.extend(self.__intervals) # slurp remaining 508 self.__intervals = intervals
509
510 - def len(self):
511 ''' 512 The number of intervals contained. 513 ''' 514 return len(self.__intervals)
515
516 - def __getitem__(self, index):
517 return self.__intervals[index]
518
519 - def __iter__(self):
520 return iter(self.__intervals)
521