Merged revisions 85328 via svnmerge from
[python/dscho.git] / Lib / sre_parse.py
blob13737ca12f0e3ffd8ce362f26a8309057eadf081
2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
15 import sys
17 from sre_constants import *
19 SPECIAL_CHARS = ".\\[{()*+?^$|"
20 REPEAT_CHARS = "*+?{"
22 DIGITS = set("0123456789")
24 OCTDIGITS = set("01234567")
25 HEXDIGITS = set("0123456789abcdefABCDEF")
27 WHITESPACE = set(" \t\n\r\v\f")
29 ESCAPES = {
30 r"\a": (LITERAL, ord("\a")),
31 r"\b": (LITERAL, ord("\b")),
32 r"\f": (LITERAL, ord("\f")),
33 r"\n": (LITERAL, ord("\n")),
34 r"\r": (LITERAL, ord("\r")),
35 r"\t": (LITERAL, ord("\t")),
36 r"\v": (LITERAL, ord("\v")),
37 r"\\": (LITERAL, ord("\\"))
40 CATEGORIES = {
41 r"\A": (AT, AT_BEGINNING_STRING), # start of string
42 r"\b": (AT, AT_BOUNDARY),
43 r"\B": (AT, AT_NON_BOUNDARY),
44 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
45 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
46 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
47 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
48 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
49 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
50 r"\Z": (AT, AT_END_STRING), # end of string
53 FLAGS = {
54 # standard flags
55 "i": SRE_FLAG_IGNORECASE,
56 "L": SRE_FLAG_LOCALE,
57 "m": SRE_FLAG_MULTILINE,
58 "s": SRE_FLAG_DOTALL,
59 "x": SRE_FLAG_VERBOSE,
60 # extensions
61 "a": SRE_FLAG_ASCII,
62 "t": SRE_FLAG_TEMPLATE,
63 "u": SRE_FLAG_UNICODE,
66 class Pattern:
67 # master pattern object. keeps track of global attributes
68 def __init__(self):
69 self.flags = 0
70 self.open = []
71 self.groups = 1
72 self.groupdict = {}
73 def opengroup(self, name=None):
74 gid = self.groups
75 self.groups = gid + 1
76 if name is not None:
77 ogid = self.groupdict.get(name, None)
78 if ogid is not None:
79 raise error("redefinition of group name %s as group %d; "
80 "was group %d" % (repr(name), gid, ogid))
81 self.groupdict[name] = gid
82 self.open.append(gid)
83 return gid
84 def closegroup(self, gid):
85 self.open.remove(gid)
86 def checkgroup(self, gid):
87 return gid < self.groups and gid not in self.open
89 class SubPattern:
90 # a subpattern, in intermediate form
91 def __init__(self, pattern, data=None):
92 self.pattern = pattern
93 if data is None:
94 data = []
95 self.data = data
96 self.width = None
97 def dump(self, level=0):
98 nl = 1
99 seqtypes = (tuple, list)
100 for op, av in self.data:
101 print(level*" " + op, end=' '); nl = 0
102 if op == "in":
103 # member sublanguage
104 print(); nl = 1
105 for op, a in av:
106 print((level+1)*" " + op, a)
107 elif op == "branch":
108 print(); nl = 1
109 i = 0
110 for a in av[1]:
111 if i > 0:
112 print(level*" " + "or")
113 a.dump(level+1); nl = 1
114 i = i + 1
115 elif isinstance(av, seqtypes):
116 for a in av:
117 if isinstance(a, SubPattern):
118 if not nl: print()
119 a.dump(level+1); nl = 1
120 else:
121 print(a, end=' ') ; nl = 0
122 else:
123 print(av, end=' ') ; nl = 0
124 if not nl: print()
125 def __repr__(self):
126 return repr(self.data)
127 def __len__(self):
128 return len(self.data)
129 def __delitem__(self, index):
130 del self.data[index]
131 def __getitem__(self, index):
132 if isinstance(index, slice):
133 return SubPattern(self.pattern, self.data[index])
134 return self.data[index]
135 def __setitem__(self, index, code):
136 self.data[index] = code
137 def insert(self, index, code):
138 self.data.insert(index, code)
139 def append(self, code):
140 self.data.append(code)
141 def getwidth(self):
142 # determine the width (min, max) for this subpattern
143 if self.width:
144 return self.width
145 lo = hi = 0
146 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
147 REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
148 for op, av in self.data:
149 if op is BRANCH:
150 i = sys.maxsize
151 j = 0
152 for av in av[1]:
153 l, h = av.getwidth()
154 i = min(i, l)
155 j = max(j, h)
156 lo = lo + i
157 hi = hi + j
158 elif op is CALL:
159 i, j = av.getwidth()
160 lo = lo + i
161 hi = hi + j
162 elif op is SUBPATTERN:
163 i, j = av[1].getwidth()
164 lo = lo + i
165 hi = hi + j
166 elif op in REPEATCODES:
167 i, j = av[2].getwidth()
168 lo = lo + int(i) * av[0]
169 hi = hi + int(j) * av[1]
170 elif op in UNITCODES:
171 lo = lo + 1
172 hi = hi + 1
173 elif op == SUCCESS:
174 break
175 self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize))
176 return self.width
178 class Tokenizer:
179 def __init__(self, string):
180 self.string = string
181 self.index = 0
182 self.__next()
183 def __next(self):
184 if self.index >= len(self.string):
185 self.next = None
186 return
187 char = self.string[self.index:self.index+1]
188 # Special case for the str8, since indexing returns a integer
189 # XXX This is only needed for test_bug_926075 in test_re.py
190 if char and isinstance(char, bytes):
191 char = chr(char[0])
192 if char == "\\":
193 try:
194 c = self.string[self.index + 1]
195 except IndexError:
196 raise error("bogus escape (end of line)")
197 if isinstance(self.string, bytes):
198 c = chr(c)
199 char = char + c
200 self.index = self.index + len(char)
201 self.next = char
202 def match(self, char, skip=1):
203 if char == self.next:
204 if skip:
205 self.__next()
206 return 1
207 return 0
208 def get(self):
209 this = self.next
210 self.__next()
211 return this
212 def tell(self):
213 return self.index, self.next
214 def seek(self, index):
215 self.index, self.next = index
217 def isident(char):
218 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
220 def isdigit(char):
221 return "0" <= char <= "9"
223 def isname(name):
224 # check that group name is a valid string
225 if not isident(name[0]):
226 return False
227 for char in name[1:]:
228 if not isident(char) and not isdigit(char):
229 return False
230 return True
232 def _class_escape(source, escape):
233 # handle escape code inside character class
234 code = ESCAPES.get(escape)
235 if code:
236 return code
237 code = CATEGORIES.get(escape)
238 if code:
239 return code
240 try:
241 c = escape[1:2]
242 if c == "x":
243 # hexadecimal escape (exactly two digits)
244 while source.next in HEXDIGITS and len(escape) < 4:
245 escape = escape + source.get()
246 escape = escape[2:]
247 if len(escape) != 2:
248 raise error("bogus escape: %s" % repr("\\" + escape))
249 return LITERAL, int(escape, 16) & 0xff
250 elif c in OCTDIGITS:
251 # octal escape (up to three digits)
252 while source.next in OCTDIGITS and len(escape) < 4:
253 escape = escape + source.get()
254 escape = escape[1:]
255 return LITERAL, int(escape, 8) & 0xff
256 elif c in DIGITS:
257 raise error("bogus escape: %s" % repr(escape))
258 if len(escape) == 2:
259 return LITERAL, ord(escape[1])
260 except ValueError:
261 pass
262 raise error("bogus escape: %s" % repr(escape))
264 def _escape(source, escape, state):
265 # handle escape code in expression
266 code = CATEGORIES.get(escape)
267 if code:
268 return code
269 code = ESCAPES.get(escape)
270 if code:
271 return code
272 try:
273 c = escape[1:2]
274 if c == "x":
275 # hexadecimal escape
276 while source.next in HEXDIGITS and len(escape) < 4:
277 escape = escape + source.get()
278 if len(escape) != 4:
279 raise ValueError
280 return LITERAL, int(escape[2:], 16) & 0xff
281 elif c == "0":
282 # octal escape
283 while source.next in OCTDIGITS and len(escape) < 4:
284 escape = escape + source.get()
285 return LITERAL, int(escape[1:], 8) & 0xff
286 elif c in DIGITS:
287 # octal escape *or* decimal group reference (sigh)
288 if source.next in DIGITS:
289 escape = escape + source.get()
290 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
291 source.next in OCTDIGITS):
292 # got three octal digits; this is an octal escape
293 escape = escape + source.get()
294 return LITERAL, int(escape[1:], 8) & 0xff
295 # not an octal escape, so this is a group reference
296 group = int(escape[1:])
297 if group < state.groups:
298 if not state.checkgroup(group):
299 raise error("cannot refer to open group")
300 return GROUPREF, group
301 raise ValueError
302 if len(escape) == 2:
303 return LITERAL, ord(escape[1])
304 except ValueError:
305 pass
306 raise error("bogus escape: %s" % repr(escape))
308 def _parse_sub(source, state, nested=1):
309 # parse an alternation: a|b|c
311 items = []
312 itemsappend = items.append
313 sourcematch = source.match
314 while 1:
315 itemsappend(_parse(source, state))
316 if sourcematch("|"):
317 continue
318 if not nested:
319 break
320 if not source.next or sourcematch(")", 0):
321 break
322 else:
323 raise error("pattern not properly closed")
325 if len(items) == 1:
326 return items[0]
328 subpattern = SubPattern(state)
329 subpatternappend = subpattern.append
331 # check if all items share a common prefix
332 while 1:
333 prefix = None
334 for item in items:
335 if not item:
336 break
337 if prefix is None:
338 prefix = item[0]
339 elif item[0] != prefix:
340 break
341 else:
342 # all subitems start with a common "prefix".
343 # move it out of the branch
344 for item in items:
345 del item[0]
346 subpatternappend(prefix)
347 continue # check next one
348 break
350 # check if the branch can be replaced by a character set
351 for item in items:
352 if len(item) != 1 or item[0][0] != LITERAL:
353 break
354 else:
355 # we can store this as a character set instead of a
356 # branch (the compiler may optimize this even more)
357 set = []
358 setappend = set.append
359 for item in items:
360 setappend(item[0])
361 subpatternappend((IN, set))
362 return subpattern
364 subpattern.append((BRANCH, (None, items)))
365 return subpattern
367 def _parse_sub_cond(source, state, condgroup):
368 item_yes = _parse(source, state)
369 if source.match("|"):
370 item_no = _parse(source, state)
371 if source.match("|"):
372 raise error("conditional backref with more than two branches")
373 else:
374 item_no = None
375 if source.next and not source.match(")", 0):
376 raise error("pattern not properly closed")
377 subpattern = SubPattern(state)
378 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
379 return subpattern
381 _PATTERNENDERS = set("|)")
382 _ASSERTCHARS = set("=!<")
383 _LOOKBEHINDASSERTCHARS = set("=!")
384 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
386 def _parse(source, state):
387 # parse a simple pattern
388 subpattern = SubPattern(state)
390 # precompute constants into local variables
391 subpatternappend = subpattern.append
392 sourceget = source.get
393 sourcematch = source.match
394 _len = len
395 PATTERNENDERS = _PATTERNENDERS
396 ASSERTCHARS = _ASSERTCHARS
397 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
398 REPEATCODES = _REPEATCODES
400 while 1:
402 if source.next in PATTERNENDERS:
403 break # end of subpattern
404 this = sourceget()
405 if this is None:
406 break # end of pattern
408 if state.flags & SRE_FLAG_VERBOSE:
409 # skip whitespace and comments
410 if this in WHITESPACE:
411 continue
412 if this == "#":
413 while 1:
414 this = sourceget()
415 if this in (None, "\n"):
416 break
417 continue
419 if this and this[0] not in SPECIAL_CHARS:
420 subpatternappend((LITERAL, ord(this)))
422 elif this == "[":
423 # character set
424 set = []
425 setappend = set.append
426 ## if sourcematch(":"):
427 ## pass # handle character classes
428 if sourcematch("^"):
429 setappend((NEGATE, None))
430 # check remaining characters
431 start = set[:]
432 while 1:
433 this = sourceget()
434 if this == "]" and set != start:
435 break
436 elif this and this[0] == "\\":
437 code1 = _class_escape(source, this)
438 elif this:
439 code1 = LITERAL, ord(this)
440 else:
441 raise error("unexpected end of regular expression")
442 if sourcematch("-"):
443 # potential range
444 this = sourceget()
445 if this == "]":
446 if code1[0] is IN:
447 code1 = code1[1][0]
448 setappend(code1)
449 setappend((LITERAL, ord("-")))
450 break
451 elif this:
452 if this[0] == "\\":
453 code2 = _class_escape(source, this)
454 else:
455 code2 = LITERAL, ord(this)
456 if code1[0] != LITERAL or code2[0] != LITERAL:
457 raise error("bad character range")
458 lo = code1[1]
459 hi = code2[1]
460 if hi < lo:
461 raise error("bad character range")
462 setappend((RANGE, (lo, hi)))
463 else:
464 raise error("unexpected end of regular expression")
465 else:
466 if code1[0] is IN:
467 code1 = code1[1][0]
468 setappend(code1)
470 # XXX: <fl> should move set optimization to compiler!
471 if _len(set)==1 and set[0][0] is LITERAL:
472 subpatternappend(set[0]) # optimization
473 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
474 subpatternappend((NOT_LITERAL, set[1][1])) # optimization
475 else:
476 # XXX: <fl> should add charmap optimization here
477 subpatternappend((IN, set))
479 elif this and this[0] in REPEAT_CHARS:
480 # repeat previous item
481 if this == "?":
482 min, max = 0, 1
483 elif this == "*":
484 min, max = 0, MAXREPEAT
486 elif this == "+":
487 min, max = 1, MAXREPEAT
488 elif this == "{":
489 if source.next == "}":
490 subpatternappend((LITERAL, ord(this)))
491 continue
492 here = source.tell()
493 min, max = 0, MAXREPEAT
494 lo = hi = ""
495 while source.next in DIGITS:
496 lo = lo + source.get()
497 if sourcematch(","):
498 while source.next in DIGITS:
499 hi = hi + sourceget()
500 else:
501 hi = lo
502 if not sourcematch("}"):
503 subpatternappend((LITERAL, ord(this)))
504 source.seek(here)
505 continue
506 if lo:
507 min = int(lo)
508 if hi:
509 max = int(hi)
510 if max < min:
511 raise error("bad repeat interval")
512 else:
513 raise error("not supported")
514 # figure out which item to repeat
515 if subpattern:
516 item = subpattern[-1:]
517 else:
518 item = None
519 if not item or (_len(item) == 1 and item[0][0] == AT):
520 raise error("nothing to repeat")
521 if item[0][0] in REPEATCODES:
522 raise error("multiple repeat")
523 if sourcematch("?"):
524 subpattern[-1] = (MIN_REPEAT, (min, max, item))
525 else:
526 subpattern[-1] = (MAX_REPEAT, (min, max, item))
528 elif this == ".":
529 subpatternappend((ANY, None))
531 elif this == "(":
532 group = 1
533 name = None
534 condgroup = None
535 if sourcematch("?"):
536 group = 0
537 # options
538 if sourcematch("P"):
539 # python extensions
540 if sourcematch("<"):
541 # named group: skip forward to end of name
542 name = ""
543 while 1:
544 char = sourceget()
545 if char is None:
546 raise error("unterminated name")
547 if char == ">":
548 break
549 name = name + char
550 group = 1
551 if not isname(name):
552 raise error("bad character in group name")
553 elif sourcematch("="):
554 # named backreference
555 name = ""
556 while 1:
557 char = sourceget()
558 if char is None:
559 raise error("unterminated name")
560 if char == ")":
561 break
562 name = name + char
563 if not isname(name):
564 raise error("bad character in group name")
565 gid = state.groupdict.get(name)
566 if gid is None:
567 raise error("unknown group name")
568 subpatternappend((GROUPREF, gid))
569 continue
570 else:
571 char = sourceget()
572 if char is None:
573 raise error("unexpected end of pattern")
574 raise error("unknown specifier: ?P%s" % char)
575 elif sourcematch(":"):
576 # non-capturing group
577 group = 2
578 elif sourcematch("#"):
579 # comment
580 while 1:
581 if source.next is None or source.next == ")":
582 break
583 sourceget()
584 if not sourcematch(")"):
585 raise error("unbalanced parenthesis")
586 continue
587 elif source.next in ASSERTCHARS:
588 # lookahead assertions
589 char = sourceget()
590 dir = 1
591 if char == "<":
592 if source.next not in LOOKBEHINDASSERTCHARS:
593 raise error("syntax error")
594 dir = -1 # lookbehind
595 char = sourceget()
596 p = _parse_sub(source, state)
597 if not sourcematch(")"):
598 raise error("unbalanced parenthesis")
599 if char == "=":
600 subpatternappend((ASSERT, (dir, p)))
601 else:
602 subpatternappend((ASSERT_NOT, (dir, p)))
603 continue
604 elif sourcematch("("):
605 # conditional backreference group
606 condname = ""
607 while 1:
608 char = sourceget()
609 if char is None:
610 raise error("unterminated name")
611 if char == ")":
612 break
613 condname = condname + char
614 group = 2
615 if isname(condname):
616 condgroup = state.groupdict.get(condname)
617 if condgroup is None:
618 raise error("unknown group name")
619 else:
620 try:
621 condgroup = int(condname)
622 except ValueError:
623 raise error("bad character in group name")
624 else:
625 # flags
626 if not source.next in FLAGS:
627 raise error("unexpected end of pattern")
628 while source.next in FLAGS:
629 state.flags = state.flags | FLAGS[sourceget()]
630 if group:
631 # parse group contents
632 if group == 2:
633 # anonymous group
634 group = None
635 else:
636 group = state.opengroup(name)
637 if condgroup:
638 p = _parse_sub_cond(source, state, condgroup)
639 else:
640 p = _parse_sub(source, state)
641 if not sourcematch(")"):
642 raise error("unbalanced parenthesis")
643 if group is not None:
644 state.closegroup(group)
645 subpatternappend((SUBPATTERN, (group, p)))
646 else:
647 while 1:
648 char = sourceget()
649 if char is None:
650 raise error("unexpected end of pattern")
651 if char == ")":
652 break
653 raise error("unknown extension")
655 elif this == "^":
656 subpatternappend((AT, AT_BEGINNING))
658 elif this == "$":
659 subpattern.append((AT, AT_END))
661 elif this and this[0] == "\\":
662 code = _escape(source, this, state)
663 subpatternappend(code)
665 else:
666 raise error("parser error")
668 return subpattern
670 def fix_flags(src, flags):
671 # Check and fix flags according to the type of pattern (str or bytes)
672 if isinstance(src, str):
673 if not flags & SRE_FLAG_ASCII:
674 flags |= SRE_FLAG_UNICODE
675 elif flags & SRE_FLAG_UNICODE:
676 raise ValueError("ASCII and UNICODE flags are incompatible")
677 else:
678 if flags & SRE_FLAG_UNICODE:
679 raise ValueError("can't use UNICODE flag with a bytes pattern")
680 return flags
682 def parse(str, flags=0, pattern=None):
683 # parse 're' pattern into list of (opcode, argument) tuples
685 source = Tokenizer(str)
687 if pattern is None:
688 pattern = Pattern()
689 pattern.flags = flags
690 pattern.str = str
692 p = _parse_sub(source, pattern, 0)
693 p.pattern.flags = fix_flags(str, p.pattern.flags)
695 tail = source.get()
696 if tail == ")":
697 raise error("unbalanced parenthesis")
698 elif tail:
699 raise error("bogus characters at end of regular expression")
701 if flags & SRE_FLAG_DEBUG:
702 p.dump()
704 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
705 # the VERBOSE flag was switched on inside the pattern. to be
706 # on the safe side, we'll parse the whole thing again...
707 return parse(str, p.pattern.flags)
709 return p
711 def parse_template(source, pattern):
712 # parse 're' replacement string into list of literals and
713 # group references
714 s = Tokenizer(source)
715 sget = s.get
716 p = []
717 a = p.append
718 def literal(literal, p=p, pappend=a):
719 if p and p[-1][0] is LITERAL:
720 p[-1] = LITERAL, p[-1][1] + literal
721 else:
722 pappend((LITERAL, literal))
723 sep = source[:0]
724 if isinstance(sep, str):
725 makechar = chr
726 else:
727 makechar = chr
728 while 1:
729 this = sget()
730 if this is None:
731 break # end of replacement string
732 if this and this[0] == "\\":
733 # group
734 c = this[1:2]
735 if c == "g":
736 name = ""
737 if s.match("<"):
738 while 1:
739 char = sget()
740 if char is None:
741 raise error("unterminated group name")
742 if char == ">":
743 break
744 name = name + char
745 if not name:
746 raise error("bad group name")
747 try:
748 index = int(name)
749 if index < 0:
750 raise error("negative group number")
751 except ValueError:
752 if not isname(name):
753 raise error("bad character in group name")
754 try:
755 index = pattern.groupindex[name]
756 except KeyError:
757 raise IndexError("unknown group name")
758 a((MARK, index))
759 elif c == "0":
760 if s.next in OCTDIGITS:
761 this = this + sget()
762 if s.next in OCTDIGITS:
763 this = this + sget()
764 literal(makechar(int(this[1:], 8) & 0xff))
765 elif c in DIGITS:
766 isoctal = False
767 if s.next in DIGITS:
768 this = this + sget()
769 if (c in OCTDIGITS and this[2] in OCTDIGITS and
770 s.next in OCTDIGITS):
771 this = this + sget()
772 isoctal = True
773 literal(makechar(int(this[1:], 8) & 0xff))
774 if not isoctal:
775 a((MARK, int(this[1:])))
776 else:
777 try:
778 this = makechar(ESCAPES[this][1])
779 except KeyError:
780 pass
781 literal(this)
782 else:
783 literal(this)
784 # convert template to groups and literals lists
785 i = 0
786 groups = []
787 groupsappend = groups.append
788 literals = [None] * len(p)
789 if isinstance(source, str):
790 encode = lambda x: x
791 else:
792 # The tokenizer implicitly decodes bytes objects as latin-1, we must
793 # therefore re-encode the final representation.
794 encode = lambda x: x.encode('latin1')
795 for c, s in p:
796 if c is MARK:
797 groupsappend((i, s))
798 # literal[i] is already None
799 else:
800 literals[i] = encode(s)
801 i = i + 1
802 return groups, literals
804 def expand_template(template, match):
805 g = match.group
806 sep = match.string[:0]
807 groups, literals = template
808 literals = literals[:]
809 try:
810 for index, group in groups:
811 literals[index] = s = g(group)
812 if s is None:
813 raise error("unmatched group")
814 except IndexError:
815 raise error("invalid group reference")
816 return sep.join(literals)