Catch situations where currentframe() returns None. See SF patch #1447410, this is...
[python.git] / Lib / sre_parse.py
blob319bf43b33f048d8f4528e876592f7ca7b71a591
2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
15 import sys
17 from sre_constants import *
19 def set(seq):
20 s = {}
21 for elem in seq:
22 s[elem] = 1
23 return s
25 SPECIAL_CHARS = ".\\[{()*+?^$|"
26 REPEAT_CHARS = "*+?{"
28 DIGITS = set("0123456789")
30 OCTDIGITS = set("01234567")
31 HEXDIGITS = set("0123456789abcdefABCDEF")
33 WHITESPACE = set(" \t\n\r\v\f")
35 ESCAPES = {
36 r"\a": (LITERAL, ord("\a")),
37 r"\b": (LITERAL, ord("\b")),
38 r"\f": (LITERAL, ord("\f")),
39 r"\n": (LITERAL, ord("\n")),
40 r"\r": (LITERAL, ord("\r")),
41 r"\t": (LITERAL, ord("\t")),
42 r"\v": (LITERAL, ord("\v")),
43 r"\\": (LITERAL, ord("\\"))
46 CATEGORIES = {
47 r"\A": (AT, AT_BEGINNING_STRING), # start of string
48 r"\b": (AT, AT_BOUNDARY),
49 r"\B": (AT, AT_NON_BOUNDARY),
50 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
51 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
52 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
53 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
54 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
55 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
56 r"\Z": (AT, AT_END_STRING), # end of string
59 FLAGS = {
60 # standard flags
61 "i": SRE_FLAG_IGNORECASE,
62 "L": SRE_FLAG_LOCALE,
63 "m": SRE_FLAG_MULTILINE,
64 "s": SRE_FLAG_DOTALL,
65 "x": SRE_FLAG_VERBOSE,
66 # extensions
67 "t": SRE_FLAG_TEMPLATE,
68 "u": SRE_FLAG_UNICODE,
71 class Pattern:
72 # master pattern object. keeps track of global attributes
73 def __init__(self):
74 self.flags = 0
75 self.open = []
76 self.groups = 1
77 self.groupdict = {}
78 def opengroup(self, name=None):
79 gid = self.groups
80 self.groups = gid + 1
81 if name is not None:
82 ogid = self.groupdict.get(name, None)
83 if ogid is not None:
84 raise error, ("redefinition of group name %s as group %d; "
85 "was group %d" % (repr(name), gid, ogid))
86 self.groupdict[name] = gid
87 self.open.append(gid)
88 return gid
89 def closegroup(self, gid):
90 self.open.remove(gid)
91 def checkgroup(self, gid):
92 return gid < self.groups and gid not in self.open
94 class SubPattern:
95 # a subpattern, in intermediate form
96 def __init__(self, pattern, data=None):
97 self.pattern = pattern
98 if data is None:
99 data = []
100 self.data = data
101 self.width = None
102 def dump(self, level=0):
103 nl = 1
104 seqtypes = type(()), type([])
105 for op, av in self.data:
106 print level*" " + op,; nl = 0
107 if op == "in":
108 # member sublanguage
109 print; nl = 1
110 for op, a in av:
111 print (level+1)*" " + op, a
112 elif op == "branch":
113 print; nl = 1
114 i = 0
115 for a in av[1]:
116 if i > 0:
117 print level*" " + "or"
118 a.dump(level+1); nl = 1
119 i = i + 1
120 elif type(av) in seqtypes:
121 for a in av:
122 if isinstance(a, SubPattern):
123 if not nl: print
124 a.dump(level+1); nl = 1
125 else:
126 print a, ; nl = 0
127 else:
128 print av, ; nl = 0
129 if not nl: print
130 def __repr__(self):
131 return repr(self.data)
132 def __len__(self):
133 return len(self.data)
134 def __delitem__(self, index):
135 del self.data[index]
136 def __getitem__(self, index):
137 return self.data[index]
138 def __setitem__(self, index, code):
139 self.data[index] = code
140 def __getslice__(self, start, stop):
141 return SubPattern(self.pattern, self.data[start:stop])
142 def insert(self, index, code):
143 self.data.insert(index, code)
144 def append(self, code):
145 self.data.append(code)
146 def getwidth(self):
147 # determine the width (min, max) for this subpattern
148 if self.width:
149 return self.width
150 lo = hi = 0L
151 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
152 REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
153 for op, av in self.data:
154 if op is BRANCH:
155 i = sys.maxint
156 j = 0
157 for av in av[1]:
158 l, h = av.getwidth()
159 i = min(i, l)
160 j = max(j, h)
161 lo = lo + i
162 hi = hi + j
163 elif op is CALL:
164 i, j = av.getwidth()
165 lo = lo + i
166 hi = hi + j
167 elif op is SUBPATTERN:
168 i, j = av[1].getwidth()
169 lo = lo + i
170 hi = hi + j
171 elif op in REPEATCODES:
172 i, j = av[2].getwidth()
173 lo = lo + long(i) * av[0]
174 hi = hi + long(j) * av[1]
175 elif op in UNITCODES:
176 lo = lo + 1
177 hi = hi + 1
178 elif op == SUCCESS:
179 break
180 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
181 return self.width
183 class Tokenizer:
184 def __init__(self, string):
185 self.string = string
186 self.index = 0
187 self.__next()
188 def __next(self):
189 if self.index >= len(self.string):
190 self.next = None
191 return
192 char = self.string[self.index]
193 if char[0] == "\\":
194 try:
195 c = self.string[self.index + 1]
196 except IndexError:
197 raise error, "bogus escape (end of line)"
198 char = char + c
199 self.index = self.index + len(char)
200 self.next = char
201 def match(self, char, skip=1):
202 if char == self.next:
203 if skip:
204 self.__next()
205 return 1
206 return 0
207 def get(self):
208 this = self.next
209 self.__next()
210 return this
211 def tell(self):
212 return self.index, self.next
213 def seek(self, index):
214 self.index, self.next = index
216 def isident(char):
217 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
219 def isdigit(char):
220 return "0" <= char <= "9"
222 def isname(name):
223 # check that group name is a valid string
224 if not isident(name[0]):
225 return False
226 for char in name[1:]:
227 if not isident(char) and not isdigit(char):
228 return False
229 return True
231 def _class_escape(source, escape):
232 # handle escape code inside character class
233 code = ESCAPES.get(escape)
234 if code:
235 return code
236 code = CATEGORIES.get(escape)
237 if code:
238 return code
239 try:
240 c = escape[1:2]
241 if c == "x":
242 # hexadecimal escape (exactly two digits)
243 while source.next in HEXDIGITS and len(escape) < 4:
244 escape = escape + source.get()
245 escape = escape[2:]
246 if len(escape) != 2:
247 raise error, "bogus escape: %s" % repr("\\" + escape)
248 return LITERAL, int(escape, 16) & 0xff
249 elif c in OCTDIGITS:
250 # octal escape (up to three digits)
251 while source.next in OCTDIGITS and len(escape) < 4:
252 escape = escape + source.get()
253 escape = escape[1:]
254 return LITERAL, int(escape, 8) & 0xff
255 elif c in DIGITS:
256 raise error, "bogus escape: %s" % repr(escape)
257 if len(escape) == 2:
258 return LITERAL, ord(escape[1])
259 except ValueError:
260 pass
261 raise error, "bogus escape: %s" % repr(escape)
263 def _escape(source, escape, state):
264 # handle escape code in expression
265 code = CATEGORIES.get(escape)
266 if code:
267 return code
268 code = ESCAPES.get(escape)
269 if code:
270 return code
271 try:
272 c = escape[1:2]
273 if c == "x":
274 # hexadecimal escape
275 while source.next in HEXDIGITS and len(escape) < 4:
276 escape = escape + source.get()
277 if len(escape) != 4:
278 raise ValueError
279 return LITERAL, int(escape[2:], 16) & 0xff
280 elif c == "0":
281 # octal escape
282 while source.next in OCTDIGITS and len(escape) < 4:
283 escape = escape + source.get()
284 return LITERAL, int(escape[1:], 8) & 0xff
285 elif c in DIGITS:
286 # octal escape *or* decimal group reference (sigh)
287 if source.next in DIGITS:
288 escape = escape + source.get()
289 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
290 source.next in OCTDIGITS):
291 # got three octal digits; this is an octal escape
292 escape = escape + source.get()
293 return LITERAL, int(escape[1:], 8) & 0xff
294 # not an octal escape, so this is a group reference
295 group = int(escape[1:])
296 if group < state.groups:
297 if not state.checkgroup(group):
298 raise error, "cannot refer to open group"
299 return GROUPREF, group
300 raise ValueError
301 if len(escape) == 2:
302 return LITERAL, ord(escape[1])
303 except ValueError:
304 pass
305 raise error, "bogus escape: %s" % repr(escape)
307 def _parse_sub(source, state, nested=1):
308 # parse an alternation: a|b|c
310 items = []
311 itemsappend = items.append
312 sourcematch = source.match
313 while 1:
314 itemsappend(_parse(source, state))
315 if sourcematch("|"):
316 continue
317 if not nested:
318 break
319 if not source.next or sourcematch(")", 0):
320 break
321 else:
322 raise error, "pattern not properly closed"
324 if len(items) == 1:
325 return items[0]
327 subpattern = SubPattern(state)
328 subpatternappend = subpattern.append
330 # check if all items share a common prefix
331 while 1:
332 prefix = None
333 for item in items:
334 if not item:
335 break
336 if prefix is None:
337 prefix = item[0]
338 elif item[0] != prefix:
339 break
340 else:
341 # all subitems start with a common "prefix".
342 # move it out of the branch
343 for item in items:
344 del item[0]
345 subpatternappend(prefix)
346 continue # check next one
347 break
349 # check if the branch can be replaced by a character set
350 for item in items:
351 if len(item) != 1 or item[0][0] != LITERAL:
352 break
353 else:
354 # we can store this as a character set instead of a
355 # branch (the compiler may optimize this even more)
356 set = []
357 setappend = set.append
358 for item in items:
359 setappend(item[0])
360 subpatternappend((IN, set))
361 return subpattern
363 subpattern.append((BRANCH, (None, items)))
364 return subpattern
366 def _parse_sub_cond(source, state, condgroup):
367 item_yes = _parse(source, state)
368 if source.match("|"):
369 item_no = _parse(source, state)
370 if source.match("|"):
371 raise error, "conditional backref with more than two branches"
372 else:
373 item_no = None
374 if source.next and not source.match(")", 0):
375 raise error, "pattern not properly closed"
376 subpattern = SubPattern(state)
377 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
378 return subpattern
380 _PATTERNENDERS = set("|)")
381 _ASSERTCHARS = set("=!<")
382 _LOOKBEHINDASSERTCHARS = set("=!")
383 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
385 def _parse(source, state):
386 # parse a simple pattern
387 subpattern = SubPattern(state)
389 # precompute constants into local variables
390 subpatternappend = subpattern.append
391 sourceget = source.get
392 sourcematch = source.match
393 _len = len
394 PATTERNENDERS = _PATTERNENDERS
395 ASSERTCHARS = _ASSERTCHARS
396 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
397 REPEATCODES = _REPEATCODES
399 while 1:
401 if source.next in PATTERNENDERS:
402 break # end of subpattern
403 this = sourceget()
404 if this is None:
405 break # end of pattern
407 if state.flags & SRE_FLAG_VERBOSE:
408 # skip whitespace and comments
409 if this in WHITESPACE:
410 continue
411 if this == "#":
412 while 1:
413 this = sourceget()
414 if this in (None, "\n"):
415 break
416 continue
418 if this and this[0] not in SPECIAL_CHARS:
419 subpatternappend((LITERAL, ord(this)))
421 elif this == "[":
422 # character set
423 set = []
424 setappend = set.append
425 ## if sourcematch(":"):
426 ## pass # handle character classes
427 if sourcematch("^"):
428 setappend((NEGATE, None))
429 # check remaining characters
430 start = set[:]
431 while 1:
432 this = sourceget()
433 if this == "]" and set != start:
434 break
435 elif this and this[0] == "\\":
436 code1 = _class_escape(source, this)
437 elif this:
438 code1 = LITERAL, ord(this)
439 else:
440 raise error, "unexpected end of regular expression"
441 if sourcematch("-"):
442 # potential range
443 this = sourceget()
444 if this == "]":
445 if code1[0] is IN:
446 code1 = code1[1][0]
447 setappend(code1)
448 setappend((LITERAL, ord("-")))
449 break
450 elif this:
451 if this[0] == "\\":
452 code2 = _class_escape(source, this)
453 else:
454 code2 = LITERAL, ord(this)
455 if code1[0] != LITERAL or code2[0] != LITERAL:
456 raise error, "bad character range"
457 lo = code1[1]
458 hi = code2[1]
459 if hi < lo:
460 raise error, "bad character range"
461 setappend((RANGE, (lo, hi)))
462 else:
463 raise error, "unexpected end of regular expression"
464 else:
465 if code1[0] is IN:
466 code1 = code1[1][0]
467 setappend(code1)
469 # XXX: <fl> should move set optimization to compiler!
470 if _len(set)==1 and set[0][0] is LITERAL:
471 subpatternappend(set[0]) # optimization
472 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
473 subpatternappend((NOT_LITERAL, set[1][1])) # optimization
474 else:
475 # XXX: <fl> should add charmap optimization here
476 subpatternappend((IN, set))
478 elif this and this[0] in REPEAT_CHARS:
479 # repeat previous item
480 if this == "?":
481 min, max = 0, 1
482 elif this == "*":
483 min, max = 0, MAXREPEAT
485 elif this == "+":
486 min, max = 1, MAXREPEAT
487 elif this == "{":
488 if source.next == "}":
489 subpatternappend((LITERAL, ord(this)))
490 continue
491 here = source.tell()
492 min, max = 0, MAXREPEAT
493 lo = hi = ""
494 while source.next in DIGITS:
495 lo = lo + source.get()
496 if sourcematch(","):
497 while source.next in DIGITS:
498 hi = hi + sourceget()
499 else:
500 hi = lo
501 if not sourcematch("}"):
502 subpatternappend((LITERAL, ord(this)))
503 source.seek(here)
504 continue
505 if lo:
506 min = int(lo)
507 if hi:
508 max = int(hi)
509 if max < min:
510 raise error, "bad repeat interval"
511 else:
512 raise error, "not supported"
513 # figure out which item to repeat
514 if subpattern:
515 item = subpattern[-1:]
516 else:
517 item = None
518 if not item or (_len(item) == 1 and item[0][0] == AT):
519 raise error, "nothing to repeat"
520 if item[0][0] in REPEATCODES:
521 raise error, "multiple repeat"
522 if sourcematch("?"):
523 subpattern[-1] = (MIN_REPEAT, (min, max, item))
524 else:
525 subpattern[-1] = (MAX_REPEAT, (min, max, item))
527 elif this == ".":
528 subpatternappend((ANY, None))
530 elif this == "(":
531 group = 1
532 name = None
533 condgroup = None
534 if sourcematch("?"):
535 group = 0
536 # options
537 if sourcematch("P"):
538 # python extensions
539 if sourcematch("<"):
540 # named group: skip forward to end of name
541 name = ""
542 while 1:
543 char = sourceget()
544 if char is None:
545 raise error, "unterminated name"
546 if char == ">":
547 break
548 name = name + char
549 group = 1
550 if not isname(name):
551 raise error, "bad character in group name"
552 elif sourcematch("="):
553 # named backreference
554 name = ""
555 while 1:
556 char = sourceget()
557 if char is None:
558 raise error, "unterminated name"
559 if char == ")":
560 break
561 name = name + char
562 if not isname(name):
563 raise error, "bad character in group name"
564 gid = state.groupdict.get(name)
565 if gid is None:
566 raise error, "unknown group name"
567 subpatternappend((GROUPREF, gid))
568 continue
569 else:
570 char = sourceget()
571 if char is None:
572 raise error, "unexpected end of pattern"
573 raise error, "unknown specifier: ?P%s" % char
574 elif sourcematch(":"):
575 # non-capturing group
576 group = 2
577 elif sourcematch("#"):
578 # comment
579 while 1:
580 if source.next is None or source.next == ")":
581 break
582 sourceget()
583 if not sourcematch(")"):
584 raise error, "unbalanced parenthesis"
585 continue
586 elif source.next in ASSERTCHARS:
587 # lookahead assertions
588 char = sourceget()
589 dir = 1
590 if char == "<":
591 if source.next not in LOOKBEHINDASSERTCHARS:
592 raise error, "syntax error"
593 dir = -1 # lookbehind
594 char = sourceget()
595 p = _parse_sub(source, state)
596 if not sourcematch(")"):
597 raise error, "unbalanced parenthesis"
598 if char == "=":
599 subpatternappend((ASSERT, (dir, p)))
600 else:
601 subpatternappend((ASSERT_NOT, (dir, p)))
602 continue
603 elif sourcematch("("):
604 # conditional backreference group
605 condname = ""
606 while 1:
607 char = sourceget()
608 if char is None:
609 raise error, "unterminated name"
610 if char == ")":
611 break
612 condname = condname + char
613 group = 2
614 if isname(condname):
615 condgroup = state.groupdict.get(condname)
616 if condgroup is None:
617 raise error, "unknown group name"
618 else:
619 try:
620 condgroup = int(condname)
621 except ValueError:
622 raise error, "bad character in group name"
623 else:
624 # flags
625 if not source.next in FLAGS:
626 raise error, "unexpected end of pattern"
627 while source.next in FLAGS:
628 state.flags = state.flags | FLAGS[sourceget()]
629 if group:
630 # parse group contents
631 if group == 2:
632 # anonymous group
633 group = None
634 else:
635 group = state.opengroup(name)
636 if condgroup:
637 p = _parse_sub_cond(source, state, condgroup)
638 else:
639 p = _parse_sub(source, state)
640 if not sourcematch(")"):
641 raise error, "unbalanced parenthesis"
642 if group is not None:
643 state.closegroup(group)
644 subpatternappend((SUBPATTERN, (group, p)))
645 else:
646 while 1:
647 char = sourceget()
648 if char is None:
649 raise error, "unexpected end of pattern"
650 if char == ")":
651 break
652 raise error, "unknown extension"
654 elif this == "^":
655 subpatternappend((AT, AT_BEGINNING))
657 elif this == "$":
658 subpattern.append((AT, AT_END))
660 elif this and this[0] == "\\":
661 code = _escape(source, this, state)
662 subpatternappend(code)
664 else:
665 raise error, "parser error"
667 return subpattern
669 def parse(str, flags=0, pattern=None):
670 # parse 're' pattern into list of (opcode, argument) tuples
672 source = Tokenizer(str)
674 if pattern is None:
675 pattern = Pattern()
676 pattern.flags = flags
677 pattern.str = str
679 p = _parse_sub(source, pattern, 0)
681 tail = source.get()
682 if tail == ")":
683 raise error, "unbalanced parenthesis"
684 elif tail:
685 raise error, "bogus characters at end of regular expression"
687 if flags & SRE_FLAG_DEBUG:
688 p.dump()
690 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
691 # the VERBOSE flag was switched on inside the pattern. to be
692 # on the safe side, we'll parse the whole thing again...
693 return parse(str, p.pattern.flags)
695 return p
697 def parse_template(source, pattern):
698 # parse 're' replacement string into list of literals and
699 # group references
700 s = Tokenizer(source)
701 sget = s.get
702 p = []
703 a = p.append
704 def literal(literal, p=p, pappend=a):
705 if p and p[-1][0] is LITERAL:
706 p[-1] = LITERAL, p[-1][1] + literal
707 else:
708 pappend((LITERAL, literal))
709 sep = source[:0]
710 if type(sep) is type(""):
711 makechar = chr
712 else:
713 makechar = unichr
714 while 1:
715 this = sget()
716 if this is None:
717 break # end of replacement string
718 if this and this[0] == "\\":
719 # group
720 c = this[1:2]
721 if c == "g":
722 name = ""
723 if s.match("<"):
724 while 1:
725 char = sget()
726 if char is None:
727 raise error, "unterminated group name"
728 if char == ">":
729 break
730 name = name + char
731 if not name:
732 raise error, "bad group name"
733 try:
734 index = int(name)
735 if index < 0:
736 raise error, "negative group number"
737 except ValueError:
738 if not isname(name):
739 raise error, "bad character in group name"
740 try:
741 index = pattern.groupindex[name]
742 except KeyError:
743 raise IndexError, "unknown group name"
744 a((MARK, index))
745 elif c == "0":
746 if s.next in OCTDIGITS:
747 this = this + sget()
748 if s.next in OCTDIGITS:
749 this = this + sget()
750 literal(makechar(int(this[1:], 8) & 0xff))
751 elif c in DIGITS:
752 isoctal = False
753 if s.next in DIGITS:
754 this = this + sget()
755 if (c in OCTDIGITS and this[2] in OCTDIGITS and
756 s.next in OCTDIGITS):
757 this = this + sget()
758 isoctal = True
759 literal(makechar(int(this[1:], 8) & 0xff))
760 if not isoctal:
761 a((MARK, int(this[1:])))
762 else:
763 try:
764 this = makechar(ESCAPES[this][1])
765 except KeyError:
766 pass
767 literal(this)
768 else:
769 literal(this)
770 # convert template to groups and literals lists
771 i = 0
772 groups = []
773 groupsappend = groups.append
774 literals = [None] * len(p)
775 for c, s in p:
776 if c is MARK:
777 groupsappend((i, s))
778 # literal[i] is already None
779 else:
780 literals[i] = s
781 i = i + 1
782 return groups, literals
784 def expand_template(template, match):
785 g = match.group
786 sep = match.string[:0]
787 groups, literals = template
788 literals = literals[:]
789 try:
790 for index, group in groups:
791 literals[index] = s = g(group)
792 if s is None:
793 raise error, "unmatched group"
794 except IndexError:
795 raise error, "invalid group reference"
796 return sep.join(literals)