2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
17 from sre_constants
import *
25 SPECIAL_CHARS
= ".\\[{()*+?^$|"
28 DIGITS
= set("0123456789")
30 OCTDIGITS
= set("01234567")
31 HEXDIGITS
= set("0123456789abcdefABCDEF")
33 WHITESPACE
= set(" \t\n\r\v\f")
36 r
"\a": (LITERAL
, ord("\a")),
37 r
"\b": (LITERAL
, ord("\b")),
38 r
"\f": (LITERAL
, ord("\f")),
39 r
"\n": (LITERAL
, ord("\n")),
40 r
"\r": (LITERAL
, ord("\r")),
41 r
"\t": (LITERAL
, ord("\t")),
42 r
"\v": (LITERAL
, ord("\v")),
43 r
"\\": (LITERAL
, ord("\\"))
47 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
48 r
"\b": (AT
, AT_BOUNDARY
),
49 r
"\B": (AT
, AT_NON_BOUNDARY
),
50 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
51 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
52 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
53 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
54 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
55 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
56 r
"\Z": (AT
, AT_END_STRING
), # end of string
61 "i": SRE_FLAG_IGNORECASE
,
63 "m": SRE_FLAG_MULTILINE
,
65 "x": SRE_FLAG_VERBOSE
,
67 "t": SRE_FLAG_TEMPLATE
,
68 "u": SRE_FLAG_UNICODE
,
72 # master pattern object. keeps track of global attributes
78 def opengroup(self
, name
=None):
82 ogid
= self
.groupdict
.get(name
, None)
84 raise error
, ("redefinition of group name %s as group %d; "
85 "was group %d" % (repr(name
), gid
, ogid
))
86 self
.groupdict
[name
] = gid
89 def closegroup(self
, gid
):
91 def checkgroup(self
, gid
):
92 return gid
< self
.groups
and gid
not in self
.open
95 # a subpattern, in intermediate form
96 def __init__(self
, pattern
, data
=None):
97 self
.pattern
= pattern
102 def dump(self
, level
=0):
104 seqtypes
= type(()), type([])
105 for op
, av
in self
.data
:
106 print level
*" " + op
,; nl
= 0
111 print (level
+1)*" " + op
, a
117 print level
*" " + "or"
118 a
.dump(level
+1); nl
= 1
120 elif type(av
) in seqtypes
:
122 if isinstance(a
, SubPattern
):
124 a
.dump(level
+1); nl
= 1
131 return repr(self
.data
)
133 return len(self
.data
)
134 def __delitem__(self
, index
):
136 def __getitem__(self
, index
):
137 return self
.data
[index
]
138 def __setitem__(self
, index
, code
):
139 self
.data
[index
] = code
140 def __getslice__(self
, start
, stop
):
141 return SubPattern(self
.pattern
, self
.data
[start
:stop
])
142 def insert(self
, index
, code
):
143 self
.data
.insert(index
, code
)
144 def append(self
, code
):
145 self
.data
.append(code
)
147 # determine the width (min, max) for this subpattern
151 UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
152 REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
153 for op
, av
in self
.data
:
167 elif op
is SUBPATTERN
:
168 i
, j
= av
[1].getwidth()
171 elif op
in REPEATCODES
:
172 i
, j
= av
[2].getwidth()
173 lo
= lo
+ long(i
) * av
[0]
174 hi
= hi
+ long(j
) * av
[1]
175 elif op
in UNITCODES
:
180 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
184 def __init__(self
, string
):
189 if self
.index
>= len(self
.string
):
192 char
= self
.string
[self
.index
]
195 c
= self
.string
[self
.index
+ 1]
197 raise error
, "bogus escape (end of line)"
199 self
.index
= self
.index
+ len(char
)
201 def match(self
, char
, skip
=1):
202 if char
== self
.next
:
212 return self
.index
, self
.next
213 def seek(self
, index
):
214 self
.index
, self
.next
= index
217 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
220 return "0" <= char
<= "9"
223 # check that group name is a valid string
224 if not isident(name
[0]):
226 for char
in name
[1:]:
227 if not isident(char
) and not isdigit(char
):
231 def _class_escape(source
, escape
):
232 # handle escape code inside character class
233 code
= ESCAPES
.get(escape
)
236 code
= CATEGORIES
.get(escape
)
242 # hexadecimal escape (exactly two digits)
243 while source
.next
in HEXDIGITS
and len(escape
) < 4:
244 escape
= escape
+ source
.get()
247 raise error
, "bogus escape: %s" % repr("\\" + escape
)
248 return LITERAL
, int(escape
, 16) & 0xff
250 # octal escape (up to three digits)
251 while source
.next
in OCTDIGITS
and len(escape
) < 4:
252 escape
= escape
+ source
.get()
254 return LITERAL
, int(escape
, 8) & 0xff
256 raise error
, "bogus escape: %s" % repr(escape
)
258 return LITERAL
, ord(escape
[1])
261 raise error
, "bogus escape: %s" % repr(escape
)
263 def _escape(source
, escape
, state
):
264 # handle escape code in expression
265 code
= CATEGORIES
.get(escape
)
268 code
= ESCAPES
.get(escape
)
275 while source
.next
in HEXDIGITS
and len(escape
) < 4:
276 escape
= escape
+ source
.get()
279 return LITERAL
, int(escape
[2:], 16) & 0xff
282 while source
.next
in OCTDIGITS
and len(escape
) < 4:
283 escape
= escape
+ source
.get()
284 return LITERAL
, int(escape
[1:], 8) & 0xff
286 # octal escape *or* decimal group reference (sigh)
287 if source
.next
in DIGITS
:
288 escape
= escape
+ source
.get()
289 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
290 source
.next
in OCTDIGITS
):
291 # got three octal digits; this is an octal escape
292 escape
= escape
+ source
.get()
293 return LITERAL
, int(escape
[1:], 8) & 0xff
294 # not an octal escape, so this is a group reference
295 group
= int(escape
[1:])
296 if group
< state
.groups
:
297 if not state
.checkgroup(group
):
298 raise error
, "cannot refer to open group"
299 return GROUPREF
, group
302 return LITERAL
, ord(escape
[1])
305 raise error
, "bogus escape: %s" % repr(escape
)
307 def _parse_sub(source
, state
, nested
=1):
308 # parse an alternation: a|b|c
311 itemsappend
= items
.append
312 sourcematch
= source
.match
314 itemsappend(_parse(source
, state
))
319 if not source
.next
or sourcematch(")", 0):
322 raise error
, "pattern not properly closed"
327 subpattern
= SubPattern(state
)
328 subpatternappend
= subpattern
.append
330 # check if all items share a common prefix
338 elif item
[0] != prefix
:
341 # all subitems start with a common "prefix".
342 # move it out of the branch
345 subpatternappend(prefix
)
346 continue # check next one
349 # check if the branch can be replaced by a character set
351 if len(item
) != 1 or item
[0][0] != LITERAL
:
354 # we can store this as a character set instead of a
355 # branch (the compiler may optimize this even more)
357 setappend
= set.append
360 subpatternappend((IN
, set))
363 subpattern
.append((BRANCH
, (None, items
)))
366 def _parse_sub_cond(source
, state
, condgroup
):
367 item_yes
= _parse(source
, state
)
368 if source
.match("|"):
369 item_no
= _parse(source
, state
)
370 if source
.match("|"):
371 raise error
, "conditional backref with more than two branches"
374 if source
.next
and not source
.match(")", 0):
375 raise error
, "pattern not properly closed"
376 subpattern
= SubPattern(state
)
377 subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
380 _PATTERNENDERS
= set("|)")
381 _ASSERTCHARS
= set("=!<")
382 _LOOKBEHINDASSERTCHARS
= set("=!")
383 _REPEATCODES
= set([MIN_REPEAT
, MAX_REPEAT
])
385 def _parse(source
, state
):
386 # parse a simple pattern
387 subpattern
= SubPattern(state
)
389 # precompute constants into local variables
390 subpatternappend
= subpattern
.append
391 sourceget
= source
.get
392 sourcematch
= source
.match
394 PATTERNENDERS
= _PATTERNENDERS
395 ASSERTCHARS
= _ASSERTCHARS
396 LOOKBEHINDASSERTCHARS
= _LOOKBEHINDASSERTCHARS
397 REPEATCODES
= _REPEATCODES
401 if source
.next
in PATTERNENDERS
:
402 break # end of subpattern
405 break # end of pattern
407 if state
.flags
& SRE_FLAG_VERBOSE
:
408 # skip whitespace and comments
409 if this
in WHITESPACE
:
414 if this
in (None, "\n"):
418 if this
and this
[0] not in SPECIAL_CHARS
:
419 subpatternappend((LITERAL
, ord(this
)))
424 setappend
= set.append
425 ## if sourcematch(":"):
426 ## pass # handle character classes
428 setappend((NEGATE
, None))
429 # check remaining characters
433 if this
== "]" and set != start
:
435 elif this
and this
[0] == "\\":
436 code1
= _class_escape(source
, this
)
438 code1
= LITERAL
, ord(this
)
440 raise error
, "unexpected end of regular expression"
448 setappend((LITERAL
, ord("-")))
452 code2
= _class_escape(source
, this
)
454 code2
= LITERAL
, ord(this
)
455 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
456 raise error
, "bad character range"
460 raise error
, "bad character range"
461 setappend((RANGE
, (lo
, hi
)))
463 raise error
, "unexpected end of regular expression"
469 # XXX: <fl> should move set optimization to compiler!
470 if _len(set)==1 and set[0][0] is LITERAL
:
471 subpatternappend(set[0]) # optimization
472 elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
473 subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
475 # XXX: <fl> should add charmap optimization here
476 subpatternappend((IN
, set))
478 elif this
and this
[0] in REPEAT_CHARS
:
479 # repeat previous item
483 min, max = 0, MAXREPEAT
486 min, max = 1, MAXREPEAT
488 if source
.next
== "}":
489 subpatternappend((LITERAL
, ord(this
)))
492 min, max = 0, MAXREPEAT
494 while source
.next
in DIGITS
:
495 lo
= lo
+ source
.get()
497 while source
.next
in DIGITS
:
498 hi
= hi
+ sourceget()
501 if not sourcematch("}"):
502 subpatternappend((LITERAL
, ord(this
)))
510 raise error
, "bad repeat interval"
512 raise error
, "not supported"
513 # figure out which item to repeat
515 item
= subpattern
[-1:]
518 if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
519 raise error
, "nothing to repeat"
520 if item
[0][0] in REPEATCODES
:
521 raise error
, "multiple repeat"
523 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
525 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
528 subpatternappend((ANY
, None))
540 # named group: skip forward to end of name
545 raise error
, "unterminated name"
551 raise error
, "bad character in group name"
552 elif sourcematch("="):
553 # named backreference
558 raise error
, "unterminated name"
563 raise error
, "bad character in group name"
564 gid
= state
.groupdict
.get(name
)
566 raise error
, "unknown group name"
567 subpatternappend((GROUPREF
, gid
))
572 raise error
, "unexpected end of pattern"
573 raise error
, "unknown specifier: ?P%s" % char
574 elif sourcematch(":"):
575 # non-capturing group
577 elif sourcematch("#"):
580 if source
.next
is None or source
.next
== ")":
583 if not sourcematch(")"):
584 raise error
, "unbalanced parenthesis"
586 elif source
.next
in ASSERTCHARS
:
587 # lookahead assertions
591 if source
.next
not in LOOKBEHINDASSERTCHARS
:
592 raise error
, "syntax error"
593 dir = -1 # lookbehind
595 p
= _parse_sub(source
, state
)
596 if not sourcematch(")"):
597 raise error
, "unbalanced parenthesis"
599 subpatternappend((ASSERT
, (dir, p
)))
601 subpatternappend((ASSERT_NOT
, (dir, p
)))
603 elif sourcematch("("):
604 # conditional backreference group
609 raise error
, "unterminated name"
612 condname
= condname
+ char
615 condgroup
= state
.groupdict
.get(condname
)
616 if condgroup
is None:
617 raise error
, "unknown group name"
620 condgroup
= int(condname
)
622 raise error
, "bad character in group name"
625 if not source
.next
in FLAGS
:
626 raise error
, "unexpected end of pattern"
627 while source
.next
in FLAGS
:
628 state
.flags
= state
.flags | FLAGS
[sourceget()]
630 # parse group contents
635 group
= state
.opengroup(name
)
637 p
= _parse_sub_cond(source
, state
, condgroup
)
639 p
= _parse_sub(source
, state
)
640 if not sourcematch(")"):
641 raise error
, "unbalanced parenthesis"
642 if group
is not None:
643 state
.closegroup(group
)
644 subpatternappend((SUBPATTERN
, (group
, p
)))
649 raise error
, "unexpected end of pattern"
652 raise error
, "unknown extension"
655 subpatternappend((AT
, AT_BEGINNING
))
658 subpattern
.append((AT
, AT_END
))
660 elif this
and this
[0] == "\\":
661 code
= _escape(source
, this
, state
)
662 subpatternappend(code
)
665 raise error
, "parser error"
669 def parse(str, flags
=0, pattern
=None):
670 # parse 're' pattern into list of (opcode, argument) tuples
672 source
= Tokenizer(str)
676 pattern
.flags
= flags
679 p
= _parse_sub(source
, pattern
, 0)
683 raise error
, "unbalanced parenthesis"
685 raise error
, "bogus characters at end of regular expression"
687 if flags
& SRE_FLAG_DEBUG
:
690 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
691 # the VERBOSE flag was switched on inside the pattern. to be
692 # on the safe side, we'll parse the whole thing again...
693 return parse(str, p
.pattern
.flags
)
697 def parse_template(source
, pattern
):
698 # parse 're' replacement string into list of literals and
700 s
= Tokenizer(source
)
704 def literal(literal
, p
=p
, pappend
=a
):
705 if p
and p
[-1][0] is LITERAL
:
706 p
[-1] = LITERAL
, p
[-1][1] + literal
708 pappend((LITERAL
, literal
))
710 if type(sep
) is type(""):
717 break # end of replacement string
718 if this
and this
[0] == "\\":
727 raise error
, "unterminated group name"
732 raise error
, "bad group name"
736 raise error
, "negative group number"
739 raise error
, "bad character in group name"
741 index
= pattern
.groupindex
[name
]
743 raise IndexError, "unknown group name"
746 if s
.next
in OCTDIGITS
:
748 if s
.next
in OCTDIGITS
:
750 literal(makechar(int(this
[1:], 8) & 0xff))
755 if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
756 s
.next
in OCTDIGITS
):
759 literal(makechar(int(this
[1:], 8) & 0xff))
761 a((MARK
, int(this
[1:])))
764 this
= makechar(ESCAPES
[this
][1])
770 # convert template to groups and literals lists
773 groupsappend
= groups
.append
774 literals
= [None] * len(p
)
778 # literal[i] is already None
782 return groups
, literals
784 def expand_template(template
, match
):
786 sep
= match
.string
[:0]
787 groups
, literals
= template
788 literals
= literals
[:]
790 for index
, group
in groups
:
791 literals
[index
] = s
= g(group
)
793 raise error
, "unmatched group"
795 raise error
, "invalid group reference"
796 return sep
.join(literals
)