2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
17 from sre_constants
import *
25 SPECIAL_CHARS
= ".\\[{()*+?^$|"
28 DIGITS
= set("0123456789")
30 OCTDIGITS
= set("01234567")
31 HEXDIGITS
= set("0123456789abcdefABCDEF")
33 WHITESPACE
= set(" \t\n\r\v\f")
36 r
"\a": (LITERAL
, ord("\a")),
37 r
"\b": (LITERAL
, ord("\b")),
38 r
"\f": (LITERAL
, ord("\f")),
39 r
"\n": (LITERAL
, ord("\n")),
40 r
"\r": (LITERAL
, ord("\r")),
41 r
"\t": (LITERAL
, ord("\t")),
42 r
"\v": (LITERAL
, ord("\v")),
43 r
"\\": (LITERAL
, ord("\\"))
47 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
48 r
"\b": (AT
, AT_BOUNDARY
),
49 r
"\B": (AT
, AT_NON_BOUNDARY
),
50 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
51 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
52 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
53 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
54 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
55 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
56 r
"\Z": (AT
, AT_END_STRING
), # end of string
61 "i": SRE_FLAG_IGNORECASE
,
63 "m": SRE_FLAG_MULTILINE
,
65 "x": SRE_FLAG_VERBOSE
,
67 "t": SRE_FLAG_TEMPLATE
,
68 "u": SRE_FLAG_UNICODE
,
72 # master pattern object. keeps track of global attributes
78 def opengroup(self
, name
=None):
82 ogid
= self
.groupdict
.get(name
, None)
84 raise error
, ("redefinition of group name %s as group %d; "
85 "was group %d" % (repr(name
), gid
, ogid
))
86 self
.groupdict
[name
] = gid
89 def closegroup(self
, gid
):
91 def checkgroup(self
, gid
):
92 return gid
< self
.groups
and gid
not in self
.open
95 # a subpattern, in intermediate form
96 def __init__(self
, pattern
, data
=None):
97 self
.pattern
= pattern
102 def dump(self
, level
=0):
104 seqtypes
= type(()), type([])
105 for op
, av
in self
.data
:
106 print level
*" " + op
,; nl
= 0
111 print (level
+1)*" " + op
, a
117 print level
*" " + "or"
118 a
.dump(level
+1); nl
= 1
120 elif type(av
) in seqtypes
:
122 if isinstance(a
, SubPattern
):
124 a
.dump(level
+1); nl
= 1
131 return repr(self
.data
)
133 return len(self
.data
)
134 def __delitem__(self
, index
):
136 def __getitem__(self
, index
):
137 if isinstance(index
, slice):
138 return SubPattern(self
.pattern
, self
.data
[index
])
139 return self
.data
[index
]
140 def __setitem__(self
, index
, code
):
141 self
.data
[index
] = code
142 def __getslice__(self
, start
, stop
):
143 return SubPattern(self
.pattern
, self
.data
[start
:stop
])
144 def insert(self
, index
, code
):
145 self
.data
.insert(index
, code
)
146 def append(self
, code
):
147 self
.data
.append(code
)
149 # determine the width (min, max) for this subpattern
153 UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
154 REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
155 for op
, av
in self
.data
:
169 elif op
is SUBPATTERN
:
170 i
, j
= av
[1].getwidth()
173 elif op
in REPEATCODES
:
174 i
, j
= av
[2].getwidth()
175 lo
= lo
+ long(i
) * av
[0]
176 hi
= hi
+ long(j
) * av
[1]
177 elif op
in UNITCODES
:
182 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
186 def __init__(self
, string
):
191 if self
.index
>= len(self
.string
):
194 char
= self
.string
[self
.index
]
197 c
= self
.string
[self
.index
+ 1]
199 raise error
, "bogus escape (end of line)"
201 self
.index
= self
.index
+ len(char
)
203 def match(self
, char
, skip
=1):
204 if char
== self
.next
:
214 return self
.index
, self
.next
215 def seek(self
, index
):
216 self
.index
, self
.next
= index
219 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
222 return "0" <= char
<= "9"
225 # check that group name is a valid string
226 if not isident(name
[0]):
228 for char
in name
[1:]:
229 if not isident(char
) and not isdigit(char
):
233 def _class_escape(source
, escape
):
234 # handle escape code inside character class
235 code
= ESCAPES
.get(escape
)
238 code
= CATEGORIES
.get(escape
)
244 # hexadecimal escape (exactly two digits)
245 while source
.next
in HEXDIGITS
and len(escape
) < 4:
246 escape
= escape
+ source
.get()
249 raise error
, "bogus escape: %s" % repr("\\" + escape
)
250 return LITERAL
, int(escape
, 16) & 0xff
252 # octal escape (up to three digits)
253 while source
.next
in OCTDIGITS
and len(escape
) < 4:
254 escape
= escape
+ source
.get()
256 return LITERAL
, int(escape
, 8) & 0xff
258 raise error
, "bogus escape: %s" % repr(escape
)
260 return LITERAL
, ord(escape
[1])
263 raise error
, "bogus escape: %s" % repr(escape
)
265 def _escape(source
, escape
, state
):
266 # handle escape code in expression
267 code
= CATEGORIES
.get(escape
)
270 code
= ESCAPES
.get(escape
)
277 while source
.next
in HEXDIGITS
and len(escape
) < 4:
278 escape
= escape
+ source
.get()
281 return LITERAL
, int(escape
[2:], 16) & 0xff
284 while source
.next
in OCTDIGITS
and len(escape
) < 4:
285 escape
= escape
+ source
.get()
286 return LITERAL
, int(escape
[1:], 8) & 0xff
288 # octal escape *or* decimal group reference (sigh)
289 if source
.next
in DIGITS
:
290 escape
= escape
+ source
.get()
291 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
292 source
.next
in OCTDIGITS
):
293 # got three octal digits; this is an octal escape
294 escape
= escape
+ source
.get()
295 return LITERAL
, int(escape
[1:], 8) & 0xff
296 # not an octal escape, so this is a group reference
297 group
= int(escape
[1:])
298 if group
< state
.groups
:
299 if not state
.checkgroup(group
):
300 raise error
, "cannot refer to open group"
301 return GROUPREF
, group
304 return LITERAL
, ord(escape
[1])
307 raise error
, "bogus escape: %s" % repr(escape
)
309 def _parse_sub(source
, state
, nested
=1):
310 # parse an alternation: a|b|c
313 itemsappend
= items
.append
314 sourcematch
= source
.match
316 itemsappend(_parse(source
, state
))
321 if not source
.next
or sourcematch(")", 0):
324 raise error
, "pattern not properly closed"
329 subpattern
= SubPattern(state
)
330 subpatternappend
= subpattern
.append
332 # check if all items share a common prefix
340 elif item
[0] != prefix
:
343 # all subitems start with a common "prefix".
344 # move it out of the branch
347 subpatternappend(prefix
)
348 continue # check next one
351 # check if the branch can be replaced by a character set
353 if len(item
) != 1 or item
[0][0] != LITERAL
:
356 # we can store this as a character set instead of a
357 # branch (the compiler may optimize this even more)
359 setappend
= set.append
362 subpatternappend((IN
, set))
365 subpattern
.append((BRANCH
, (None, items
)))
368 def _parse_sub_cond(source
, state
, condgroup
):
369 item_yes
= _parse(source
, state
)
370 if source
.match("|"):
371 item_no
= _parse(source
, state
)
372 if source
.match("|"):
373 raise error
, "conditional backref with more than two branches"
376 if source
.next
and not source
.match(")", 0):
377 raise error
, "pattern not properly closed"
378 subpattern
= SubPattern(state
)
379 subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
382 _PATTERNENDERS
= set("|)")
383 _ASSERTCHARS
= set("=!<")
384 _LOOKBEHINDASSERTCHARS
= set("=!")
385 _REPEATCODES
= set([MIN_REPEAT
, MAX_REPEAT
])
387 def _parse(source
, state
):
388 # parse a simple pattern
389 subpattern
= SubPattern(state
)
391 # precompute constants into local variables
392 subpatternappend
= subpattern
.append
393 sourceget
= source
.get
394 sourcematch
= source
.match
396 PATTERNENDERS
= _PATTERNENDERS
397 ASSERTCHARS
= _ASSERTCHARS
398 LOOKBEHINDASSERTCHARS
= _LOOKBEHINDASSERTCHARS
399 REPEATCODES
= _REPEATCODES
403 if source
.next
in PATTERNENDERS
:
404 break # end of subpattern
407 break # end of pattern
409 if state
.flags
& SRE_FLAG_VERBOSE
:
410 # skip whitespace and comments
411 if this
in WHITESPACE
:
416 if this
in (None, "\n"):
420 if this
and this
[0] not in SPECIAL_CHARS
:
421 subpatternappend((LITERAL
, ord(this
)))
426 setappend
= set.append
427 ## if sourcematch(":"):
428 ## pass # handle character classes
430 setappend((NEGATE
, None))
431 # check remaining characters
435 if this
== "]" and set != start
:
437 elif this
and this
[0] == "\\":
438 code1
= _class_escape(source
, this
)
440 code1
= LITERAL
, ord(this
)
442 raise error
, "unexpected end of regular expression"
450 setappend((LITERAL
, ord("-")))
454 code2
= _class_escape(source
, this
)
456 code2
= LITERAL
, ord(this
)
457 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
458 raise error
, "bad character range"
462 raise error
, "bad character range"
463 setappend((RANGE
, (lo
, hi
)))
465 raise error
, "unexpected end of regular expression"
471 # XXX: <fl> should move set optimization to compiler!
472 if _len(set)==1 and set[0][0] is LITERAL
:
473 subpatternappend(set[0]) # optimization
474 elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
475 subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
477 # XXX: <fl> should add charmap optimization here
478 subpatternappend((IN
, set))
480 elif this
and this
[0] in REPEAT_CHARS
:
481 # repeat previous item
485 min, max = 0, MAXREPEAT
488 min, max = 1, MAXREPEAT
490 if source
.next
== "}":
491 subpatternappend((LITERAL
, ord(this
)))
494 min, max = 0, MAXREPEAT
496 while source
.next
in DIGITS
:
497 lo
= lo
+ source
.get()
499 while source
.next
in DIGITS
:
500 hi
= hi
+ sourceget()
503 if not sourcematch("}"):
504 subpatternappend((LITERAL
, ord(this
)))
512 raise error
, "bad repeat interval"
514 raise error
, "not supported"
515 # figure out which item to repeat
517 item
= subpattern
[-1:]
520 if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
521 raise error
, "nothing to repeat"
522 if item
[0][0] in REPEATCODES
:
523 raise error
, "multiple repeat"
525 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
527 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
530 subpatternappend((ANY
, None))
542 # named group: skip forward to end of name
547 raise error
, "unterminated name"
553 raise error
, "bad character in group name"
554 elif sourcematch("="):
555 # named backreference
560 raise error
, "unterminated name"
565 raise error
, "bad character in group name"
566 gid
= state
.groupdict
.get(name
)
568 raise error
, "unknown group name"
569 subpatternappend((GROUPREF
, gid
))
574 raise error
, "unexpected end of pattern"
575 raise error
, "unknown specifier: ?P%s" % char
576 elif sourcematch(":"):
577 # non-capturing group
579 elif sourcematch("#"):
582 if source
.next
is None or source
.next
== ")":
585 if not sourcematch(")"):
586 raise error
, "unbalanced parenthesis"
588 elif source
.next
in ASSERTCHARS
:
589 # lookahead assertions
593 if source
.next
not in LOOKBEHINDASSERTCHARS
:
594 raise error
, "syntax error"
595 dir = -1 # lookbehind
597 p
= _parse_sub(source
, state
)
598 if not sourcematch(")"):
599 raise error
, "unbalanced parenthesis"
601 subpatternappend((ASSERT
, (dir, p
)))
603 subpatternappend((ASSERT_NOT
, (dir, p
)))
605 elif sourcematch("("):
606 # conditional backreference group
611 raise error
, "unterminated name"
614 condname
= condname
+ char
617 condgroup
= state
.groupdict
.get(condname
)
618 if condgroup
is None:
619 raise error
, "unknown group name"
622 condgroup
= int(condname
)
624 raise error
, "bad character in group name"
627 if not source
.next
in FLAGS
:
628 raise error
, "unexpected end of pattern"
629 while source
.next
in FLAGS
:
630 state
.flags
= state
.flags | FLAGS
[sourceget()]
632 # parse group contents
637 group
= state
.opengroup(name
)
639 p
= _parse_sub_cond(source
, state
, condgroup
)
641 p
= _parse_sub(source
, state
)
642 if not sourcematch(")"):
643 raise error
, "unbalanced parenthesis"
644 if group
is not None:
645 state
.closegroup(group
)
646 subpatternappend((SUBPATTERN
, (group
, p
)))
651 raise error
, "unexpected end of pattern"
654 raise error
, "unknown extension"
657 subpatternappend((AT
, AT_BEGINNING
))
660 subpattern
.append((AT
, AT_END
))
662 elif this
and this
[0] == "\\":
663 code
= _escape(source
, this
, state
)
664 subpatternappend(code
)
667 raise error
, "parser error"
671 def parse(str, flags
=0, pattern
=None):
672 # parse 're' pattern into list of (opcode, argument) tuples
674 source
= Tokenizer(str)
678 pattern
.flags
= flags
681 p
= _parse_sub(source
, pattern
, 0)
685 raise error
, "unbalanced parenthesis"
687 raise error
, "bogus characters at end of regular expression"
689 if flags
& SRE_FLAG_DEBUG
:
692 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
693 # the VERBOSE flag was switched on inside the pattern. to be
694 # on the safe side, we'll parse the whole thing again...
695 return parse(str, p
.pattern
.flags
)
699 def parse_template(source
, pattern
):
700 # parse 're' replacement string into list of literals and
702 s
= Tokenizer(source
)
706 def literal(literal
, p
=p
, pappend
=a
):
707 if p
and p
[-1][0] is LITERAL
:
708 p
[-1] = LITERAL
, p
[-1][1] + literal
710 pappend((LITERAL
, literal
))
712 if type(sep
) is type(""):
719 break # end of replacement string
720 if this
and this
[0] == "\\":
729 raise error
, "unterminated group name"
734 raise error
, "bad group name"
738 raise error
, "negative group number"
741 raise error
, "bad character in group name"
743 index
= pattern
.groupindex
[name
]
745 raise IndexError, "unknown group name"
748 if s
.next
in OCTDIGITS
:
750 if s
.next
in OCTDIGITS
:
752 literal(makechar(int(this
[1:], 8) & 0xff))
757 if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
758 s
.next
in OCTDIGITS
):
761 literal(makechar(int(this
[1:], 8) & 0xff))
763 a((MARK
, int(this
[1:])))
766 this
= makechar(ESCAPES
[this
][1])
772 # convert template to groups and literals lists
775 groupsappend
= groups
.append
776 literals
= [None] * len(p
)
780 # literal[i] is already None
784 return groups
, literals
786 def expand_template(template
, match
):
788 sep
= match
.string
[:0]
789 groups
, literals
= template
790 literals
= literals
[:]
792 for index
, group
in groups
:
793 literals
[index
] = s
= g(group
)
795 raise error
, "unmatched group"
797 raise error
, "invalid group reference"
798 return sep
.join(literals
)