2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
17 from sre_constants
import *
19 SPECIAL_CHARS
= ".\\[{()*+?^$|"
22 DIGITS
= set("0123456789")
24 OCTDIGITS
= set("01234567")
25 HEXDIGITS
= set("0123456789abcdefABCDEF")
27 WHITESPACE
= set(" \t\n\r\v\f")
30 r
"\a": (LITERAL
, ord("\a")),
31 r
"\b": (LITERAL
, ord("\b")),
32 r
"\f": (LITERAL
, ord("\f")),
33 r
"\n": (LITERAL
, ord("\n")),
34 r
"\r": (LITERAL
, ord("\r")),
35 r
"\t": (LITERAL
, ord("\t")),
36 r
"\v": (LITERAL
, ord("\v")),
37 r
"\\": (LITERAL
, ord("\\"))
41 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
42 r
"\b": (AT
, AT_BOUNDARY
),
43 r
"\B": (AT
, AT_NON_BOUNDARY
),
44 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
45 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
46 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
47 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
48 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
49 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
50 r
"\Z": (AT
, AT_END_STRING
), # end of string
55 "i": SRE_FLAG_IGNORECASE
,
57 "m": SRE_FLAG_MULTILINE
,
59 "x": SRE_FLAG_VERBOSE
,
61 "t": SRE_FLAG_TEMPLATE
,
62 "u": SRE_FLAG_UNICODE
,
66 # master pattern object. keeps track of global attributes
72 def opengroup(self
, name
=None):
76 ogid
= self
.groupdict
.get(name
, None)
78 raise error
, ("redefinition of group name %s as group %d; "
79 "was group %d" % (repr(name
), gid
, ogid
))
80 self
.groupdict
[name
] = gid
83 def closegroup(self
, gid
):
85 def checkgroup(self
, gid
):
86 return gid
< self
.groups
and gid
not in self
.open
89 # a subpattern, in intermediate form
90 def __init__(self
, pattern
, data
=None):
91 self
.pattern
= pattern
96 def dump(self
, level
=0):
98 seqtypes
= type(()), type([])
99 for op
, av
in self
.data
:
100 print level
*" " + op
,; nl
= 0
105 print (level
+1)*" " + op
, a
111 print level
*" " + "or"
112 a
.dump(level
+1); nl
= 1
114 elif type(av
) in seqtypes
:
116 if isinstance(a
, SubPattern
):
118 a
.dump(level
+1); nl
= 1
125 return repr(self
.data
)
127 return len(self
.data
)
128 def __delitem__(self
, index
):
130 def __getitem__(self
, index
):
131 if isinstance(index
, slice):
132 return SubPattern(self
.pattern
, self
.data
[index
])
133 return self
.data
[index
]
134 def __setitem__(self
, index
, code
):
135 self
.data
[index
] = code
136 def insert(self
, index
, code
):
137 self
.data
.insert(index
, code
)
138 def append(self
, code
):
139 self
.data
.append(code
)
141 # determine the width (min, max) for this subpattern
145 UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
146 REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
147 for op
, av
in self
.data
:
161 elif op
is SUBPATTERN
:
162 i
, j
= av
[1].getwidth()
165 elif op
in REPEATCODES
:
166 i
, j
= av
[2].getwidth()
167 lo
= lo
+ long(i
) * av
[0]
168 hi
= hi
+ long(j
) * av
[1]
169 elif op
in UNITCODES
:
174 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
178 def __init__(self
, string
):
183 if self
.index
>= len(self
.string
):
186 char
= self
.string
[self
.index
]
189 c
= self
.string
[self
.index
+ 1]
191 raise error
, "bogus escape (end of line)"
193 self
.index
= self
.index
+ len(char
)
195 def match(self
, char
, skip
=1):
196 if char
== self
.next
:
206 return self
.index
, self
.next
207 def seek(self
, index
):
208 self
.index
, self
.next
= index
211 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
214 return "0" <= char
<= "9"
217 # check that group name is a valid string
218 if not isident(name
[0]):
220 for char
in name
[1:]:
221 if not isident(char
) and not isdigit(char
):
225 def _class_escape(source
, escape
):
226 # handle escape code inside character class
227 code
= ESCAPES
.get(escape
)
230 code
= CATEGORIES
.get(escape
)
236 # hexadecimal escape (exactly two digits)
237 while source
.next
in HEXDIGITS
and len(escape
) < 4:
238 escape
= escape
+ source
.get()
241 raise error
, "bogus escape: %s" % repr("\\" + escape
)
242 return LITERAL
, int(escape
, 16) & 0xff
244 # octal escape (up to three digits)
245 while source
.next
in OCTDIGITS
and len(escape
) < 4:
246 escape
= escape
+ source
.get()
248 return LITERAL
, int(escape
, 8) & 0xff
250 raise error
, "bogus escape: %s" % repr(escape
)
252 return LITERAL
, ord(escape
[1])
255 raise error
, "bogus escape: %s" % repr(escape
)
257 def _escape(source
, escape
, state
):
258 # handle escape code in expression
259 code
= CATEGORIES
.get(escape
)
262 code
= ESCAPES
.get(escape
)
269 while source
.next
in HEXDIGITS
and len(escape
) < 4:
270 escape
= escape
+ source
.get()
273 return LITERAL
, int(escape
[2:], 16) & 0xff
276 while source
.next
in OCTDIGITS
and len(escape
) < 4:
277 escape
= escape
+ source
.get()
278 return LITERAL
, int(escape
[1:], 8) & 0xff
280 # octal escape *or* decimal group reference (sigh)
281 if source
.next
in DIGITS
:
282 escape
= escape
+ source
.get()
283 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
284 source
.next
in OCTDIGITS
):
285 # got three octal digits; this is an octal escape
286 escape
= escape
+ source
.get()
287 return LITERAL
, int(escape
[1:], 8) & 0xff
288 # not an octal escape, so this is a group reference
289 group
= int(escape
[1:])
290 if group
< state
.groups
:
291 if not state
.checkgroup(group
):
292 raise error
, "cannot refer to open group"
293 return GROUPREF
, group
296 return LITERAL
, ord(escape
[1])
299 raise error
, "bogus escape: %s" % repr(escape
)
301 def _parse_sub(source
, state
, nested
=1):
302 # parse an alternation: a|b|c
305 itemsappend
= items
.append
306 sourcematch
= source
.match
308 itemsappend(_parse(source
, state
))
313 if not source
.next
or sourcematch(")", 0):
316 raise error
, "pattern not properly closed"
321 subpattern
= SubPattern(state
)
322 subpatternappend
= subpattern
.append
324 # check if all items share a common prefix
332 elif item
[0] != prefix
:
335 # all subitems start with a common "prefix".
336 # move it out of the branch
339 subpatternappend(prefix
)
340 continue # check next one
343 # check if the branch can be replaced by a character set
345 if len(item
) != 1 or item
[0][0] != LITERAL
:
348 # we can store this as a character set instead of a
349 # branch (the compiler may optimize this even more)
351 setappend
= set.append
354 subpatternappend((IN
, set))
357 subpattern
.append((BRANCH
, (None, items
)))
360 def _parse_sub_cond(source
, state
, condgroup
):
361 item_yes
= _parse(source
, state
)
362 if source
.match("|"):
363 item_no
= _parse(source
, state
)
364 if source
.match("|"):
365 raise error
, "conditional backref with more than two branches"
368 if source
.next
and not source
.match(")", 0):
369 raise error
, "pattern not properly closed"
370 subpattern
= SubPattern(state
)
371 subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
374 _PATTERNENDERS
= set("|)")
375 _ASSERTCHARS
= set("=!<")
376 _LOOKBEHINDASSERTCHARS
= set("=!")
377 _REPEATCODES
= set([MIN_REPEAT
, MAX_REPEAT
])
379 def _parse(source
, state
):
380 # parse a simple pattern
381 subpattern
= SubPattern(state
)
383 # precompute constants into local variables
384 subpatternappend
= subpattern
.append
385 sourceget
= source
.get
386 sourcematch
= source
.match
388 PATTERNENDERS
= _PATTERNENDERS
389 ASSERTCHARS
= _ASSERTCHARS
390 LOOKBEHINDASSERTCHARS
= _LOOKBEHINDASSERTCHARS
391 REPEATCODES
= _REPEATCODES
395 if source
.next
in PATTERNENDERS
:
396 break # end of subpattern
399 break # end of pattern
401 if state
.flags
& SRE_FLAG_VERBOSE
:
402 # skip whitespace and comments
403 if this
in WHITESPACE
:
408 if this
in (None, "\n"):
412 if this
and this
[0] not in SPECIAL_CHARS
:
413 subpatternappend((LITERAL
, ord(this
)))
418 setappend
= set.append
419 ## if sourcematch(":"):
420 ## pass # handle character classes
422 setappend((NEGATE
, None))
423 # check remaining characters
427 if this
== "]" and set != start
:
429 elif this
and this
[0] == "\\":
430 code1
= _class_escape(source
, this
)
432 code1
= LITERAL
, ord(this
)
434 raise error
, "unexpected end of regular expression"
442 setappend((LITERAL
, ord("-")))
446 code2
= _class_escape(source
, this
)
448 code2
= LITERAL
, ord(this
)
449 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
450 raise error
, "bad character range"
454 raise error
, "bad character range"
455 setappend((RANGE
, (lo
, hi
)))
457 raise error
, "unexpected end of regular expression"
463 # XXX: <fl> should move set optimization to compiler!
464 if _len(set)==1 and set[0][0] is LITERAL
:
465 subpatternappend(set[0]) # optimization
466 elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
467 subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
469 # XXX: <fl> should add charmap optimization here
470 subpatternappend((IN
, set))
472 elif this
and this
[0] in REPEAT_CHARS
:
473 # repeat previous item
477 min, max = 0, MAXREPEAT
480 min, max = 1, MAXREPEAT
482 if source
.next
== "}":
483 subpatternappend((LITERAL
, ord(this
)))
486 min, max = 0, MAXREPEAT
488 while source
.next
in DIGITS
:
489 lo
= lo
+ source
.get()
491 while source
.next
in DIGITS
:
492 hi
= hi
+ sourceget()
495 if not sourcematch("}"):
496 subpatternappend((LITERAL
, ord(this
)))
504 raise error
, "bad repeat interval"
506 raise error
, "not supported"
507 # figure out which item to repeat
509 item
= subpattern
[-1:]
512 if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
513 raise error
, "nothing to repeat"
514 if item
[0][0] in REPEATCODES
:
515 raise error
, "multiple repeat"
517 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
519 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
522 subpatternappend((ANY
, None))
534 # named group: skip forward to end of name
539 raise error
, "unterminated name"
545 raise error
, "bad character in group name"
546 elif sourcematch("="):
547 # named backreference
552 raise error
, "unterminated name"
557 raise error
, "bad character in group name"
558 gid
= state
.groupdict
.get(name
)
560 raise error
, "unknown group name"
561 subpatternappend((GROUPREF
, gid
))
566 raise error
, "unexpected end of pattern"
567 raise error
, "unknown specifier: ?P%s" % char
568 elif sourcematch(":"):
569 # non-capturing group
571 elif sourcematch("#"):
574 if source
.next
is None or source
.next
== ")":
577 if not sourcematch(")"):
578 raise error
, "unbalanced parenthesis"
580 elif source
.next
in ASSERTCHARS
:
581 # lookahead assertions
585 if source
.next
not in LOOKBEHINDASSERTCHARS
:
586 raise error
, "syntax error"
587 dir = -1 # lookbehind
589 p
= _parse_sub(source
, state
)
590 if not sourcematch(")"):
591 raise error
, "unbalanced parenthesis"
593 subpatternappend((ASSERT
, (dir, p
)))
595 subpatternappend((ASSERT_NOT
, (dir, p
)))
597 elif sourcematch("("):
598 # conditional backreference group
603 raise error
, "unterminated name"
606 condname
= condname
+ char
609 condgroup
= state
.groupdict
.get(condname
)
610 if condgroup
is None:
611 raise error
, "unknown group name"
614 condgroup
= int(condname
)
616 raise error
, "bad character in group name"
619 if not source
.next
in FLAGS
:
620 raise error
, "unexpected end of pattern"
621 while source
.next
in FLAGS
:
622 state
.flags
= state
.flags | FLAGS
[sourceget()]
624 # parse group contents
629 group
= state
.opengroup(name
)
631 p
= _parse_sub_cond(source
, state
, condgroup
)
633 p
= _parse_sub(source
, state
)
634 if not sourcematch(")"):
635 raise error
, "unbalanced parenthesis"
636 if group
is not None:
637 state
.closegroup(group
)
638 subpatternappend((SUBPATTERN
, (group
, p
)))
643 raise error
, "unexpected end of pattern"
646 raise error
, "unknown extension"
649 subpatternappend((AT
, AT_BEGINNING
))
652 subpattern
.append((AT
, AT_END
))
654 elif this
and this
[0] == "\\":
655 code
= _escape(source
, this
, state
)
656 subpatternappend(code
)
659 raise error
, "parser error"
663 def parse(str, flags
=0, pattern
=None):
664 # parse 're' pattern into list of (opcode, argument) tuples
666 source
= Tokenizer(str)
670 pattern
.flags
= flags
673 p
= _parse_sub(source
, pattern
, 0)
677 raise error
, "unbalanced parenthesis"
679 raise error
, "bogus characters at end of regular expression"
681 if flags
& SRE_FLAG_DEBUG
:
684 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
685 # the VERBOSE flag was switched on inside the pattern. to be
686 # on the safe side, we'll parse the whole thing again...
687 return parse(str, p
.pattern
.flags
)
691 def parse_template(source
, pattern
):
692 # parse 're' replacement string into list of literals and
694 s
= Tokenizer(source
)
698 def literal(literal
, p
=p
, pappend
=a
):
699 if p
and p
[-1][0] is LITERAL
:
700 p
[-1] = LITERAL
, p
[-1][1] + literal
702 pappend((LITERAL
, literal
))
704 if type(sep
) is type(""):
711 break # end of replacement string
712 if this
and this
[0] == "\\":
721 raise error
, "unterminated group name"
726 raise error
, "bad group name"
730 raise error
, "negative group number"
733 raise error
, "bad character in group name"
735 index
= pattern
.groupindex
[name
]
737 raise IndexError, "unknown group name"
740 if s
.next
in OCTDIGITS
:
742 if s
.next
in OCTDIGITS
:
744 literal(makechar(int(this
[1:], 8) & 0xff))
749 if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
750 s
.next
in OCTDIGITS
):
753 literal(makechar(int(this
[1:], 8) & 0xff))
755 a((MARK
, int(this
[1:])))
758 this
= makechar(ESCAPES
[this
][1])
764 # convert template to groups and literals lists
767 groupsappend
= groups
.append
768 literals
= [None] * len(p
)
772 # literal[i] is already None
776 return groups
, literals
778 def expand_template(template
, match
):
780 sep
= match
.string
[:0]
781 groups
, literals
= template
782 literals
= literals
[:]
784 for index
, group
in groups
:
785 literals
[index
] = s
= g(group
)
787 raise error
, "unmatched group"
789 raise error
, "invalid group reference"
790 return sep
.join(literals
)