2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
17 from sre_constants
import *
19 SPECIAL_CHARS
= ".\\[{()*+?^$|"
22 DIGITS
= set("0123456789")
24 OCTDIGITS
= set("01234567")
25 HEXDIGITS
= set("0123456789abcdefABCDEF")
27 WHITESPACE
= set(" \t\n\r\v\f")
30 r
"\a": (LITERAL
, ord("\a")),
31 r
"\b": (LITERAL
, ord("\b")),
32 r
"\f": (LITERAL
, ord("\f")),
33 r
"\n": (LITERAL
, ord("\n")),
34 r
"\r": (LITERAL
, ord("\r")),
35 r
"\t": (LITERAL
, ord("\t")),
36 r
"\v": (LITERAL
, ord("\v")),
37 r
"\\": (LITERAL
, ord("\\"))
41 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
42 r
"\b": (AT
, AT_BOUNDARY
),
43 r
"\B": (AT
, AT_NON_BOUNDARY
),
44 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
45 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
46 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
47 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
48 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
49 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
50 r
"\Z": (AT
, AT_END_STRING
), # end of string
55 "i": SRE_FLAG_IGNORECASE
,
57 "m": SRE_FLAG_MULTILINE
,
59 "x": SRE_FLAG_VERBOSE
,
62 "t": SRE_FLAG_TEMPLATE
,
63 "u": SRE_FLAG_UNICODE
,
67 # master pattern object. keeps track of global attributes
73 def opengroup(self
, name
=None):
77 ogid
= self
.groupdict
.get(name
, None)
79 raise error("redefinition of group name %s as group %d; "
80 "was group %d" % (repr(name
), gid
, ogid
))
81 self
.groupdict
[name
] = gid
84 def closegroup(self
, gid
):
86 def checkgroup(self
, gid
):
87 return gid
< self
.groups
and gid
not in self
.open
90 # a subpattern, in intermediate form
91 def __init__(self
, pattern
, data
=None):
92 self
.pattern
= pattern
97 def dump(self
, level
=0):
99 seqtypes
= (tuple, list)
100 for op
, av
in self
.data
:
101 print(level
*" " + op
, end
=' '); nl
= 0
106 print((level
+1)*" " + op
, a
)
112 print(level
*" " + "or")
113 a
.dump(level
+1); nl
= 1
115 elif isinstance(av
, seqtypes
):
117 if isinstance(a
, SubPattern
):
119 a
.dump(level
+1); nl
= 1
121 print(a
, end
=' ') ; nl
= 0
123 print(av
, end
=' ') ; nl
= 0
126 return repr(self
.data
)
128 return len(self
.data
)
129 def __delitem__(self
, index
):
131 def __getitem__(self
, index
):
132 if isinstance(index
, slice):
133 return SubPattern(self
.pattern
, self
.data
[index
])
134 return self
.data
[index
]
135 def __setitem__(self
, index
, code
):
136 self
.data
[index
] = code
137 def insert(self
, index
, code
):
138 self
.data
.insert(index
, code
)
139 def append(self
, code
):
140 self
.data
.append(code
)
142 # determine the width (min, max) for this subpattern
146 UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
147 REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
148 for op
, av
in self
.data
:
162 elif op
is SUBPATTERN
:
163 i
, j
= av
[1].getwidth()
166 elif op
in REPEATCODES
:
167 i
, j
= av
[2].getwidth()
168 lo
= lo
+ int(i
) * av
[0]
169 hi
= hi
+ int(j
) * av
[1]
170 elif op
in UNITCODES
:
175 self
.width
= int(min(lo
, sys
.maxsize
)), int(min(hi
, sys
.maxsize
))
179 def __init__(self
, string
):
184 if self
.index
>= len(self
.string
):
187 char
= self
.string
[self
.index
:self
.index
+1]
188 # Special case for the str8, since indexing returns a integer
189 # XXX This is only needed for test_bug_926075 in test_re.py
190 if char
and isinstance(char
, bytes
):
194 c
= self
.string
[self
.index
+ 1]
196 raise error("bogus escape (end of line)")
197 if isinstance(self
.string
, bytes
):
200 self
.index
= self
.index
+ len(char
)
202 def match(self
, char
, skip
=1):
203 if char
== self
.next
:
213 return self
.index
, self
.next
214 def seek(self
, index
):
215 self
.index
, self
.next
= index
218 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
221 return "0" <= char
<= "9"
224 # check that group name is a valid string
225 if not isident(name
[0]):
227 for char
in name
[1:]:
228 if not isident(char
) and not isdigit(char
):
232 def _class_escape(source
, escape
):
233 # handle escape code inside character class
234 code
= ESCAPES
.get(escape
)
237 code
= CATEGORIES
.get(escape
)
243 # hexadecimal escape (exactly two digits)
244 while source
.next
in HEXDIGITS
and len(escape
) < 4:
245 escape
= escape
+ source
.get()
248 raise error("bogus escape: %s" % repr("\\" + escape
))
249 return LITERAL
, int(escape
, 16) & 0xff
251 # octal escape (up to three digits)
252 while source
.next
in OCTDIGITS
and len(escape
) < 4:
253 escape
= escape
+ source
.get()
255 return LITERAL
, int(escape
, 8) & 0xff
257 raise error("bogus escape: %s" % repr(escape
))
259 return LITERAL
, ord(escape
[1])
262 raise error("bogus escape: %s" % repr(escape
))
264 def _escape(source
, escape
, state
):
265 # handle escape code in expression
266 code
= CATEGORIES
.get(escape
)
269 code
= ESCAPES
.get(escape
)
276 while source
.next
in HEXDIGITS
and len(escape
) < 4:
277 escape
= escape
+ source
.get()
280 return LITERAL
, int(escape
[2:], 16) & 0xff
283 while source
.next
in OCTDIGITS
and len(escape
) < 4:
284 escape
= escape
+ source
.get()
285 return LITERAL
, int(escape
[1:], 8) & 0xff
287 # octal escape *or* decimal group reference (sigh)
288 if source
.next
in DIGITS
:
289 escape
= escape
+ source
.get()
290 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
291 source
.next
in OCTDIGITS
):
292 # got three octal digits; this is an octal escape
293 escape
= escape
+ source
.get()
294 return LITERAL
, int(escape
[1:], 8) & 0xff
295 # not an octal escape, so this is a group reference
296 group
= int(escape
[1:])
297 if group
< state
.groups
:
298 if not state
.checkgroup(group
):
299 raise error("cannot refer to open group")
300 return GROUPREF
, group
303 return LITERAL
, ord(escape
[1])
306 raise error("bogus escape: %s" % repr(escape
))
308 def _parse_sub(source
, state
, nested
=1):
309 # parse an alternation: a|b|c
312 itemsappend
= items
.append
313 sourcematch
= source
.match
315 itemsappend(_parse(source
, state
))
320 if not source
.next
or sourcematch(")", 0):
323 raise error("pattern not properly closed")
328 subpattern
= SubPattern(state
)
329 subpatternappend
= subpattern
.append
331 # check if all items share a common prefix
339 elif item
[0] != prefix
:
342 # all subitems start with a common "prefix".
343 # move it out of the branch
346 subpatternappend(prefix
)
347 continue # check next one
350 # check if the branch can be replaced by a character set
352 if len(item
) != 1 or item
[0][0] != LITERAL
:
355 # we can store this as a character set instead of a
356 # branch (the compiler may optimize this even more)
358 setappend
= set.append
361 subpatternappend((IN
, set))
364 subpattern
.append((BRANCH
, (None, items
)))
367 def _parse_sub_cond(source
, state
, condgroup
):
368 item_yes
= _parse(source
, state
)
369 if source
.match("|"):
370 item_no
= _parse(source
, state
)
371 if source
.match("|"):
372 raise error("conditional backref with more than two branches")
375 if source
.next
and not source
.match(")", 0):
376 raise error("pattern not properly closed")
377 subpattern
= SubPattern(state
)
378 subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
381 _PATTERNENDERS
= set("|)")
382 _ASSERTCHARS
= set("=!<")
383 _LOOKBEHINDASSERTCHARS
= set("=!")
384 _REPEATCODES
= set([MIN_REPEAT
, MAX_REPEAT
])
386 def _parse(source
, state
):
387 # parse a simple pattern
388 subpattern
= SubPattern(state
)
390 # precompute constants into local variables
391 subpatternappend
= subpattern
.append
392 sourceget
= source
.get
393 sourcematch
= source
.match
395 PATTERNENDERS
= _PATTERNENDERS
396 ASSERTCHARS
= _ASSERTCHARS
397 LOOKBEHINDASSERTCHARS
= _LOOKBEHINDASSERTCHARS
398 REPEATCODES
= _REPEATCODES
402 if source
.next
in PATTERNENDERS
:
403 break # end of subpattern
406 break # end of pattern
408 if state
.flags
& SRE_FLAG_VERBOSE
:
409 # skip whitespace and comments
410 if this
in WHITESPACE
:
415 if this
in (None, "\n"):
419 if this
and this
[0] not in SPECIAL_CHARS
:
420 subpatternappend((LITERAL
, ord(this
)))
425 setappend
= set.append
426 ## if sourcematch(":"):
427 ## pass # handle character classes
429 setappend((NEGATE
, None))
430 # check remaining characters
434 if this
== "]" and set != start
:
436 elif this
and this
[0] == "\\":
437 code1
= _class_escape(source
, this
)
439 code1
= LITERAL
, ord(this
)
441 raise error("unexpected end of regular expression")
449 setappend((LITERAL
, ord("-")))
453 code2
= _class_escape(source
, this
)
455 code2
= LITERAL
, ord(this
)
456 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
457 raise error("bad character range")
461 raise error("bad character range")
462 setappend((RANGE
, (lo
, hi
)))
464 raise error("unexpected end of regular expression")
470 # XXX: <fl> should move set optimization to compiler!
471 if _len(set)==1 and set[0][0] is LITERAL
:
472 subpatternappend(set[0]) # optimization
473 elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
474 subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
476 # XXX: <fl> should add charmap optimization here
477 subpatternappend((IN
, set))
479 elif this
and this
[0] in REPEAT_CHARS
:
480 # repeat previous item
484 min, max = 0, MAXREPEAT
487 min, max = 1, MAXREPEAT
489 if source
.next
== "}":
490 subpatternappend((LITERAL
, ord(this
)))
493 min, max = 0, MAXREPEAT
495 while source
.next
in DIGITS
:
496 lo
= lo
+ source
.get()
498 while source
.next
in DIGITS
:
499 hi
= hi
+ sourceget()
502 if not sourcematch("}"):
503 subpatternappend((LITERAL
, ord(this
)))
511 raise error("bad repeat interval")
513 raise error("not supported")
514 # figure out which item to repeat
516 item
= subpattern
[-1:]
519 if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
520 raise error("nothing to repeat")
521 if item
[0][0] in REPEATCODES
:
522 raise error("multiple repeat")
524 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
526 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
529 subpatternappend((ANY
, None))
541 # named group: skip forward to end of name
546 raise error("unterminated name")
552 raise error("bad character in group name")
553 elif sourcematch("="):
554 # named backreference
559 raise error("unterminated name")
564 raise error("bad character in group name")
565 gid
= state
.groupdict
.get(name
)
567 raise error("unknown group name")
568 subpatternappend((GROUPREF
, gid
))
573 raise error("unexpected end of pattern")
574 raise error("unknown specifier: ?P%s" % char
)
575 elif sourcematch(":"):
576 # non-capturing group
578 elif sourcematch("#"):
581 if source
.next
is None or source
.next
== ")":
584 if not sourcematch(")"):
585 raise error("unbalanced parenthesis")
587 elif source
.next
in ASSERTCHARS
:
588 # lookahead assertions
592 if source
.next
not in LOOKBEHINDASSERTCHARS
:
593 raise error("syntax error")
594 dir = -1 # lookbehind
596 p
= _parse_sub(source
, state
)
597 if not sourcematch(")"):
598 raise error("unbalanced parenthesis")
600 subpatternappend((ASSERT
, (dir, p
)))
602 subpatternappend((ASSERT_NOT
, (dir, p
)))
604 elif sourcematch("("):
605 # conditional backreference group
610 raise error("unterminated name")
613 condname
= condname
+ char
616 condgroup
= state
.groupdict
.get(condname
)
617 if condgroup
is None:
618 raise error("unknown group name")
621 condgroup
= int(condname
)
623 raise error("bad character in group name")
626 if not source
.next
in FLAGS
:
627 raise error("unexpected end of pattern")
628 while source
.next
in FLAGS
:
629 state
.flags
= state
.flags | FLAGS
[sourceget()]
631 # parse group contents
636 group
= state
.opengroup(name
)
638 p
= _parse_sub_cond(source
, state
, condgroup
)
640 p
= _parse_sub(source
, state
)
641 if not sourcematch(")"):
642 raise error("unbalanced parenthesis")
643 if group
is not None:
644 state
.closegroup(group
)
645 subpatternappend((SUBPATTERN
, (group
, p
)))
650 raise error("unexpected end of pattern")
653 raise error("unknown extension")
656 subpatternappend((AT
, AT_BEGINNING
))
659 subpattern
.append((AT
, AT_END
))
661 elif this
and this
[0] == "\\":
662 code
= _escape(source
, this
, state
)
663 subpatternappend(code
)
666 raise error("parser error")
670 def fix_flags(src
, flags
):
671 # Check and fix flags according to the type of pattern (str or bytes)
672 if isinstance(src
, str):
673 if not flags
& SRE_FLAG_ASCII
:
674 flags |
= SRE_FLAG_UNICODE
675 elif flags
& SRE_FLAG_UNICODE
:
676 raise ValueError("ASCII and UNICODE flags are incompatible")
678 if flags
& SRE_FLAG_UNICODE
:
679 raise ValueError("can't use UNICODE flag with a bytes pattern")
682 def parse(str, flags
=0, pattern
=None):
683 # parse 're' pattern into list of (opcode, argument) tuples
685 source
= Tokenizer(str)
689 pattern
.flags
= flags
692 p
= _parse_sub(source
, pattern
, 0)
693 p
.pattern
.flags
= fix_flags(str, p
.pattern
.flags
)
697 raise error("unbalanced parenthesis")
699 raise error("bogus characters at end of regular expression")
701 if flags
& SRE_FLAG_DEBUG
:
704 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
705 # the VERBOSE flag was switched on inside the pattern. to be
706 # on the safe side, we'll parse the whole thing again...
707 return parse(str, p
.pattern
.flags
)
711 def parse_template(source
, pattern
):
712 # parse 're' replacement string into list of literals and
714 s
= Tokenizer(source
)
718 def literal(literal
, p
=p
, pappend
=a
):
719 if p
and p
[-1][0] is LITERAL
:
720 p
[-1] = LITERAL
, p
[-1][1] + literal
722 pappend((LITERAL
, literal
))
724 if isinstance(sep
, str):
731 break # end of replacement string
732 if this
and this
[0] == "\\":
741 raise error("unterminated group name")
746 raise error("bad group name")
750 raise error("negative group number")
753 raise error("bad character in group name")
755 index
= pattern
.groupindex
[name
]
757 raise IndexError("unknown group name")
760 if s
.next
in OCTDIGITS
:
762 if s
.next
in OCTDIGITS
:
764 literal(makechar(int(this
[1:], 8) & 0xff))
769 if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
770 s
.next
in OCTDIGITS
):
773 literal(makechar(int(this
[1:], 8) & 0xff))
775 a((MARK
, int(this
[1:])))
778 this
= makechar(ESCAPES
[this
][1])
784 # convert template to groups and literals lists
787 groupsappend
= groups
.append
788 literals
= [None] * len(p
)
789 if isinstance(source
, str):
792 # The tokenizer implicitly decodes bytes objects as latin-1, we must
793 # therefore re-encode the final representation.
794 encode
= lambda x
: x
.encode('latin1')
798 # literal[i] is already None
800 literals
[i
] = encode(s
)
802 return groups
, literals
804 def expand_template(template
, match
):
806 sep
= match
.string
[:0]
807 groups
, literals
= template
808 literals
= literals
[:]
810 for index
, group
in groups
:
811 literals
[index
] = s
= g(group
)
813 raise error("unmatched group")
815 raise error("invalid group reference")
816 return sep
.join(literals
)