Lib/test/test_re.py

   1 import sys
   2 sys.path = ['.'] + sys.path
   3
   4 from test.test_support import verbose, run_unittest, catch_warning
   5 import re
   6 from re import Scanner
   7 import sys, os, traceback
   8 from weakref import proxy
   9
  10 # Misc tests from Tim Peters' re.doc
  11
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
  15
  16 import unittest
  17
  18 class ReTests(unittest.TestCase):
  19
  20     def test_weakref(self):
  21         s = 'QabbbcR'
  22         x = re.compile('ab+c')
  23         y = proxy(x)
  24         self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
  25
  26     def test_search_star_plus(self):
  27         self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
  28         self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
  29         self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
  30         self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
  31         self.assertEqual(re.search('x', 'aaa'), None)
  32         self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
  33         self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
  34         self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
  35         self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
  36         self.assertEqual(re.match('a+', 'xxx'), None)
  37
  38     def bump_num(self, matchobj):
  39         int_value = int(matchobj.group(0))
  40         return str(int_value + 1)
  41
  42     def test_basic_re_sub(self):
  43         self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
  44         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
  45                          '9.3 -3 24x100y')
  46         self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
  47                          '9.3 -3 23x99y')
  48
  49         self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
  50         self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
  51
  52         s = r"\1\1"
  53         self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
  54         self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
  55         self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
  56
  57         self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
  58         self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
  59         self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
  60         self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
  61
  62         self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
  63                          '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
  64         self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
  65         self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
  66                          (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
  67
  68         self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
  69
  70     def test_bug_449964(self):
  71         # fails for group followed by other escape
  72         self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
  73                          'xx\bxx\b')
  74
  75     def test_bug_449000(self):
  76         # Test for sub() on escaped characters
  77         self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
  78                          'abc\ndef\n')
  79         self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
  80                          'abc\ndef\n')
  81         self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
  82                          'abc\ndef\n')
  83         self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
  84                          'abc\ndef\n')
  85
  86     def test_bug_1140(self):
  87         # re.sub(x, y, u'') should return u'', not '', and
  88         # re.sub(x, y, '') should return '', not u''.
  89         # Also:
  90         # re.sub(x, y, unicode(x)) should return unicode(y), and
  91         # re.sub(x, y, str(x)) should return
  92         #     str(y) if isinstance(y, str) else unicode(y).
  93         for x in 'x', u'x':
  94             for y in 'y', u'y':
  95                 z = re.sub(x, y, u'')
  96                 self.assertEqual(z, u'')
  97                 self.assertEqual(type(z), unicode)
  98                 #
  99                 z = re.sub(x, y, '')
 100                 self.assertEqual(z, '')
 101                 self.assertEqual(type(z), str)
 102                 #
 103                 z = re.sub(x, y, unicode(x))
 104                 self.assertEqual(z, y)
 105                 self.assertEqual(type(z), unicode)
 106                 #
 107                 z = re.sub(x, y, str(x))
 108                 self.assertEqual(z, y)
 109                 self.assertEqual(type(z), type(y))
 110
 111     def test_bug_1661(self):
 112         # Verify that flags do not get silently ignored with compiled patterns
 113         pattern = re.compile('.')
 114         self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
 115         self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
 116         self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
 117         self.assertRaises(ValueError, re.compile, pattern, re.I)
 118
 119     def test_sub_template_numeric_escape(self):
 120         # bug 776311 and friends
 121         self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
 122         self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
 123         self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
 124         self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
 125         self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
 126         self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
 127         self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
 128
 129         self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
 130         self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
 131
 132         self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
 133         self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
 134         self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
 135         self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
 136         self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
 137
 138         self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
 139         self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
 140
 141         self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
 142         self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
 143         self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
 144         self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
 145         self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
 146         self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
 147         self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
 148         self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
 149         self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
 150         self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
 151         self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
 152         self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
 153
 154         # in python2.3 (etc), these loop endlessly in sre_parser.py
 155         self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
 156         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
 157                          'xz8')
 158         self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
 159                          'xza')
 160
 161     def test_qualified_re_sub(self):
 162         self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
 163         self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
 164
 165     def test_bug_114660(self):
 166         self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
 167                          'hello there')
 168
 169     def test_bug_462270(self):
 170         # Test for empty sub() behaviour, see SF bug #462270
 171         self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
 172         self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
 173
 174     def test_symbolic_refs(self):
 175         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
 176         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
 177         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
 178         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
 179         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
 180         self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
 181         self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
 182         self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
 183         self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
 184
 185     def test_re_subn(self):
 186         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
 187         self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
 188         self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
 189         self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
 190         self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
 191
 192     def test_re_split(self):
 193         self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
 194         self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
 195         self.assertEqual(re.split("(:*)", ":a:b::c"),
 196                          ['', ':', 'a', ':', 'b', '::', 'c'])
 197         self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
 198         self.assertEqual(re.split("(:)*", ":a:b::c"),
 199                          ['', ':', 'a', ':', 'b', ':', 'c'])
 200         self.assertEqual(re.split("([b:]+)", ":a:b::c"),
 201                          ['', ':', 'a', ':b::', 'c'])
 202         self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
 203                          ['', None, ':', 'a', None, ':', '', 'b', None, '',
 204                           None, '::', 'c'])
 205         self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
 206                          ['', 'a', '', '', 'c'])
 207
 208     def test_qualified_re_split(self):
 209         self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
 210         self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
 211         self.assertEqual(re.split("(:)", ":a:b::c", 2),
 212                          ['', ':', 'a', ':', 'b::c'])
 213         self.assertEqual(re.split("(:*)", ":a:b::c", 2),
 214                          ['', ':', 'a', ':', 'b::c'])
 215
 216     def test_re_findall(self):
 217         self.assertEqual(re.findall(":+", "abc"), [])
 218         self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
 219         self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
 220         self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
 221                                                                (":", ":"),
 222                                                                (":", "::")])
 223
 224     def test_bug_117612(self):
 225         self.assertEqual(re.findall(r"(a|(b))", "aba"),
 226                          [("a", ""),("b", "b"),("a", "")])
 227
 228     def test_re_match(self):
 229         self.assertEqual(re.match('a', 'a').groups(), ())
 230         self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
 231         self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
 232         self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
 233         self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
 234
 235         pat = re.compile('((a)|(b))(c)?')
 236         self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
 237         self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
 238         self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
 239         self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
 240         self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
 241
 242         # A single group
 243         m = re.match('(a)', 'a')
 244         self.assertEqual(m.group(0), 'a')
 245         self.assertEqual(m.group(0), 'a')
 246         self.assertEqual(m.group(1), 'a')
 247         self.assertEqual(m.group(1, 1), ('a', 'a'))
 248
 249         pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
 250         self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
 251         self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
 252                          (None, 'b', None))
 253         self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
 254
 255     def test_re_groupref_exists(self):
 256         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
 257                          ('(', 'a'))
 258         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
 259                          (None, 'a'))
 260         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
 261         self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
 262         self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
 263                          ('a', 'b'))
 264         self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
 265                          (None, 'd'))
 266         self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
 267                          (None, 'd'))
 268         self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
 269                          ('a', ''))
 270
 271         # Tests for bug #1177831: exercise groups other than the first group
 272         p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
 273         self.assertEqual(p.match('abc').groups(),
 274                          ('a', 'b', 'c'))
 275         self.assertEqual(p.match('ad').groups(),
 276                          ('a', None, 'd'))
 277         self.assertEqual(p.match('abd'), None)
 278         self.assertEqual(p.match('ac'), None)
 279
 280
 281     def test_re_groupref(self):
 282         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
 283                          ('|', 'a'))
 284         self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
 285                          (None, 'a'))
 286         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
 287         self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
 288         self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
 289                          ('a', 'a'))
 290         self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
 291                          (None, None))
 292
 293     def test_groupdict(self):
 294         self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
 295                                   'first second').groupdict(),
 296                          {'first':'first', 'second':'second'})
 297
 298     def test_expand(self):
 299         self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
 300                                   "first second")
 301                                   .expand(r"\2 \1 \g<second> \g<first>"),
 302                          "second first second first")
 303
 304     def test_repeat_minmax(self):
 305         self.assertEqual(re.match("^(\w){1}$", "abc"), None)
 306         self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
 307         self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
 308         self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
 309
 310         self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
 311         self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
 312         self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
 313         self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
 314         self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
 315         self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
 316         self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
 317         self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
 318
 319         self.assertEqual(re.match("^x{1}$", "xxx"), None)
 320         self.assertEqual(re.match("^x{1}?$", "xxx"), None)
 321         self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
 322         self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
 323
 324         self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
 325         self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
 326         self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
 327         self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
 328         self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
 329         self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
 330         self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
 331         self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
 332
 333         self.assertEqual(re.match("^x{}$", "xxx"), None)
 334         self.assertNotEqual(re.match("^x{}$", "x{}"), None)
 335
 336     def test_getattr(self):
 337         self.assertEqual(re.match("(a)", "a").pos, 0)
 338         self.assertEqual(re.match("(a)", "a").endpos, 1)
 339         self.assertEqual(re.match("(a)", "a").string, "a")
 340         self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
 341         self.assertNotEqual(re.match("(a)", "a").re, None)
 342
 343     def test_special_escapes(self):
 344         self.assertEqual(re.search(r"\b(b.)\b",
 345                                    "abcd abc bcd bx").group(1), "bx")
 346         self.assertEqual(re.search(r"\B(b.)\B",
 347                                    "abc bcd bc abxd").group(1), "bx")
 348         self.assertEqual(re.search(r"\b(b.)\b",
 349                                    "abcd abc bcd bx", re.LOCALE).group(1), "bx")
 350         self.assertEqual(re.search(r"\B(b.)\B",
 351                                    "abc bcd bc abxd", re.LOCALE).group(1), "bx")
 352         self.assertEqual(re.search(r"\b(b.)\b",
 353                                    "abcd abc bcd bx", re.UNICODE).group(1), "bx")
 354         self.assertEqual(re.search(r"\B(b.)\B",
 355                                    "abc bcd bc abxd", re.UNICODE).group(1), "bx")
 356         self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
 357         self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
 358         self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
 359         self.assertEqual(re.search(r"\b(b.)\b",
 360                                    u"abcd abc bcd bx").group(1), "bx")
 361         self.assertEqual(re.search(r"\B(b.)\B",
 362                                    u"abc bcd bc abxd").group(1), "bx")
 363         self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
 364         self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
 365         self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
 366         self.assertEqual(re.search(r"\d\D\w\W\s\S",
 367                                    "1aa! a").group(0), "1aa! a")
 368         self.assertEqual(re.search(r"\d\D\w\W\s\S",
 369                                    "1aa! a", re.LOCALE).group(0), "1aa! a")
 370         self.assertEqual(re.search(r"\d\D\w\W\s\S",
 371                                    "1aa! a", re.UNICODE).group(0), "1aa! a")
 372
    def test_ignore_case(self):
        # IGNORECASE matching of a lower-case pattern against an upper-case
        # str subject and an upper-case unicode subject.
        # NOTE(review): a second method named test_ignore_case is defined
        # further down in this class; that later definition replaces this
        # one in the class namespace, so this body does not run under this
        # name.
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
 376
 377     def test_bigcharset(self):
 378         self.assertEqual(re.match(u"([\u2222\u2223])",
 379                                   u"\u2222").group(1), u"\u2222")
 380         self.assertEqual(re.match(u"([\u2222\u2223])",
 381                                   u"\u2222", re.UNICODE).group(1), u"\u2222")
 382
 383     def test_anyall(self):
 384         self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
 385                          "a\nb")
 386         self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
 387                          "a\n\nb")
 388
 389     def test_non_consuming(self):
 390         self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
 391         self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
 392         self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
 393         self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
 394         self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
 395         self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
 396         self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
 397
 398         self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
 399         self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
 400         self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
 401         self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
 402
 403     def test_ignore_case(self):
 404         self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
 405         self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
 406         self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
 407         self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
 408         self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
 409         self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
 410         self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
 411         self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
 412
 413     def test_category(self):
 414         self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
 415
 416     def test_getlower(self):
 417         import _sre
 418         self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
 419         self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
 420         self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
 421
 422         self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
 423         self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
 424
 425     def test_not_literal(self):
 426         self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
 427         self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
 428
 429     def test_search_coverage(self):
 430         self.assertEqual(re.search("\s(b)", " b").group(1), "b")
 431         self.assertEqual(re.search("a\s", "a ").group(0), "a ")
 432
 433     def test_re_escape(self):
 434         p=""
 435         for i in range(0, 256):
 436             p = p + chr(i)
 437             self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
 438                              True)
 439             self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
 440
 441         pat=re.compile(re.escape(p))
 442         self.assertEqual(pat.match(p) is not None, True)
 443         self.assertEqual(pat.match(p).span(), (0,256))
 444
 445     def test_pickling(self):
 446         import pickle
 447         self.pickle_test(pickle)
 448         import cPickle
 449         self.pickle_test(cPickle)
 450         # old pickles expect the _compile() reconstructor in sre module
 451         import warnings
 452         with catch_warning():
 453             warnings.filterwarnings("ignore", "The sre module is deprecated",
 454                                     DeprecationWarning)
 455             from sre import _compile
 456
 457     def pickle_test(self, pickle):
 458         oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
 459         s = pickle.dumps(oldpat)
 460         newpat = pickle.loads(s)
 461         self.assertEqual(oldpat, newpat)
 462
 463     def test_constants(self):
 464         self.assertEqual(re.I, re.IGNORECASE)
 465         self.assertEqual(re.L, re.LOCALE)
 466         self.assertEqual(re.M, re.MULTILINE)
 467         self.assertEqual(re.S, re.DOTALL)
 468         self.assertEqual(re.X, re.VERBOSE)
 469
 470     def test_flags(self):
 471         for flag in [re.I, re.M, re.X, re.S, re.L]:
 472             self.assertNotEqual(re.compile('^pattern$', flag), None)
 473
 474     def test_sre_character_literals(self):
 475         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
 476             self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
 477             self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
 478             self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
 479             self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
 480             self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
 481             self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
 482         self.assertRaises(re.error, re.match, "\911", "")
 483
 484     def test_sre_character_class_literals(self):
 485         for i in [0, 8, 16, 32, 64, 127, 128, 255]:
 486             self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
 487             self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
 488             self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
 489             self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
 490             self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
 491             self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
 492         self.assertRaises(re.error, re.match, "[\911]", "")
 493
 494     def test_bug_113254(self):
 495         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
 496         self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
 497         self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
 498
 499     def test_bug_527371(self):
 500         # bug described in patches 527371/672491
 501         self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
 502         self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
 503         self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
 504         self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
 505         self.assertEqual(re.match("((a))", "a").lastindex, 1)
 506
 507     def test_bug_545855(self):
 508         # bug 545855 -- This pattern failed to cause a compile error as it
 509         # should, instead provoking a TypeError.
 510         self.assertRaises(re.error, re.compile, 'foo[a-')
 511
 512     def test_bug_418626(self):
 513         # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
 514         # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
 515         # pattern '*?' on a long string.
 516         self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
 517         self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
 518                          20003)
 519         self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
 520         # non-simple '*?' still used to hit the recursion limit, before the
 521         # non-recursive scheme was implemented.
 522         self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
 523
 524     def test_bug_612074(self):
 525         pat=u"["+re.escape(u"\u2039")+u"]"
 526         self.assertEqual(re.compile(pat) and 1, 1)
 527
 528     def test_stack_overflow(self):
 529         # nasty cases that used to overflow the straightforward recursive
 530         # implementation of repeated groups.
 531         self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
 532         self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
 533         self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
 534
 535     def test_scanner(self):
 536         def s_ident(scanner, token): return token
 537         def s_operator(scanner, token): return "op%s" % token
 538         def s_float(scanner, token): return float(token)
 539         def s_int(scanner, token): return int(token)
 540
 541         scanner = Scanner([
 542             (r"[a-zA-Z_]\w*", s_ident),
 543             (r"\d+\.\d*", s_float),
 544             (r"\d+", s_int),
 545             (r"=|\+|-|\*|/", s_operator),
 546             (r"\s+", None),
 547             ])
 548
 549         self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
 550
 551         self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
 552                          (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
 553                            'op+', 'bar'], ''))
 554
 555     def test_bug_448951(self):
 556         # bug 448951 (similar to 429357, but with single char match)
 557         # (Also test greedy matches.)
 558         for op in '','?','*':
 559             self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
 560                              (None, None))
 561             self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
 562                              ('a:', 'a'))
 563
 564     def test_bug_725106(self):
 565         # capturing groups in alternatives in repeats
 566         self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
 567                          ('b', 'a'))
 568         self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
 569                          ('c', 'b'))
 570         self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
 571                          ('b', None))
 572         self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
 573                          ('b', None))
 574         self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
 575                          ('b', 'a'))
 576         self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
 577                          ('c', 'b'))
 578         self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
 579                          ('b', None))
 580         self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
 581                          ('b', None))
 582
 583     def test_bug_725149(self):
 584         # mark_stack_base restoring before restoring marks
 585         self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
 586                          ('a', None))
 587         self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
 588                          ('a', None, None))
 589
 590     def test_bug_764548(self):
 591         # bug 764548, re.compile() barfs on str/unicode subclasses
 592         try:
 593             unicode
 594         except NameError:
 595             return  # no problem if we have no unicode
 596         class my_unicode(unicode): pass
 597         pat = re.compile(my_unicode("abc"))
 598         self.assertEqual(pat.match("xyz"), None)
 599
 600     def test_finditer(self):
 601         iter = re.finditer(r":+", "a:b::c:::d")
 602         self.assertEqual([item.group(0) for item in iter],
 603                          [":", "::", ":::"])
 604
 605     def test_bug_926075(self):
 606         try:
 607             unicode
 608         except NameError:
 609             return # no problem if we have no unicode
 610         self.assert_(re.compile('bug_926075') is not
 611                      re.compile(eval("u'bug_926075'")))
 612
 613     def test_bug_931848(self):
 614         try:
 615             unicode
 616         except NameError:
 617             pass
 618         pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"')
 619         self.assertEqual(re.compile(pattern).split("a.b.c"),
 620                          ['a','b','c'])
 621
 622     def test_bug_581080(self):
 623         iter = re.finditer(r"\s", "a b")
 624         self.assertEqual(iter.next().span(), (1,2))
 625         self.assertRaises(StopIteration, iter.next)
 626
 627         scanner = re.compile(r"\s").scanner("a b")
 628         self.assertEqual(scanner.search().span(), (1, 2))
 629         self.assertEqual(scanner.search(), None)
 630
 631     def test_bug_817234(self):
 632         iter = re.finditer(r".*", "asdf")
 633         self.assertEqual(iter.next().span(), (0, 4))
 634         self.assertEqual(iter.next().span(), (4, 4))
 635         self.assertRaises(StopIteration, iter.next)
 636
 637     def test_empty_array(self):
 638         # SF buf 1647541
 639         import array
 640         for typecode in 'cbBuhHiIlLfd':
 641             a = array.array(typecode)
 642             self.assertEqual(re.compile("bla").match(a), None)
 643             self.assertEqual(re.compile("").match(a).groups(), ())
 644
 645     def test_inline_flags(self):
 646         # Bug #1700
 647         upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
 648         lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
 649
 650         p = re.compile(upper_char, re.I | re.U)
 651         q = p.match(lower_char)
 652         self.assertNotEqual(q, None)
 653
 654         p = re.compile(lower_char, re.I | re.U)
 655         q = p.match(upper_char)
 656         self.assertNotEqual(q, None)
 657
 658         p = re.compile('(?i)' + upper_char, re.U)
 659         q = p.match(lower_char)
 660         self.assertNotEqual(q, None)
 661
 662         p = re.compile('(?i)' + lower_char, re.U)
 663         q = p.match(upper_char)
 664         self.assertNotEqual(q, None)
 665
 666         p = re.compile('(?iu)' + upper_char)
 667         q = p.match(lower_char)
 668         self.assertNotEqual(q, None)
 669
 670         p = re.compile('(?iu)' + lower_char)
 671         q = p.match(upper_char)
 672         self.assertNotEqual(q, None)
 673
 674     def test_dollar_matches_twice(self):
 675         "$ matches the end of string, and just before the terminating \n"
 676         pattern = re.compile('$')
 677         self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
 678         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
 679         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
 680
 681         pattern = re.compile('$', re.MULTILINE)
 682         self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
 683         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
 684         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
 685
 686
 687 def run_re_tests():
 688     from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
 689     if verbose:
 690         print 'Running re_tests test suite'
 691     else:
 692         # To save time, only run the first and last 10 tests
 693         #tests = tests[:10] + tests[-10:]
 694         pass
 695
 696     for t in tests:
 697         sys.stdout.flush()
 698         pattern = s = outcome = repl = expected = None
 699         if len(t) == 5:
 700             pattern, s, outcome, repl, expected = t
 701         elif len(t) == 3:
 702             pattern, s, outcome = t
 703         else:
 704             raise ValueError, ('Test tuples should have 3 or 5 fields', t)
 705
 706         try:
 707             obj = re.compile(pattern)
 708         except re.error:
 709             if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
 710             else:
 711                 print '=== Syntax error:', t
 712         except KeyboardInterrupt: raise KeyboardInterrupt
 713         except:
 714             print '*** Unexpected error ***', t
 715             if verbose:
 716                 traceback.print_exc(file=sys.stdout)
 717         else:
 718             try:
 719                 result = obj.search(s)
 720             except re.error, msg:
 721                 print '=== Unexpected exception', t, repr(msg)
 722             if outcome == SYNTAX_ERROR:
 723                 # This should have been a syntax error; forget it.
 724                 pass
 725             elif outcome == FAIL:
 726                 if result is None: pass   # No match, as expected
 727                 else: print '=== Succeeded incorrectly', t
 728             elif outcome == SUCCEED:
 729                 if result is not None:
 730                     # Matched, as expected, so now we compute the
 731                     # result string and compare it to our expected result.
 732                     start, end = result.span(0)
 733                     vardict={'found': result.group(0),
 734                              'groups': result.group(),
 735                              'flags': result.re.flags}
 736                     for i in range(1, 100):
 737                         try:
 738                             gi = result.group(i)
 739                             # Special hack because else the string concat fails:
 740                             if gi is None:
 741                                 gi = "None"
 742                         except IndexError:
 743                             gi = "Error"
 744                         vardict['g%d' % i] = gi
 745                     for i in result.re.groupindex.keys():
 746                         try:
 747                             gi = result.group(i)
 748                             if gi is None:
 749                                 gi = "None"
 750                         except IndexError:
 751                             gi = "Error"
 752                         vardict[i] = gi
 753                     repl = eval(repl, vardict)
 754                     if repl != expected:
 755                         print '=== grouping error', t,
 756                         print repr(repl) + ' should be ' + repr(expected)
 757                 else:
 758                     print '=== Failed incorrectly', t
 759
 760                 # Try the match on a unicode string, and check that it
 761                 # still succeeds.
 762                 try:
 763                     result = obj.search(unicode(s, "latin-1"))
 764                     if result is None:
 765                         print '=== Fails on unicode match', t
 766                 except NameError:
 767                     continue # 1.5.2
 768                 except TypeError:
 769                     continue # unicode test case
 770
 771                 # Try the match on a unicode pattern, and check that it
 772                 # still succeeds.
 773                 obj=re.compile(unicode(pattern, "latin-1"))
 774                 result = obj.search(s)
 775                 if result is None:
 776                     print '=== Fails on unicode pattern match', t
 777
 778                 # Try the match with the search area limited to the extent
 779                 # of the match and see if it still succeeds.  \B will
 780                 # break (because it won't match at the end or start of a
 781                 # string), so we'll ignore patterns that feature it.
 782
 783                 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
 784                                and result is not None:
 785                     obj = re.compile(pattern)
 786                     result = obj.search(s, result.start(0), result.end(0) + 1)
 787                     if result is None:
 788                         print '=== Failed on range-limited match', t
 789
 790                 # Try the match with IGNORECASE enabled, and check that it
 791                 # still succeeds.
 792                 obj = re.compile(pattern, re.IGNORECASE)
 793                 result = obj.search(s)
 794                 if result is None:
 795                     print '=== Fails on case-insensitive match', t
 796
 797                 # Try the match with LOCALE enabled, and check that it
 798                 # still succeeds.
 799                 obj = re.compile(pattern, re.LOCALE)
 800                 result = obj.search(s)
 801                 if result is None:
 802                     print '=== Fails on locale-sensitive match', t
 803
 804                 # Try the match with UNICODE locale enabled, and check
 805                 # that it still succeeds.
 806                 obj = re.compile(pattern, re.UNICODE)
 807                 result = obj.search(s)
 808                 if result is None:
 809                     print '=== Fails on unicode-sensitive match', t
 810
def test_main():
    """Run the ReTests unit tests, then the table-driven run_re_tests()."""
    run_unittest(ReTests)
    run_re_tests()
 814
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()