#1153769: document PEP 237 changes to string formatting.
[python.git] / Lib / test / test_re.py
bloba2470cd6f27a78d949d8c2a7a120d8ec2cb84277
1 import sys
2 sys.path = ['.'] + sys.path
4 from test.test_support import verbose, run_unittest, catch_warning
5 import re
6 from re import Scanner
7 import sys, os, traceback
8 from weakref import proxy
10 # Misc tests from Tim Peters' re.doc
12 # WARNING: Don't change details in these tests if you don't know
13 # what you're doing. Some of these tests were carefully modeled to
14 # cover most of the code.
16 import unittest
18 class ReTests(unittest.TestCase):
20 def test_weakref(self):
21 s = 'QabbbcR'
22 x = re.compile('ab+c')
23 y = proxy(x)
24 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
26 def test_search_star_plus(self):
27 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
28 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
29 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
30 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
31 self.assertEqual(re.search('x', 'aaa'), None)
32 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
33 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
34 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
35 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
36 self.assertEqual(re.match('a+', 'xxx'), None)
38 def bump_num(self, matchobj):
39 int_value = int(matchobj.group(0))
40 return str(int_value + 1)
42 def test_basic_re_sub(self):
43 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
44 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
45 '9.3 -3 24x100y')
46 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
47 '9.3 -3 23x99y')
49 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
50 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
52 s = r"\1\1"
53 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
54 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
55 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
57 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
58 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
59 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
60 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
62 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
63 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
64 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
65 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
66 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
68 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
70 def test_bug_449964(self):
71 # fails for group followed by other escape
72 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
73 'xx\bxx\b')
75 def test_bug_449000(self):
76 # Test for sub() on escaped characters
77 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
78 'abc\ndef\n')
79 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
80 'abc\ndef\n')
81 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
82 'abc\ndef\n')
83 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
84 'abc\ndef\n')
86 def test_bug_1140(self):
87 # re.sub(x, y, u'') should return u'', not '', and
88 # re.sub(x, y, '') should return '', not u''.
89 # Also:
90 # re.sub(x, y, unicode(x)) should return unicode(y), and
91 # re.sub(x, y, str(x)) should return
92 # str(y) if isinstance(y, str) else unicode(y).
93 for x in 'x', u'x':
94 for y in 'y', u'y':
95 z = re.sub(x, y, u'')
96 self.assertEqual(z, u'')
97 self.assertEqual(type(z), unicode)
99 z = re.sub(x, y, '')
100 self.assertEqual(z, '')
101 self.assertEqual(type(z), str)
103 z = re.sub(x, y, unicode(x))
104 self.assertEqual(z, y)
105 self.assertEqual(type(z), unicode)
107 z = re.sub(x, y, str(x))
108 self.assertEqual(z, y)
109 self.assertEqual(type(z), type(y))
111 def test_bug_1661(self):
112 # Verify that flags do not get silently ignored with compiled patterns
113 pattern = re.compile('.')
114 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
115 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
116 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
117 self.assertRaises(ValueError, re.compile, pattern, re.I)
119 def test_sub_template_numeric_escape(self):
120 # bug 776311 and friends
121 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
122 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
123 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
124 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
125 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
126 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
127 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
129 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
130 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
132 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
133 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
134 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
135 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
136 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
138 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
139 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
141 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
142 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
143 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
144 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
145 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
146 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
147 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
148 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
149 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
150 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
151 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
152 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
154 # in python2.3 (etc), these loop endlessly in sre_parser.py
155 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
156 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
157 'xz8')
158 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
159 'xza')
161 def test_qualified_re_sub(self):
162 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
163 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
165 def test_bug_114660(self):
166 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
167 'hello there')
169 def test_bug_462270(self):
170 # Test for empty sub() behaviour, see SF bug #462270
171 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
172 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
174 def test_symbolic_refs(self):
175 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
176 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
177 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
178 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
179 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
180 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
181 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
182 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
183 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
185 def test_re_subn(self):
186 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
187 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
188 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
189 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
190 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
192 def test_re_split(self):
193 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
194 self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
195 self.assertEqual(re.split("(:*)", ":a:b::c"),
196 ['', ':', 'a', ':', 'b', '::', 'c'])
197 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
198 self.assertEqual(re.split("(:)*", ":a:b::c"),
199 ['', ':', 'a', ':', 'b', ':', 'c'])
200 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
201 ['', ':', 'a', ':b::', 'c'])
202 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
203 ['', None, ':', 'a', None, ':', '', 'b', None, '',
204 None, '::', 'c'])
205 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
206 ['', 'a', '', '', 'c'])
208 def test_qualified_re_split(self):
209 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
210 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
211 self.assertEqual(re.split("(:)", ":a:b::c", 2),
212 ['', ':', 'a', ':', 'b::c'])
213 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
214 ['', ':', 'a', ':', 'b::c'])
216 def test_re_findall(self):
217 self.assertEqual(re.findall(":+", "abc"), [])
218 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
219 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
220 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
221 (":", ":"),
222 (":", "::")])
224 def test_bug_117612(self):
225 self.assertEqual(re.findall(r"(a|(b))", "aba"),
226 [("a", ""),("b", "b"),("a", "")])
228 def test_re_match(self):
229 self.assertEqual(re.match('a', 'a').groups(), ())
230 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
231 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
232 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
233 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
235 pat = re.compile('((a)|(b))(c)?')
236 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
237 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
238 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
239 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
240 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
242 # A single group
243 m = re.match('(a)', 'a')
244 self.assertEqual(m.group(0), 'a')
245 self.assertEqual(m.group(0), 'a')
246 self.assertEqual(m.group(1), 'a')
247 self.assertEqual(m.group(1, 1), ('a', 'a'))
249 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
250 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
251 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
252 (None, 'b', None))
253 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
255 def test_re_groupref_exists(self):
256 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
257 ('(', 'a'))
258 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
259 (None, 'a'))
260 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
261 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
262 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
263 ('a', 'b'))
264 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
265 (None, 'd'))
266 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
267 (None, 'd'))
268 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
269 ('a', ''))
271 # Tests for bug #1177831: exercise groups other than the first group
272 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
273 self.assertEqual(p.match('abc').groups(),
274 ('a', 'b', 'c'))
275 self.assertEqual(p.match('ad').groups(),
276 ('a', None, 'd'))
277 self.assertEqual(p.match('abd'), None)
278 self.assertEqual(p.match('ac'), None)
281 def test_re_groupref(self):
282 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
283 ('|', 'a'))
284 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
285 (None, 'a'))
286 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
287 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
288 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
289 ('a', 'a'))
290 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
291 (None, None))
293 def test_groupdict(self):
294 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
295 'first second').groupdict(),
296 {'first':'first', 'second':'second'})
298 def test_expand(self):
299 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
300 "first second")
301 .expand(r"\2 \1 \g<second> \g<first>"),
302 "second first second first")
304 def test_repeat_minmax(self):
305 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
306 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
307 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
308 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
310 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
311 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
312 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
313 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
314 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
315 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
316 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
317 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
319 self.assertEqual(re.match("^x{1}$", "xxx"), None)
320 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
321 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
322 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
324 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
325 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
326 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
327 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
328 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
329 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
330 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
331 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
333 self.assertEqual(re.match("^x{}$", "xxx"), None)
334 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
336 def test_getattr(self):
337 self.assertEqual(re.match("(a)", "a").pos, 0)
338 self.assertEqual(re.match("(a)", "a").endpos, 1)
339 self.assertEqual(re.match("(a)", "a").string, "a")
340 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
341 self.assertNotEqual(re.match("(a)", "a").re, None)
343 def test_special_escapes(self):
344 self.assertEqual(re.search(r"\b(b.)\b",
345 "abcd abc bcd bx").group(1), "bx")
346 self.assertEqual(re.search(r"\B(b.)\B",
347 "abc bcd bc abxd").group(1), "bx")
348 self.assertEqual(re.search(r"\b(b.)\b",
349 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
350 self.assertEqual(re.search(r"\B(b.)\B",
351 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
352 self.assertEqual(re.search(r"\b(b.)\b",
353 "abcd abc bcd bx", re.UNICODE).group(1), "bx")
354 self.assertEqual(re.search(r"\B(b.)\B",
355 "abc bcd bc abxd", re.UNICODE).group(1), "bx")
356 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
357 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
358 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
359 self.assertEqual(re.search(r"\b(b.)\b",
360 u"abcd abc bcd bx").group(1), "bx")
361 self.assertEqual(re.search(r"\B(b.)\B",
362 u"abc bcd bc abxd").group(1), "bx")
363 self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
364 self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
365 self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
366 self.assertEqual(re.search(r"\d\D\w\W\s\S",
367 "1aa! a").group(0), "1aa! a")
368 self.assertEqual(re.search(r"\d\D\w\W\s\S",
369 "1aa! a", re.LOCALE).group(0), "1aa! a")
370 self.assertEqual(re.search(r"\d\D\w\W\s\S",
371 "1aa! a", re.UNICODE).group(0), "1aa! a")
373 def test_ignore_case(self):
374 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
375 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
377 def test_bigcharset(self):
378 self.assertEqual(re.match(u"([\u2222\u2223])",
379 u"\u2222").group(1), u"\u2222")
380 self.assertEqual(re.match(u"([\u2222\u2223])",
381 u"\u2222", re.UNICODE).group(1), u"\u2222")
383 def test_anyall(self):
384 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
385 "a\nb")
386 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
387 "a\n\nb")
389 def test_non_consuming(self):
390 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
391 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
392 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
393 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
394 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
395 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
396 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
398 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
399 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
400 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
401 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
403 def test_ignore_case(self):
404 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
405 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
406 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
407 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
408 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
409 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
410 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
411 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
413 def test_category(self):
414 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
416 def test_getlower(self):
417 import _sre
418 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
419 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
420 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
422 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
423 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
425 def test_not_literal(self):
426 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
427 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
429 def test_search_coverage(self):
430 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
431 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
433 def test_re_escape(self):
434 p=""
435 for i in range(0, 256):
436 p = p + chr(i)
437 self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
438 True)
439 self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
441 pat=re.compile(re.escape(p))
442 self.assertEqual(pat.match(p) is not None, True)
443 self.assertEqual(pat.match(p).span(), (0,256))
445 def test_pickling(self):
446 import pickle
447 self.pickle_test(pickle)
448 import cPickle
449 self.pickle_test(cPickle)
450 # old pickles expect the _compile() reconstructor in sre module
451 import warnings
452 with catch_warning():
453 warnings.filterwarnings("ignore", "The sre module is deprecated",
454 DeprecationWarning)
455 from sre import _compile
457 def pickle_test(self, pickle):
458 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
459 s = pickle.dumps(oldpat)
460 newpat = pickle.loads(s)
461 self.assertEqual(oldpat, newpat)
463 def test_constants(self):
464 self.assertEqual(re.I, re.IGNORECASE)
465 self.assertEqual(re.L, re.LOCALE)
466 self.assertEqual(re.M, re.MULTILINE)
467 self.assertEqual(re.S, re.DOTALL)
468 self.assertEqual(re.X, re.VERBOSE)
470 def test_flags(self):
471 for flag in [re.I, re.M, re.X, re.S, re.L]:
472 self.assertNotEqual(re.compile('^pattern$', flag), None)
474 def test_sre_character_literals(self):
475 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
476 self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
477 self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
478 self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
479 self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
480 self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
481 self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
482 self.assertRaises(re.error, re.match, "\911", "")
484 def test_sre_character_class_literals(self):
485 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
486 self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
487 self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
488 self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
489 self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
490 self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
491 self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
492 self.assertRaises(re.error, re.match, "[\911]", "")
494 def test_bug_113254(self):
495 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
496 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
497 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
499 def test_bug_527371(self):
500 # bug described in patches 527371/672491
501 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
502 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
503 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
504 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
505 self.assertEqual(re.match("((a))", "a").lastindex, 1)
507 def test_bug_545855(self):
508 # bug 545855 -- This pattern failed to cause a compile error as it
509 # should, instead provoking a TypeError.
510 self.assertRaises(re.error, re.compile, 'foo[a-')
512 def test_bug_418626(self):
513 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
514 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
515 # pattern '*?' on a long string.
516 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
517 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
518 20003)
519 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
520 # non-simple '*?' still used to hit the recursion limit, before the
521 # non-recursive scheme was implemented.
522 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
524 def test_bug_612074(self):
525 pat=u"["+re.escape(u"\u2039")+u"]"
526 self.assertEqual(re.compile(pat) and 1, 1)
528 def test_stack_overflow(self):
529 # nasty cases that used to overflow the straightforward recursive
530 # implementation of repeated groups.
531 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
532 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
533 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
535 def test_scanner(self):
536 def s_ident(scanner, token): return token
537 def s_operator(scanner, token): return "op%s" % token
538 def s_float(scanner, token): return float(token)
539 def s_int(scanner, token): return int(token)
541 scanner = Scanner([
542 (r"[a-zA-Z_]\w*", s_ident),
543 (r"\d+\.\d*", s_float),
544 (r"\d+", s_int),
545 (r"=|\+|-|\*|/", s_operator),
546 (r"\s+", None),
549 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
551 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
552 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
553 'op+', 'bar'], ''))
555 def test_bug_448951(self):
556 # bug 448951 (similar to 429357, but with single char match)
557 # (Also test greedy matches.)
558 for op in '','?','*':
559 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
560 (None, None))
561 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
562 ('a:', 'a'))
564 def test_bug_725106(self):
565 # capturing groups in alternatives in repeats
566 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
567 ('b', 'a'))
568 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
569 ('c', 'b'))
570 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
571 ('b', None))
572 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
573 ('b', None))
574 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
575 ('b', 'a'))
576 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
577 ('c', 'b'))
578 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
579 ('b', None))
580 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
581 ('b', None))
583 def test_bug_725149(self):
584 # mark_stack_base restoring before restoring marks
585 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
586 ('a', None))
587 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
588 ('a', None, None))
590 def test_bug_764548(self):
591 # bug 764548, re.compile() barfs on str/unicode subclasses
592 try:
593 unicode
594 except NameError:
595 return # no problem if we have no unicode
596 class my_unicode(unicode): pass
597 pat = re.compile(my_unicode("abc"))
598 self.assertEqual(pat.match("xyz"), None)
600 def test_finditer(self):
601 iter = re.finditer(r":+", "a:b::c:::d")
602 self.assertEqual([item.group(0) for item in iter],
603 [":", "::", ":::"])
605 def test_bug_926075(self):
606 try:
607 unicode
608 except NameError:
609 return # no problem if we have no unicode
610 self.assert_(re.compile('bug_926075') is not
611 re.compile(eval("u'bug_926075'")))
613 def test_bug_931848(self):
614 try:
615 unicode
616 except NameError:
617 pass
618 pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"')
619 self.assertEqual(re.compile(pattern).split("a.b.c"),
620 ['a','b','c'])
622 def test_bug_581080(self):
623 iter = re.finditer(r"\s", "a b")
624 self.assertEqual(iter.next().span(), (1,2))
625 self.assertRaises(StopIteration, iter.next)
627 scanner = re.compile(r"\s").scanner("a b")
628 self.assertEqual(scanner.search().span(), (1, 2))
629 self.assertEqual(scanner.search(), None)
631 def test_bug_817234(self):
632 iter = re.finditer(r".*", "asdf")
633 self.assertEqual(iter.next().span(), (0, 4))
634 self.assertEqual(iter.next().span(), (4, 4))
635 self.assertRaises(StopIteration, iter.next)
637 def test_empty_array(self):
638 # SF buf 1647541
639 import array
640 for typecode in 'cbBuhHiIlLfd':
641 a = array.array(typecode)
642 self.assertEqual(re.compile("bla").match(a), None)
643 self.assertEqual(re.compile("").match(a).groups(), ())
645 def test_inline_flags(self):
646 # Bug #1700
647 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
648 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
650 p = re.compile(upper_char, re.I | re.U)
651 q = p.match(lower_char)
652 self.assertNotEqual(q, None)
654 p = re.compile(lower_char, re.I | re.U)
655 q = p.match(upper_char)
656 self.assertNotEqual(q, None)
658 p = re.compile('(?i)' + upper_char, re.U)
659 q = p.match(lower_char)
660 self.assertNotEqual(q, None)
662 p = re.compile('(?i)' + lower_char, re.U)
663 q = p.match(upper_char)
664 self.assertNotEqual(q, None)
666 p = re.compile('(?iu)' + upper_char)
667 q = p.match(lower_char)
668 self.assertNotEqual(q, None)
670 p = re.compile('(?iu)' + lower_char)
671 q = p.match(upper_char)
672 self.assertNotEqual(q, None)
674 def test_dollar_matches_twice(self):
675 "$ matches the end of string, and just before the terminating \n"
676 pattern = re.compile('$')
677 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
678 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
679 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
681 pattern = re.compile('$', re.MULTILINE)
682 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
683 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
684 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
687 def run_re_tests():
688 from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
689 if verbose:
690 print 'Running re_tests test suite'
691 else:
692 # To save time, only run the first and last 10 tests
693 #tests = tests[:10] + tests[-10:]
694 pass
696 for t in tests:
697 sys.stdout.flush()
698 pattern = s = outcome = repl = expected = None
699 if len(t) == 5:
700 pattern, s, outcome, repl, expected = t
701 elif len(t) == 3:
702 pattern, s, outcome = t
703 else:
704 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
706 try:
707 obj = re.compile(pattern)
708 except re.error:
709 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
710 else:
711 print '=== Syntax error:', t
712 except KeyboardInterrupt: raise KeyboardInterrupt
713 except:
714 print '*** Unexpected error ***', t
715 if verbose:
716 traceback.print_exc(file=sys.stdout)
717 else:
718 try:
719 result = obj.search(s)
720 except re.error, msg:
721 print '=== Unexpected exception', t, repr(msg)
722 if outcome == SYNTAX_ERROR:
723 # This should have been a syntax error; forget it.
724 pass
725 elif outcome == FAIL:
726 if result is None: pass # No match, as expected
727 else: print '=== Succeeded incorrectly', t
728 elif outcome == SUCCEED:
729 if result is not None:
730 # Matched, as expected, so now we compute the
731 # result string and compare it to our expected result.
732 start, end = result.span(0)
733 vardict={'found': result.group(0),
734 'groups': result.group(),
735 'flags': result.re.flags}
736 for i in range(1, 100):
737 try:
738 gi = result.group(i)
739 # Special hack because else the string concat fails:
740 if gi is None:
741 gi = "None"
742 except IndexError:
743 gi = "Error"
744 vardict['g%d' % i] = gi
745 for i in result.re.groupindex.keys():
746 try:
747 gi = result.group(i)
748 if gi is None:
749 gi = "None"
750 except IndexError:
751 gi = "Error"
752 vardict[i] = gi
753 repl = eval(repl, vardict)
754 if repl != expected:
755 print '=== grouping error', t,
756 print repr(repl) + ' should be ' + repr(expected)
757 else:
758 print '=== Failed incorrectly', t
760 # Try the match on a unicode string, and check that it
761 # still succeeds.
762 try:
763 result = obj.search(unicode(s, "latin-1"))
764 if result is None:
765 print '=== Fails on unicode match', t
766 except NameError:
767 continue # 1.5.2
768 except TypeError:
769 continue # unicode test case
771 # Try the match on a unicode pattern, and check that it
772 # still succeeds.
773 obj=re.compile(unicode(pattern, "latin-1"))
774 result = obj.search(s)
775 if result is None:
776 print '=== Fails on unicode pattern match', t
778 # Try the match with the search area limited to the extent
779 # of the match and see if it still succeeds. \B will
780 # break (because it won't match at the end or start of a
781 # string), so we'll ignore patterns that feature it.
783 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
784 and result is not None:
785 obj = re.compile(pattern)
786 result = obj.search(s, result.start(0), result.end(0) + 1)
787 if result is None:
788 print '=== Failed on range-limited match', t
790 # Try the match with IGNORECASE enabled, and check that it
791 # still succeeds.
792 obj = re.compile(pattern, re.IGNORECASE)
793 result = obj.search(s)
794 if result is None:
795 print '=== Fails on case-insensitive match', t
797 # Try the match with LOCALE enabled, and check that it
798 # still succeeds.
799 obj = re.compile(pattern, re.LOCALE)
800 result = obj.search(s)
801 if result is None:
802 print '=== Fails on locale-sensitive match', t
804 # Try the match with UNICODE locale enabled, and check
805 # that it still succeeds.
806 obj = re.compile(pattern, re.UNICODE)
807 result = obj.search(s)
808 if result is None:
809 print '=== Fails on unicode-sensitive match', t
def test_main():
    """Run ReTests through run_unittest(), then call run_re_tests()."""
    run_unittest(ReTests)
    run_re_tests()
# Run the whole test module when this file is executed as a script.
if __name__ == "__main__":
    test_main()