move sections
[python/dscho.git] / Lib / test / test_re.py
blobf978086dbc738d995f150f6771c224820f721f8a
1 from test.test_support import verbose, run_unittest, import_module
2 import re
3 from re import Scanner
4 import sys, traceback
5 from weakref import proxy
7 # Misc tests from Tim Peters' re.doc
9 # WARNING: Don't change details in these tests if you don't know
10 # what you're doing. Some of these tests were carefully modeled to
11 # cover most of the code.
13 import unittest
15 class ReTests(unittest.TestCase):
17 def test_weakref(self):
18 s = 'QabbbcR'
19 x = re.compile('ab+c')
20 y = proxy(x)
21 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
23 def test_search_star_plus(self):
24 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
25 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
26 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
27 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
28 self.assertEqual(re.search('x', 'aaa'), None)
29 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
30 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
31 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
32 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
33 self.assertEqual(re.match('a+', 'xxx'), None)
35 def bump_num(self, matchobj):
36 int_value = int(matchobj.group(0))
37 return str(int_value + 1)
39 def test_basic_re_sub(self):
40 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
41 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
42 '9.3 -3 24x100y')
43 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
44 '9.3 -3 23x99y')
46 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
47 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
49 s = r"\1\1"
50 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
51 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
52 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
54 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
55 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
56 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
57 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
59 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
60 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
61 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
62 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
63 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
65 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
67 def test_bug_449964(self):
68 # fails for group followed by other escape
69 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
70 'xx\bxx\b')
72 def test_bug_449000(self):
73 # Test for sub() on escaped characters
74 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
75 'abc\ndef\n')
76 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
77 'abc\ndef\n')
78 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
79 'abc\ndef\n')
80 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
81 'abc\ndef\n')
83 def test_bug_1140(self):
84 # re.sub(x, y, u'') should return u'', not '', and
85 # re.sub(x, y, '') should return '', not u''.
86 # Also:
87 # re.sub(x, y, unicode(x)) should return unicode(y), and
88 # re.sub(x, y, str(x)) should return
89 # str(y) if isinstance(y, str) else unicode(y).
90 for x in 'x', u'x':
91 for y in 'y', u'y':
92 z = re.sub(x, y, u'')
93 self.assertEqual(z, u'')
94 self.assertEqual(type(z), unicode)
96 z = re.sub(x, y, '')
97 self.assertEqual(z, '')
98 self.assertEqual(type(z), str)
100 z = re.sub(x, y, unicode(x))
101 self.assertEqual(z, y)
102 self.assertEqual(type(z), unicode)
104 z = re.sub(x, y, str(x))
105 self.assertEqual(z, y)
106 self.assertEqual(type(z), type(y))
108 def test_bug_1661(self):
109 # Verify that flags do not get silently ignored with compiled patterns
110 pattern = re.compile('.')
111 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
112 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
113 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
114 self.assertRaises(ValueError, re.compile, pattern, re.I)
116 def test_bug_3629(self):
117 # A regex that triggered a bug in the sre-code validator
118 re.compile("(?P<quote>)(?(quote))")
120 def test_sub_template_numeric_escape(self):
121 # bug 776311 and friends
122 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
123 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
124 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
125 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
126 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
127 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
128 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
130 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
131 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
133 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
134 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
135 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
136 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
137 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
139 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
140 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
142 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
143 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
144 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
145 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
146 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
147 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
148 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
149 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
150 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
151 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
152 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
153 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
155 # in python2.3 (etc), these loop endlessly in sre_parser.py
156 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
157 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
158 'xz8')
159 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
160 'xza')
162 def test_qualified_re_sub(self):
163 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
164 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
166 def test_bug_114660(self):
167 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
168 'hello there')
170 def test_bug_462270(self):
171 # Test for empty sub() behaviour, see SF bug #462270
172 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
173 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
175 def test_symbolic_refs(self):
176 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
177 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
178 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
179 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
180 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
181 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
182 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
183 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
184 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
186 def test_re_subn(self):
187 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
188 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
189 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
190 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
191 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
193 def test_re_split(self):
194 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
195 self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
196 self.assertEqual(re.split("(:*)", ":a:b::c"),
197 ['', ':', 'a', ':', 'b', '::', 'c'])
198 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
199 self.assertEqual(re.split("(:)*", ":a:b::c"),
200 ['', ':', 'a', ':', 'b', ':', 'c'])
201 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
202 ['', ':', 'a', ':b::', 'c'])
203 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
204 ['', None, ':', 'a', None, ':', '', 'b', None, '',
205 None, '::', 'c'])
206 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
207 ['', 'a', '', '', 'c'])
209 def test_qualified_re_split(self):
210 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
211 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
212 self.assertEqual(re.split("(:)", ":a:b::c", 2),
213 ['', ':', 'a', ':', 'b::c'])
214 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
215 ['', ':', 'a', ':', 'b::c'])
217 def test_re_findall(self):
218 self.assertEqual(re.findall(":+", "abc"), [])
219 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
220 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
221 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
222 (":", ":"),
223 (":", "::")])
225 def test_bug_117612(self):
226 self.assertEqual(re.findall(r"(a|(b))", "aba"),
227 [("a", ""),("b", "b"),("a", "")])
229 def test_re_match(self):
230 self.assertEqual(re.match('a', 'a').groups(), ())
231 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
232 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
233 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
234 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
236 pat = re.compile('((a)|(b))(c)?')
237 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
238 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
239 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
240 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
241 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
243 # A single group
244 m = re.match('(a)', 'a')
245 self.assertEqual(m.group(0), 'a')
246 self.assertEqual(m.group(0), 'a')
247 self.assertEqual(m.group(1), 'a')
248 self.assertEqual(m.group(1, 1), ('a', 'a'))
250 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
251 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
252 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
253 (None, 'b', None))
254 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
256 def test_re_groupref_exists(self):
257 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
258 ('(', 'a'))
259 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
260 (None, 'a'))
261 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
262 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
263 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
264 ('a', 'b'))
265 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
266 (None, 'd'))
267 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
268 (None, 'd'))
269 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
270 ('a', ''))
272 # Tests for bug #1177831: exercise groups other than the first group
273 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
274 self.assertEqual(p.match('abc').groups(),
275 ('a', 'b', 'c'))
276 self.assertEqual(p.match('ad').groups(),
277 ('a', None, 'd'))
278 self.assertEqual(p.match('abd'), None)
279 self.assertEqual(p.match('ac'), None)
282 def test_re_groupref(self):
283 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
284 ('|', 'a'))
285 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
286 (None, 'a'))
287 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
288 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
289 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
290 ('a', 'a'))
291 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
292 (None, None))
294 def test_groupdict(self):
295 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
296 'first second').groupdict(),
297 {'first':'first', 'second':'second'})
299 def test_expand(self):
300 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
301 "first second")
302 .expand(r"\2 \1 \g<second> \g<first>"),
303 "second first second first")
305 def test_repeat_minmax(self):
306 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
307 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
308 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
309 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
311 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
312 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
313 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
314 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
315 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
316 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
317 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
318 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
320 self.assertEqual(re.match("^x{1}$", "xxx"), None)
321 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
322 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
323 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
325 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
326 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
327 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
328 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
329 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
330 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
331 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
332 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
334 self.assertEqual(re.match("^x{}$", "xxx"), None)
335 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
337 def test_getattr(self):
338 self.assertEqual(re.match("(a)", "a").pos, 0)
339 self.assertEqual(re.match("(a)", "a").endpos, 1)
340 self.assertEqual(re.match("(a)", "a").string, "a")
341 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
342 self.assertNotEqual(re.match("(a)", "a").re, None)
344 def test_special_escapes(self):
345 self.assertEqual(re.search(r"\b(b.)\b",
346 "abcd abc bcd bx").group(1), "bx")
347 self.assertEqual(re.search(r"\B(b.)\B",
348 "abc bcd bc abxd").group(1), "bx")
349 self.assertEqual(re.search(r"\b(b.)\b",
350 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
351 self.assertEqual(re.search(r"\B(b.)\B",
352 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
353 self.assertEqual(re.search(r"\b(b.)\b",
354 "abcd abc bcd bx", re.UNICODE).group(1), "bx")
355 self.assertEqual(re.search(r"\B(b.)\B",
356 "abc bcd bc abxd", re.UNICODE).group(1), "bx")
357 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
358 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
359 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
360 self.assertEqual(re.search(r"\b(b.)\b",
361 u"abcd abc bcd bx").group(1), "bx")
362 self.assertEqual(re.search(r"\B(b.)\B",
363 u"abc bcd bc abxd").group(1), "bx")
364 self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
365 self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
366 self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
367 self.assertEqual(re.search(r"\d\D\w\W\s\S",
368 "1aa! a").group(0), "1aa! a")
369 self.assertEqual(re.search(r"\d\D\w\W\s\S",
370 "1aa! a", re.LOCALE).group(0), "1aa! a")
371 self.assertEqual(re.search(r"\d\D\w\W\s\S",
372 "1aa! a", re.UNICODE).group(0), "1aa! a")
374 def test_bigcharset(self):
375 self.assertEqual(re.match(u"([\u2222\u2223])",
376 u"\u2222").group(1), u"\u2222")
377 self.assertEqual(re.match(u"([\u2222\u2223])",
378 u"\u2222", re.UNICODE).group(1), u"\u2222")
380 def test_anyall(self):
381 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
382 "a\nb")
383 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
384 "a\n\nb")
386 def test_non_consuming(self):
387 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
388 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
389 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
390 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
391 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
392 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
393 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
395 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
396 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
397 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
398 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
400 def test_ignore_case(self):
401 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
402 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
403 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
404 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
405 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
406 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
407 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
408 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
409 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
410 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
412 def test_category(self):
413 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
415 def test_getlower(self):
416 import _sre
417 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
418 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
419 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
421 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
422 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
424 def test_not_literal(self):
425 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
426 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
428 def test_search_coverage(self):
429 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
430 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
432 def test_re_escape(self):
433 p=""
434 for i in range(0, 256):
435 p = p + chr(i)
436 self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
437 True)
438 self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
440 pat=re.compile(re.escape(p))
441 self.assertEqual(pat.match(p) is not None, True)
442 self.assertEqual(pat.match(p).span(), (0,256))
444 def test_pickling(self):
445 import pickle
446 self.pickle_test(pickle)
447 import cPickle
448 self.pickle_test(cPickle)
449 # old pickles expect the _compile() reconstructor in sre module
450 import_module("sre", deprecated=True)
451 from sre import _compile
453 def pickle_test(self, pickle):
454 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
455 s = pickle.dumps(oldpat)
456 newpat = pickle.loads(s)
457 self.assertEqual(oldpat, newpat)
459 def test_constants(self):
460 self.assertEqual(re.I, re.IGNORECASE)
461 self.assertEqual(re.L, re.LOCALE)
462 self.assertEqual(re.M, re.MULTILINE)
463 self.assertEqual(re.S, re.DOTALL)
464 self.assertEqual(re.X, re.VERBOSE)
466 def test_flags(self):
467 for flag in [re.I, re.M, re.X, re.S, re.L]:
468 self.assertNotEqual(re.compile('^pattern$', flag), None)
470 def test_sre_character_literals(self):
471 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
472 self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
473 self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
474 self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
475 self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
476 self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
477 self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
478 self.assertRaises(re.error, re.match, "\911", "")
480 def test_sre_character_class_literals(self):
481 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
482 self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
483 self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
484 self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
485 self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
486 self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
487 self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
488 self.assertRaises(re.error, re.match, "[\911]", "")
490 def test_bug_113254(self):
491 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
492 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
493 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
495 def test_bug_527371(self):
496 # bug described in patches 527371/672491
497 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
498 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
499 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
500 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
501 self.assertEqual(re.match("((a))", "a").lastindex, 1)
503 def test_bug_545855(self):
504 # bug 545855 -- This pattern failed to cause a compile error as it
505 # should, instead provoking a TypeError.
506 self.assertRaises(re.error, re.compile, 'foo[a-')
508 def test_bug_418626(self):
509 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
510 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
511 # pattern '*?' on a long string.
512 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
513 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
514 20003)
515 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
516 # non-simple '*?' still used to hit the recursion limit, before the
517 # non-recursive scheme was implemented.
518 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
520 def test_bug_612074(self):
521 pat=u"["+re.escape(u"\u2039")+u"]"
522 self.assertEqual(re.compile(pat) and 1, 1)
524 def test_stack_overflow(self):
525 # nasty cases that used to overflow the straightforward recursive
526 # implementation of repeated groups.
527 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
528 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
529 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
531 def test_scanner(self):
532 def s_ident(scanner, token): return token
533 def s_operator(scanner, token): return "op%s" % token
534 def s_float(scanner, token): return float(token)
535 def s_int(scanner, token): return int(token)
537 scanner = Scanner([
538 (r"[a-zA-Z_]\w*", s_ident),
539 (r"\d+\.\d*", s_float),
540 (r"\d+", s_int),
541 (r"=|\+|-|\*|/", s_operator),
542 (r"\s+", None),
545 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
547 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
548 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
549 'op+', 'bar'], ''))
551 def test_bug_448951(self):
552 # bug 448951 (similar to 429357, but with single char match)
553 # (Also test greedy matches.)
554 for op in '','?','*':
555 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
556 (None, None))
557 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
558 ('a:', 'a'))
560 def test_bug_725106(self):
561 # capturing groups in alternatives in repeats
562 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
563 ('b', 'a'))
564 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
565 ('c', 'b'))
566 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
567 ('b', None))
568 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
569 ('b', None))
570 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
571 ('b', 'a'))
572 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
573 ('c', 'b'))
574 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
575 ('b', None))
576 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
577 ('b', None))
579 def test_bug_725149(self):
580 # mark_stack_base restoring before restoring marks
581 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
582 ('a', None))
583 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
584 ('a', None, None))
586 def test_bug_764548(self):
587 # bug 764548, re.compile() barfs on str/unicode subclasses
588 try:
589 unicode
590 except NameError:
591 return # no problem if we have no unicode
592 class my_unicode(unicode): pass
593 pat = re.compile(my_unicode("abc"))
594 self.assertEqual(pat.match("xyz"), None)
596 def test_finditer(self):
597 iter = re.finditer(r":+", "a:b::c:::d")
598 self.assertEqual([item.group(0) for item in iter],
599 [":", "::", ":::"])
601 def test_bug_926075(self):
602 try:
603 unicode
604 except NameError:
605 return # no problem if we have no unicode
606 self.assertTrue(re.compile('bug_926075') is not
607 re.compile(eval("u'bug_926075'")))
609 def test_bug_931848(self):
610 try:
611 unicode
612 except NameError:
613 pass
614 pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"')
615 self.assertEqual(re.compile(pattern).split("a.b.c"),
616 ['a','b','c'])
618 def test_bug_581080(self):
619 iter = re.finditer(r"\s", "a b")
620 self.assertEqual(iter.next().span(), (1,2))
621 self.assertRaises(StopIteration, iter.next)
623 scanner = re.compile(r"\s").scanner("a b")
624 self.assertEqual(scanner.search().span(), (1, 2))
625 self.assertEqual(scanner.search(), None)
627 def test_bug_817234(self):
628 iter = re.finditer(r".*", "asdf")
629 self.assertEqual(iter.next().span(), (0, 4))
630 self.assertEqual(iter.next().span(), (4, 4))
631 self.assertRaises(StopIteration, iter.next)
633 def test_bug_6561(self):
634 # '\d' should match characters in Unicode category 'Nd'
635 # (Number, Decimal Digit), but not those in 'Nl' (Number,
636 # Letter) or 'No' (Number, Other).
637 decimal_digits = [
638 u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
639 u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
640 u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
642 for x in decimal_digits:
643 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
645 not_decimal_digits = [
646 u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
647 u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
648 u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
649 u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
651 for x in not_decimal_digits:
652 self.assertIsNone(re.match('^\d$', x, re.UNICODE))
654 def test_empty_array(self):
655 # SF buf 1647541
656 import array
657 for typecode in 'cbBuhHiIlLfd':
658 a = array.array(typecode)
659 self.assertEqual(re.compile("bla").match(a), None)
660 self.assertEqual(re.compile("").match(a).groups(), ())
662 def test_inline_flags(self):
663 # Bug #1700
664 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
665 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
667 p = re.compile(upper_char, re.I | re.U)
668 q = p.match(lower_char)
669 self.assertNotEqual(q, None)
671 p = re.compile(lower_char, re.I | re.U)
672 q = p.match(upper_char)
673 self.assertNotEqual(q, None)
675 p = re.compile('(?i)' + upper_char, re.U)
676 q = p.match(lower_char)
677 self.assertNotEqual(q, None)
679 p = re.compile('(?i)' + lower_char, re.U)
680 q = p.match(upper_char)
681 self.assertNotEqual(q, None)
683 p = re.compile('(?iu)' + upper_char)
684 q = p.match(lower_char)
685 self.assertNotEqual(q, None)
687 p = re.compile('(?iu)' + lower_char)
688 q = p.match(upper_char)
689 self.assertNotEqual(q, None)
691 def test_dollar_matches_twice(self):
692 "$ matches the end of string, and just before the terminating \n"
693 pattern = re.compile('$')
694 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
695 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
696 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
698 pattern = re.compile('$', re.MULTILINE)
699 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
700 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
701 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
703 def test_dealloc(self):
704 # issue 3299: check for segfault in debug build
705 import _sre
706 # the overflow limit is different on wide and narrow builds and it
707 # depends on the definition of SRE_CODE (see sre.h).
708 # 2**128 should be big enough to overflow on both. For smaller values
709 # a RuntimeError is raised instead of OverflowError.
710 long_overflow = 2**128
711 self.assertRaises(TypeError, re.finditer, "a", {})
712 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
714 def run_re_tests():
715 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
716 if verbose:
717 print 'Running re_tests test suite'
718 else:
719 # To save time, only run the first and last 10 tests
720 #tests = tests[:10] + tests[-10:]
721 pass
723 for t in tests:
724 sys.stdout.flush()
725 pattern = s = outcome = repl = expected = None
726 if len(t) == 5:
727 pattern, s, outcome, repl, expected = t
728 elif len(t) == 3:
729 pattern, s, outcome = t
730 else:
731 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
733 try:
734 obj = re.compile(pattern)
735 except re.error:
736 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
737 else:
738 print '=== Syntax error:', t
739 except KeyboardInterrupt: raise KeyboardInterrupt
740 except:
741 print '*** Unexpected error ***', t
742 if verbose:
743 traceback.print_exc(file=sys.stdout)
744 else:
745 try:
746 result = obj.search(s)
747 except re.error, msg:
748 print '=== Unexpected exception', t, repr(msg)
749 if outcome == SYNTAX_ERROR:
750 # This should have been a syntax error; forget it.
751 pass
752 elif outcome == FAIL:
753 if result is None: pass # No match, as expected
754 else: print '=== Succeeded incorrectly', t
755 elif outcome == SUCCEED:
756 if result is not None:
757 # Matched, as expected, so now we compute the
758 # result string and compare it to our expected result.
759 start, end = result.span(0)
760 vardict={'found': result.group(0),
761 'groups': result.group(),
762 'flags': result.re.flags}
763 for i in range(1, 100):
764 try:
765 gi = result.group(i)
766 # Special hack because else the string concat fails:
767 if gi is None:
768 gi = "None"
769 except IndexError:
770 gi = "Error"
771 vardict['g%d' % i] = gi
772 for i in result.re.groupindex.keys():
773 try:
774 gi = result.group(i)
775 if gi is None:
776 gi = "None"
777 except IndexError:
778 gi = "Error"
779 vardict[i] = gi
780 repl = eval(repl, vardict)
781 if repl != expected:
782 print '=== grouping error', t,
783 print repr(repl) + ' should be ' + repr(expected)
784 else:
785 print '=== Failed incorrectly', t
787 # Try the match on a unicode string, and check that it
788 # still succeeds.
789 try:
790 result = obj.search(unicode(s, "latin-1"))
791 if result is None:
792 print '=== Fails on unicode match', t
793 except NameError:
794 continue # 1.5.2
795 except TypeError:
796 continue # unicode test case
798 # Try the match on a unicode pattern, and check that it
799 # still succeeds.
800 obj=re.compile(unicode(pattern, "latin-1"))
801 result = obj.search(s)
802 if result is None:
803 print '=== Fails on unicode pattern match', t
805 # Try the match with the search area limited to the extent
806 # of the match and see if it still succeeds. \B will
807 # break (because it won't match at the end or start of a
808 # string), so we'll ignore patterns that feature it.
810 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
811 and result is not None:
812 obj = re.compile(pattern)
813 result = obj.search(s, result.start(0), result.end(0) + 1)
814 if result is None:
815 print '=== Failed on range-limited match', t
817 # Try the match with IGNORECASE enabled, and check that it
818 # still succeeds.
819 obj = re.compile(pattern, re.IGNORECASE)
820 result = obj.search(s)
821 if result is None:
822 print '=== Fails on case-insensitive match', t
824 # Try the match with LOCALE enabled, and check that it
825 # still succeeds.
826 obj = re.compile(pattern, re.LOCALE)
827 result = obj.search(s)
828 if result is None:
829 print '=== Fails on locale-sensitive match', t
831 # Try the match with UNICODE locale enabled, and check
832 # that it still succeeds.
833 obj = re.compile(pattern, re.UNICODE)
834 result = obj.search(s)
835 if result is None:
836 print '=== Fails on unicode-sensitive match', t
def test_main():
    """Run the ReTests unittest class, then the table-driven re_tests."""
    run_unittest(ReTests)
    run_re_tests()


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()