# Lib/test/test_codecs.py -- unit tests for the codecs module
# (extracted from python.git, blob ded5d1917a62db1c7f29edd7c995a4961ee141db;
# the original web-viewer header lines were removed because they are not Python)
1 from test import test_support
2 import unittest
3 import codecs
4 import sys, StringIO
class Queue(object):
    """
    FIFO byte queue: write bytes at one end, read bytes from the other end.
    """
    def __init__(self):
        # Data written but not yet read.
        self._buffer = ""

    def write(self, chars):
        # Append new data at the tail of the queue.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size means "drain everything currently buffered".
        if size < 0:
            pending, self._buffer = self._buffer, ""
            return pending
        pending, self._buffer = self._buffer[:size], self._buffer[size:]
        return pending
class ReadTest(unittest.TestCase):
    """Base class for StreamReader tests.

    Subclasses must provide a class attribute ``encoding``; every test
    decodes through ``codecs.getreader(self.encoding)``.
    (Fix: closing delimiters of several call/list literals were missing.)
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

    def test_readline(self):
        def getreader(input):
            # Wrap the encoded input in a StreamReader.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True):
            # Read the whole stream line by line and rejoin it.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        self.assertEqual(readalllines(s, True), s)
        self.assertEqual(readalllines(s, False), u"foobarbazspameggs")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # NOTE(review): u"\3042" is an octal escape plus the char "2",
            # not U+3042 -- presumably u"\u3042" was intended, but the test
            # is self-consistent either way, so it is left unchanged.
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating a StreamReader over realistic \r\n-terminated data must
        # reproduce the original lines exactly.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a reader whose underlying stream grows incrementally.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # readline() must not lose data when an internal read chunk ends
        # in the middle of a long line.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Variant of the above with several consecutive lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
class UTF16Test(ReadTest):
    """StreamReader/StreamWriter tests for the BOM-switching utf-16 codec.
    (Fix: the bracket/paren closers of the check_partial() call were missing.)
    """
    encoding = "utf-16"

    # "spamspam" encoded once with a little-endian and once with a
    # big-endian BOM prefix.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        # A stream consisting only of an invalid BOM must raise.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # Decoded prefix expected after each single byte is fed in.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # A lone byte is an incomplete UTF-16 unit in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
class UTF16LETest(ReadTest):
    """ReadTest cases for the fixed-endian utf-16-le codec.
    (Fix: missing brackets of the check_partial() call restored.)
    """
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM: the first decoded character appears after two bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
class UTF16BETest(ReadTest):
    """ReadTest cases for the fixed-endian utf-16-be codec.
    (Fix: missing brackets of the check_partial() call restored.)
    """
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM: the first decoded character appears after two bytes.
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
class UTF8Test(ReadTest):
    """ReadTest cases for the utf-8 codec.
    (Fix: missing brackets of the check_partial() call restored.)
    """
    encoding = "utf-8"

    def test_partial(self):
        # Multi-byte sequences only produce output once complete.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
class UTF7Test(ReadTest):
    # Reuses the generic ReadTest cases for the utf-7 codec.
    encoding = "utf-7"

    # No test_partial() yet, because UTF-7 doesn't support it.
334 class UTF16ExTest(unittest.TestCase):
336 def test_errors(self):
337 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
339 def test_bad_args(self):
340 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode(), which accepts any object
    supporting the (read) buffer interface.
    (Fix: the closing paren of the assertEqual() call was missing.)
    """

    def test_array(self):
        import array
        # A char array exposes the buffer interface and round-trips as str.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing and non-buffer arguments are rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode() (character buffer interface)."""

    def test_string(self):
        # A plain string passes through unchanged, with its length reported.
        result = codecs.charbuffer_encode("spam")
        self.assertEqual(result, ("spam", 4))

    def test_empty(self):
        result = codecs.charbuffer_encode("")
        self.assertEqual(result, ("", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are rejected.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
class UTF8SigTest(ReadTest):
    """ReadTest cases for utf-8-sig (UTF-8 with BOM signature handling).
    (Fix: missing brackets of the check_partial() call restored.)
    """
    encoding = "utf-8-sig"

    def test_partial(self):
        # The first BOM is consumed silently; a second one is real data.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )
class EscapeDecodeTest(unittest.TestCase):
    """Minimal sanity check for codecs.escape_decode()."""

    def test_empty(self):
        # Decoding the empty string yields an empty result, zero consumed.
        outcome = codecs.escape_decode("")
        self.assertEquals(outcome, ("", 0))
class RecodingTest(unittest.TestCase):
    """Regression test: recoding through EncodedFile must not crash."""

    def test_recoding(self):
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
# From RFC 3492
# (unicode, punycode) sample pairs from the RFC's appendix.
# (Fix: the closing "]" of the list literal was missing.)
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),
    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),
    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),
    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),
    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),
    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),
    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),
    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
# Sanity check at import time: every entry must be a 2-tuple; anything
# else is printed so a broken table is noticed immediately.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            encoded = uni.encode("punycode")
            self.assertEquals(encoded.lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            decoded = puny.decode("punycode")
            self.assertEquals(uni, decoded)
532 class UnicodeInternalTest(unittest.TestCase):
533 def test_bug1251300(self):
534 # Decoding with unicode_internal used to not correctly handle "code
535 # points" above 0x10ffff on UCS-4 builds.
536 if sys.maxunicode > 0xffff:
537 ok = [
538 ("\x00\x10\xff\xff", u"\U0010ffff"),
539 ("\x00\x00\x01\x01", u"\U00000101"),
540 ("", u""),
542 not_ok = [
543 "\x7f\xff\xff\xff",
544 "\x80\x00\x00\x00",
545 "\x81\x00\x00\x00",
546 "\x00",
547 "\x00\x00\x00\x00\x00",
549 for internal, uni in ok:
550 if sys.byteorder == "little":
551 internal = "".join(reversed(internal))
552 self.assertEquals(uni, internal.decode("unicode_internal"))
553 for internal in not_ok:
554 if sys.byteorder == "little":
555 internal = "".join(reversed(internal))
556 self.assertRaises(UnicodeDecodeError, internal.decode,
557 "unicode_internal")
559 def test_decode_error_attributes(self):
560 if sys.maxunicode > 0xffff:
561 try:
562 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
563 except UnicodeDecodeError, ex:
564 self.assertEquals("unicode_internal", ex.encoding)
565 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
566 self.assertEquals(4, ex.start)
567 self.assertEquals(8, ex.end)
568 else:
569 self.fail()
571 def test_decode_callback(self):
572 if sys.maxunicode > 0xffff:
573 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
574 decoder = codecs.getdecoder("unicode_internal")
575 ab = u"ab".encode("unicode_internal")
576 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
577 "UnicodeInternalTest")
578 self.assertEquals((u"ab", 12), ignored)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# (input, expected) nameprep vectors; both sides are UTF-8 byte strings,
# expected None means the input must be rejected, an entry of
# (None, None) marks a skipped vector.
# (Fix: the closing "]" of the list literal was missing.)
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
class NameprepTest(unittest.TestCase):
    # Runs the libidn nameprep vectors (see nameprep_tests) against
    # encodings.idna.nameprep.
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    # Re-raise with the vector number (3.<pos+1>) for context.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
class CodecTest(unittest.TestCase):
    """Basic checks for the idna codec."""

    def test_builtin(self):
        # An all-ASCII hostname maps to itself.
        decoded = unicode("python.org", "idna")
        self.assertEquals(decoded, u"python.org")

    def test_stream(self):
        import StringIO
        stream = StringIO.StringIO("abc")
        r = codecs.getreader("idna")(stream)
        r.read(3)
        # Everything was consumed by the sized read above.
        self.assertEquals(r.read(), u"")
class CodecsModuleTest(unittest.TestCase):
    # Smoke tests for the module-level codecs API: encode/decode,
    # register/lookup and the get* accessor functions.

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
            u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the interpreter default encoding is used.
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
            '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the interpreter default encoding is used.
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # register() only accepts callables.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
class StreamReaderTest(unittest.TestCase):
    # StreamReader.readlines() over a multi-byte UTF-8 stream.

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two Hangul syllables separated by a newline, encoded as UTF-8.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
class Str2StrTest(unittest.TestCase):
    # str-to-str codecs (here base64_codec) must yield str, not unicode,
    # from StreamReader.read()/readline().

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))
# Every unicode codec exercised by BasicUnicodeTest below.
# (Fix: the closing "]" of the list literal was missing.)
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# mbcs is only tested when the interpreter provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
# The following encodings work only with str, not unicode
# (exercised by BasicStrTest below).
# (Fix: the closing "]" of the list literal was missing.)
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"
# The following encodings don't work in stateful mode
# (BasicUnicodeTest skips the stream reader/writer checks for them).
# (Fix: the closing "]" of the list literal was missing.)
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal",
]
# bz2 and zlib are optional build features; their codecs are only added
# to the test lists when the underlying module is importable.  Both are
# stateful-broken, so they also go on the broken list.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
class BasicUnicodeTest(unittest.TestCase):
    # Round-trip checks for every codec listed in all_unicode_encodings.

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            # unicode_internal reports the byte count, not the char count.
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                # Feed the encoded bytes back one at a time.
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna/punycode accept ints via str(); skip them here.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)
class BasicStrTest(unittest.TestCase):
    # Round-trip checks for the str-only codecs in all_string_encodings.

    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with a unicode-string mapping.
    (Fix: the closing parens of the assertEquals() calls were missing.)
    """

    def test_decode_with_string_map(self):
        # In-range bytes index into the map string.
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        # Out-of-range byte with "replace" yields U+FFFD.
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        # U+FFFE in the map marks an undefined entry.
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        # "ignore" drops unmapped bytes entirely.
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # An empty map plus "ignore" consumes everything, emits nothing.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )
def test_main():
    """Run the whole suite through regrtest's run_unittest helper.
    (Fix: the closing paren of the run_unittest() call was missing.)
    """
    test_support.run_unittest(
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        CodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
    )
# Allow running this test file directly.
if __name__ == "__main__":
    test_main()