move sections
[python/dscho.git] / Lib / test / test_codecs.py
blob4c020c3347b8bd259fe13fe61850c9104bc05292
1 from test import test_support
2 import unittest
3 import codecs
4 import sys, StringIO, _testcapi
class Queue(object):
    """
    FIFO byte queue: data written at one end is read back, in order,
    from the other end.
    """

    def __init__(self):
        # All pending, not-yet-read data.
        self._buffer = ""

    def write(self, chars):
        """Append *chars* to the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* pending characters.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            pending, self._buffer = self._buffer, ""
        else:
            pending, self._buffer = self._buffer[:size], self._buffer[size:]
        return pending
class ReadTest(unittest.TestCase):
    """Common stream-reader/incremental-decoder checks.

    Subclasses set the class attribute ``encoding``; every test here goes
    through codecs.getreader()/getwriter()/getincrementaldecoder() for
    that encoding.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        # readalllines() joins the lines with "|", so the expected values
        # must be "|"-joined as well (comparing against "".join can never
        # match once more than one line is involved).
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Iterating a StreamReader line by line must reproduce the input
        # lines exactly, even with \r\n line ends and long lines.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # readline() must not lose data when a line is longer than the
        # reader's internal read chunk.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same as above with several lines around the chunk boundary.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
class UTF32Test(ReadTest):
    """Tests for the BOM-aware utf-32 codec."""
    encoding = "utf-32"

    # u"spamspam" encoded with a single little/big endian BOM.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
class UTF32LETest(ReadTest):
    """Tests for the fixed-endianness utf-32-le codec (no BOM handling)."""
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
class UTF32BETest(ReadTest):
    """Tests for the fixed-endianness utf-32-be codec (no BOM handling)."""
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
class UTF16Test(ReadTest):
    """Tests for the BOM-aware utf-16 codec."""
    encoding = "utf-16"

    # u"spamspam" encoded with a single little/big endian BOM.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(test_support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            test_support.unlink(test_support.TESTFN)
class UTF16LETest(ReadTest):
    """Tests for the fixed-endianness utf-16-le codec (no BOM handling)."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
class UTF16BETest(ReadTest):
    """Tests for the fixed-endianness utf-16-be codec (no BOM handling)."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
class UTF8Test(ReadTest):
    """Partial-input behaviour of the utf-8 codec."""
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )
class UTF7Test(ReadTest):
    """Partial-input behaviour of the utf-7 codec.

    u"a+-b" encodes to "a+--b": the "+" character is escaped as "+-".
    """
    encoding = "utf-7"

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )
class UTF16ExTest(unittest.TestCase):
    """Direct tests of the codecs.utf_16_ex_decode helper."""

    def test_errors(self):
        # A lone byte is an incomplete UTF-16 code unit and must fail
        # under the strict error handler.
        decode = codecs.utf_16_ex_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any arguments is rejected outright.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode."""

    def test_array(self):
        # Any read-buffer (here an array of chars) is accepted.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument must both fail.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode."""

    def test_string(self):
        encoded = codecs.charbuffer_encode("spam")
        self.assertEquals(encoded, ("spam", 4))

    def test_empty(self):
        encoded = codecs.charbuffer_encode("")
        self.assertEquals(encoded, ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument must both fail.
        encode = codecs.charbuffer_encode
        self.assertRaises(TypeError, encode)
        self.assertRaises(TypeError, encode, 42)
class UTF8SigTest(ReadTest):
    """Tests for the utf-8-sig codec: a leading BOM is consumed once,
    any later BOM is passed through as u"\\ufeff"."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Reading a BOM-prefixed stream with various chunk sizes must
        # always drop the BOM and reproduce the payload.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom, but without a BOM: the stream must be
        # passed through unchanged for every chunk size.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode."""

    def test_empty(self):
        # Decoding the empty string yields an empty result and consumes
        # zero bytes.
        decoded, consumed = codecs.escape_decode("")
        self.assertEquals(decoded, "")
        self.assertEquals(consumed, 0)
class RecodingTest(unittest.TestCase):
    """Regression check: writing through EncodedFile must not crash."""

    def test_recoding(self):
        underlying = StringIO.StringIO()
        recoder = codecs.EncodedFile(underlying, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
687 # From RFC 3492
688 punycode_testcases = [
689 # A Arabic (Egyptian):
690 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
691 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
692 "egbpdaj6bu4bxfgehfvwxn"),
693 # B Chinese (simplified):
694 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
695 "ihqwcrb4cv8a8dqg056pqjye"),
696 # C Chinese (traditional):
697 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
698 "ihqwctvzc91f659drss3x8bo0yb"),
699 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
700 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
701 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
702 u"\u0065\u0073\u006B\u0079",
703 "Proprostnemluvesky-uyb24dma41a"),
704 # E Hebrew:
705 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
706 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
707 u"\u05D1\u05E8\u05D9\u05EA",
708 "4dbcagdahymbxekheh6e0a7fei0b"),
709 # F Hindi (Devanagari):
710 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
711 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
712 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
713 u"\u0939\u0948\u0902",
714 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
716 #(G) Japanese (kanji and hiragana):
717 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
718 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
719 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
721 # (H) Korean (Hangul syllables):
722 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
723 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
724 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
725 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
726 "psd879ccm6fea98c"),
728 # (I) Russian (Cyrillic):
729 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
730 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
731 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
732 u"\u0438",
733 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
735 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
736 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
737 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
738 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
739 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
740 u"\u0061\u00F1\u006F\u006C",
741 "PorqunopuedensimplementehablarenEspaol-fmd56a"),
743 # (K) Vietnamese:
744 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
745 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
746 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
747 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
748 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
749 u"\u0056\u0069\u1EC7\u0074",
750 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
752 #(L) 3<nen>B<gumi><kinpachi><sensei>
753 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
754 "3B-ww4c5e180e575a65lsy2b"),
756 # (M) <amuro><namie>-with-SUPER-MONKEYS
757 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
758 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
759 u"\u004F\u004E\u004B\u0045\u0059\u0053",
760 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
762 # (N) Hello-Another-Way-<sorezore><no><basho>
763 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
764 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
765 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
766 "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
768 # (O) <hitotsu><yane><no><shita>2
769 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
770 "2-u9tlzr9756bt3uc0v"),
772 # (P) Maji<de>Koi<suru>5<byou><mae>
773 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
774 u"\u308B\u0035\u79D2\u524D",
775 "MajiKoi5-783gue6qz075azm5e"),
777 # (Q) <pafii>de<runba>
778 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
779 "de-jg4avhby1noc0d"),
781 # (R) <sono><supiido><de>
782 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
783 "d9juau41awczczp"),
785 # (S) -> $1.00 <-
786 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
787 u"\u003C\u002D",
788 "-> $1.00 <--")
791 for i in punycode_testcases:
792 if len(i)!=2:
793 print repr(i)
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample vectors through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively: some of the reference encodings
            # use upper case while our encoder only produces lower case,
            # and lowering just puny would not be enough because some of
            # the *input* characters are upper case too.
            encoded = uni.encode("punycode")
            self.assertEquals(encoded.lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            decoded = puny.decode("punycode")
            self.assertEquals(uni, decoded)
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the unicode_internal codec (UCS-4 builds only for most)."""

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                # The test vectors are big-endian; flip them on LE builds.
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEquals(encoder(u"a")[1], 1)
        self.assertEquals(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEquals(encoder(r'\x00')[1], 4)
# Nameprep test vectors, UTF-8 encoded; a second element of None marks
# input that must be rejected, a first element of None marks a skipped case.
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
class NameprepTest(unittest.TestCase):
    """Run the libidn nameprep vectors against encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
                continue
            prepped = unicode(prepped, "utf-8")
            try:
                self.assertEquals(nameprep(orig), prepped)
            except Exception as e:
                raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
class IDNACodecTest(unittest.TestCase):
    """Exercise the 'idna' codec: builtin conversions, the stream reader,
    and the incremental encoder/decoder."""

    def test_builtin_decode(self):
        # ASCII names pass through unchanged; ACE labels are decoded.
        for raw, expected in (("python.org", u"python.org"),
                              ("python.org.", u"python.org."),
                              ("xn--pythn-mua.org", u"pyth\xf6n.org"),
                              ("xn--pythn-mua.org.", u"pyth\xf6n.org.")):
            self.assertEquals(unicode(raw, "idna"), expected)

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        # Plain str input is accepted as well.
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        reader = codecs.getreader("idna")(StringIO.StringIO("abc"))
        reader.read(3)
        # Everything was consumed; nothing may be left over.
        self.assertEquals(reader.read(), u"")

    def test_incremental_decode(self):
        # iterdecode() feeds the input one element at a time.
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org")
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org.")
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org.")
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org.")

        # A label is only emitted once its trailing dot has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam"), u"")
        self.assertEquals(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        # reset() must discard any buffered partial label.
        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam"), u"")
        self.assertEquals(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        # iterencode() feeds the input one element at a time.
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org")
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org.")
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org.")
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org.")

        # A label is only emitted once its trailing dot has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        # reset() must discard any buffered partial label.
        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")
class CodecsModuleTest(unittest.TestCase):
    """Smoke tests for the module-level codecs API functions."""

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        # Missing arguments are rejected.
        self.assertRaises(TypeError, codecs.decode)
        # Without an explicit encoding the default codec is used.
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        # Missing arguments and unknown codecs are rejected.
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        # register() requires exactly one callable argument.
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        # Unknown names, including whitespace-only ones, raise LookupError.
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
class StreamReaderTest(unittest.TestCase):
    """Line splitting through a codecs StreamReader."""

    def setUp(self):
        # UTF-8 bytes for u'\ud55c\n\uae00' (two Hangul syllables around
        # a newline).
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        decoded = self.reader(self.stream)
        self.assertEquals(decoded.readlines(), [u'\ud55c\n', u'\uae00'])
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        """EncodedFile recodes between file encoding and data encoding."""
        # Reading: UTF-8 bytes in the file come out recoded as UTF-16-LE.
        source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data written in is stored latin-1 encoded.
        sink = StringIO.StringIO()
        ef = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(sink.getvalue(), '\xfc')
class Str2StrTest(unittest.TestCase):
    """str-to-str codecs must hand back plain str objects, never unicode."""

    def _base64_reader(self):
        # One high byte, round-tripped through the base64 codec.
        encoded = "\x80".encode("base64_codec")
        return codecs.getreader("base64_codec")(StringIO.StringIO(encoded))

    def test_read(self):
        decoded = self._base64_reader().read()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)

    def test_readline(self):
        decoded = self._base64_reader().readline()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)
# Every codec that is expected to round-trip unicode text.  In this copy of
# the file the closing brackets of these list literals had been lost; they
# are restored here (the data itself is unchanged).
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" only exists on Windows builds.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal",
]
# A copy (not an alias): the two lists may diverge when the optional
# codecs below are appended to broken_unicode_with_streams.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib support is optional; test their codecs only when available.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
class BasicUnicodeTest(unittest.TestCase):
    # Generic round-trip checks applied to every codec listed in
    # all_unicode_encodings.

    def test_basics(self):
        """Encode/decode a simple ASCII string through every codec via the
        stateless API, stream reader/writer, incremental coders (both the
        Python and the C API) and iterencode()/iterdecode()."""
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # The canonical name drops the "_codec" suffix and spells
            # latin_1 as iso8859-1; undo that so the names can be compared
            # modulo hyphen/underscore spelling.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip: encode, then decode, must reproduce s.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Feed the writer one character at a time, draining the
                # queue after every write, then feed the result back to a
                # reader byte by byte.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    # One character per call; final=True flushes any state.
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
                    # check C API
                    # Same dance through the _testcapi-created coders.
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))
                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")
            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0) on a StreamReader must reset codec state and buffers."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders must raise TypeError on missing or non-string input
        (idna/punycode accept ints, hence the exclusion)."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders must raise TypeError when called without arguments."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
class BasicStrTest(unittest.TestCase):
    """Round-trip sanity check for the str-only codecs."""

    def test_basics(self):
        payload = "abc123"
        for encoding in all_string_encodings:
            # Encoding must consume the whole input...
            (encoded, consumed) = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            # ...and decoding must restore it exactly.
            (restored, consumed) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(restored, payload, "%r != %r (encoding=%r)" % (restored, payload, encoding))
class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        """charmap_decode() with a unicode string as the mapping table."""
        decode = codecs.charmap_decode
        self.assertEquals(decode("\x00\x01\x02", "strict", u"abc"),
                          (u"abc", 3))

        # Bytes beyond the map are replaced with U+FFFD under "replace"...
        self.assertEquals(decode("\x00\x01\x02", "replace", u"ab"),
                          (u"ab\ufffd", 3))
        # ...and a U+FFFE map entry marks that position as undefined, too.
        self.assertEquals(decode("\x00\x01\x02", "replace", u"ab\ufffe"),
                          (u"ab\ufffd", 3))

        # "ignore" silently drops unmapped bytes.
        self.assertEquals(decode("\x00\x01\x02", "ignore", u"ab"),
                          (u"ab", 3))
        self.assertEquals(decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
                          (u"ab", 3))

        # An empty map with "ignore" consumes everything, yielding nothing.
        allbytes = "".join(map(chr, xrange(256)))
        self.assertEquals(decode(allbytes, "ignore", u""),
                          (u"", len(allbytes)))
class WithStmtTest(unittest.TestCase):
    """codecs file objects must be usable as context managers."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        srw = codecs.StreamReaderWriter(raw, info.streamreader,
                                        info.streamwriter, 'strict')
        with srw:
            self.assertEquals(srw.read(), u"\xfc")
class BomTest(unittest.TestCase):
    def test_seek0(self):
        """The BOM must be emitted exactly once, and only at stream start."""
        payload = u"1234567890"
        encodings = ("utf-16", "utf-16-le", "utf-16-be",
                     "utf-32", "utf-32-le", "utf-32-be")
        for encoding in encodings:
            # Writing twice in a row must not duplicate the BOM.
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload)
                f.write(payload)
                f.seek(0)
                self.assertEquals(f.read(), payload * 2)
                f.seek(0)
                self.assertEquals(f.read(), payload * 2)

            # After seek(0) the rewrite still yields exactly one BOM.
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload[0])
                self.assertNotEquals(f.tell(), 0)
                f.seek(0)
                f.write(payload)
                f.seek(0)
                self.assertEquals(f.read(), payload)

            # Same check, driving the underlying StreamWriter directly.
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(payload[0])
                self.assertNotEquals(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(payload)
                f.seek(0)
                self.assertEquals(f.read(), payload)

            # A seek() to a non-zero position must not re-emit the BOM.
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload)
                f.seek(f.tell())
                f.write(payload)
                f.seek(0)
                self.assertEquals(f.read(), payload * 2)

            # Likewise when seeking on the StreamWriter itself.
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(payload)
                f.writer.seek(f.writer.tell())
                f.writer.write(payload)
                f.seek(0)
                self.assertEquals(f.read(), payload * 2)
def test_main():
    """Run every test case defined in this module."""
    test_support.run_unittest(
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest, UTF7Test, UTF16ExTest,
        ReadBufferTest, CharBufferTest,
        EscapeDecodeTest, RecodingTest, PunycodeTest,
        UnicodeInternalTest, NameprepTest, IDNACodecTest,
        CodecsModuleTest, StreamReaderTest, EncodedFileTest,
        Str2StrTest, BasicUnicodeTest, BasicStrTest,
        CharmapTest, WithStmtTest, BomTest,
    )


if __name__ == "__main__":
    test_main()