Issue #6615: logging: Used weak references in internal handler list. Thanks to flox...
[python.git] / Lib / test / test_multibytecodec.py
blobf23e6e0bb2e99a86d7b4801e6bb76beac9e2744d
#!/usr/bin/env python
# test_multibytecodec.py
# Unit test for multibytecodec itself
7 from test import test_support
8 from test import test_multibytecodec_support
9 from test.test_support import TESTFN
10 import unittest, StringIO, codecs, sys, os
11 import _multibytecodec
# Names of every CJK codec exercised by the tests in this module, grouped
# by the C extension module that implements them.
ALL_CJKENCODINGS = [
    # _codecs_cn
    'gb2312', 'gbk', 'gb18030', 'hz',
    # _codecs_hk
    'big5hkscs',
    # _codecs_jp
    'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
    'euc_jis_2004', 'shift_jis_2004',
    # _codecs_kr
    'cp949', 'euc_kr', 'johab',
    # _codecs_tw
    'big5', 'cp950',
    # _codecs_iso2022
    'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
]
class Test_MultibyteCodec(unittest.TestCase):
    # Checks applied to every codec in ALL_CJKENCODINGS, plus regression
    # tests that call into the _multibytecodec module directly.

    def test_nullcoding(self):
        # Empty input must map to empty output in both directions.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(''.decode(enc), u'')
            self.assertEqual(unicode('', enc), u'')
            self.assertEqual(u''.encode(enc), '')

    def test_str_decode(self):
        # Calling encode() on a byte string: plain ASCII must come back
        # unchanged for every codec.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual('abcd'.encode(enc), 'abcd')

    def test_errorcallback_longindex(self):
        # An error handler that returns a resume position of sys.maxint+1
        # must produce IndexError.
        dec = codecs.getdecoder('euc-kr')
        myreplace = lambda exc: (u'', sys.maxint+1)
        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          'apple\x92ham\x93spam', 'test.cjktest')

    def test_codingspec(self):
        # A source file whose only content is a coding declaration must be
        # executable for every CJK encoding.
        try:
            for enc in ALL_CJKENCODINGS:
                # Close the writer explicitly so the declaration is on disk
                # before the file is read back (no reliance on refcounting).
                with open(TESTFN, 'w') as script:
                    script.write('# coding: %s\n' % enc)
                with open(TESTFN) as script:
                    # Parenthesised form of the Python 2 exec statement;
                    # executes the open file object, as before.
                    exec(script)
        finally:
            # The file may not exist if the very first open() failed; do
            # not let cleanup hide that original error.
            if os.path.exists(TESTFN):
                os.unlink(TESTFN)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)
65 class Test_IncrementalEncoder(unittest.TestCase):
67 def test_stateless(self):
68 # cp949 encoder isn't stateful at all.
69 encoder = codecs.getincrementalencoder('cp949')()
70 self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
71 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
72 self.assertEqual(encoder.reset(), None)
73 self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
74 '\xa1\xd9\xa1\xad\xa1\xd9')
75 self.assertEqual(encoder.reset(), None)
76 self.assertEqual(encoder.encode(u'', True), '')
77 self.assertEqual(encoder.encode(u'', False), '')
78 self.assertEqual(encoder.reset(), None)
80 def test_stateful(self):
81 # jisx0213 encoder is stateful for a few codepoints. eg)
82 # U+00E6 => A9DC
83 # U+00E6 U+0300 => ABC4
84 # U+0300 => ABDC
86 encoder = codecs.getincrementalencoder('jisx0213')()
87 self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
88 self.assertEqual(encoder.encode(u'\u00e6'), '')
89 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
90 self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
92 self.assertEqual(encoder.reset(), None)
93 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
95 self.assertEqual(encoder.encode(u'\u00e6'), '')
96 self.assertEqual(encoder.encode('', True), '\xa9\xdc')
97 self.assertEqual(encoder.encode('', True), '')
99 def test_stateful_keep_buffer(self):
100 encoder = codecs.getincrementalencoder('jisx0213')()
101 self.assertEqual(encoder.encode(u'\u00e6'), '')
102 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
103 self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
104 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
105 self.assertEqual(encoder.reset(), None)
106 self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
107 self.assertEqual(encoder.encode(u'\u00e6'), '')
108 self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
109 self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
112 class Test_IncrementalDecoder(unittest.TestCase):
114 def test_dbcs(self):
115 # cp949 decoder is simple with only 1 or 2 bytes sequences.
116 decoder = codecs.getincrementaldecoder('cp949')()
117 self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
118 u'\ud30c\uc774')
119 self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
120 u'\uc36c \ub9c8\uc744')
121 self.assertEqual(decoder.decode(''), u'')
123 def test_dbcs_keep_buffer(self):
124 decoder = codecs.getincrementaldecoder('cp949')()
125 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
126 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
127 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
129 self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
130 self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
131 self.assertEqual(decoder.decode('\xcc'), u'\uc774')
133 def test_iso2022(self):
134 decoder = codecs.getincrementaldecoder('iso2022-jp')()
135 ESC = '\x1b'
136 self.assertEqual(decoder.decode(ESC + '('), u'')
137 self.assertEqual(decoder.decode('B', True), u'')
138 self.assertEqual(decoder.decode(ESC + '$'), u'')
139 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
140 self.assertEqual(decoder.decode('@$@'), u'\u4e16')
141 self.assertEqual(decoder.decode('$', True), u'\u4e16')
142 self.assertEqual(decoder.reset(), None)
143 self.assertEqual(decoder.decode('@$'), u'@$')
144 self.assertEqual(decoder.decode(ESC + '$'), u'')
145 self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
146 self.assertEqual(decoder.decode('B@$'), u'\u4e16')
class Test_StreamReader(unittest.TestCase):
    # Regression test for the cp949 stream reader.

    def test_bug1728403(self):
        # Reading a file that ends in a lone lead byte must raise
        # UnicodeDecodeError.
        try:
            # Binary mode: the payload is a raw byte, not text.  Closing
            # the writer explicitly guarantees the byte is on disk before
            # the file is reopened.
            with open(TESTFN, 'wb') as raw:
                raw.write('\xa1')
            f = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
            finally:
                # Only reached once codecs.open() succeeded, so there is
                # no NameError to swallow with a bare except.
                f.close()
        finally:
            # The file may not exist if creating it failed; do not let
            # cleanup hide that original error.
            if os.path.exists(TESTFN):
                os.unlink(TESTFN)
159 class Test_StreamWriter(unittest.TestCase):
160 if len(u'\U00012345') == 2: # UCS2
161 def test_gb18030(self):
162 s = StringIO.StringIO()
163 c = codecs.getwriter('gb18030')(s)
164 c.write(u'123')
165 self.assertEqual(s.getvalue(), '123')
166 c.write(u'\U00012345')
167 self.assertEqual(s.getvalue(), '123\x907\x959')
168 c.write(u'\U00012345'[0])
169 self.assertEqual(s.getvalue(), '123\x907\x959')
170 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
171 self.assertEqual(s.getvalue(),
172 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
173 c.write(u'\U00012345'[0])
174 self.assertEqual(s.getvalue(),
175 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
176 self.assertRaises(UnicodeError, c.reset)
177 self.assertEqual(s.getvalue(),
178 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
180 def test_utf_8(self):
181 s= StringIO.StringIO()
182 c = codecs.getwriter('utf-8')(s)
183 c.write(u'123')
184 self.assertEqual(s.getvalue(), '123')
185 c.write(u'\U00012345')
186 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
188 # Python utf-8 codec can't buffer surrogate pairs yet.
189 if 0:
190 c.write(u'\U00012345'[0])
191 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
192 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
193 self.assertEqual(s.getvalue(),
194 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
195 '\xea\xb0\x80\xc2\xac')
196 c.write(u'\U00012345'[0])
197 self.assertEqual(s.getvalue(),
198 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
199 '\xea\xb0\x80\xc2\xac')
200 c.reset()
201 self.assertEqual(s.getvalue(),
202 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
203 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
204 c.write(u'\U00012345'[1])
205 self.assertEqual(s.getvalue(),
206 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
207 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
209 else: # UCS4
210 pass
212 def test_streamwriter_strwrite(self):
213 s = StringIO.StringIO()
214 wr = codecs.getwriter('gb18030')(s)
215 wr.write('abcd')
216 self.assertEqual(s.getvalue(), 'abcd')
218 class Test_ISO2022(unittest.TestCase):
219 def test_g2(self):
220 iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
221 uni = u':hu4:unit\xe9 de famille'
222 self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)
224 def test_iso2022_jp_g0(self):
225 self.assertFalse('\x0e' in u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
226 for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
227 e = u'\u3406'.encode(encoding)
228 self.assertFalse(filter(lambda x: x >= '\x80', e))
230 def test_bug1572832(self):
231 if sys.maxunicode >= 0x10000:
232 myunichr = unichr
233 else:
234 myunichr = lambda x: unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))
236 for x in xrange(0x10000, 0x110000):
237 # Any ISO 2022 codec will cause the segfault
238 myunichr(x).encode('iso_2022_jp', 'ignore')
def test_main():
    # Hand this module's name to test_support.run_unittest().
    this_module = __name__
    test_support.run_unittest(this_module)
# Run the tests when this file is executed as a script.
if "__main__" == __name__:
    test_main()