# test_multibytecodec.py
#   Unit test for multibytecodec itself
7 from test
import test_support
8 from test
import test_multibytecodec_support
9 from test
.test_support
import TESTFN
10 import unittest
, StringIO
, codecs
, sys
, os
11 import _multibytecodec
# Names of the CJK multibyte codecs that the per-encoding tests iterate over,
# grouped by language family.
ALL_CJKENCODINGS = [
    # Simplified Chinese
    'gb2312', 'gbk', 'gb18030', 'hz',
    # Traditional Chinese (Hong Kong)
    'big5hkscs',
    # Japanese
    'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
    'euc_jis_2004', 'shift_jis_2004',
    # Korean
    'cp949', 'euc_kr', 'johab',
    # Traditional Chinese (Taiwan)
    'big5', 'cp950',
    # ISO-2022 family
    'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
]
30 class Test_MultibyteCodec(unittest
.TestCase
):
32 def test_nullcoding(self
):
33 for enc
in ALL_CJKENCODINGS
:
34 self
.assertEqual(''.decode(enc
), u
'')
35 self
.assertEqual(unicode('', enc
), u
'')
36 self
.assertEqual(u
''.encode(enc
), '')
38 def test_str_decode(self
):
39 for enc
in ALL_CJKENCODINGS
:
40 self
.assertEqual('abcd'.encode(enc
), 'abcd')
42 def test_errorcallback_longindex(self
):
43 dec
= codecs
.getdecoder('euc-kr')
44 myreplace
= lambda exc
: (u
'', sys
.maxint
+1)
45 codecs
.register_error('test.cjktest', myreplace
)
46 self
.assertRaises(IndexError, dec
,
47 'apple\x92ham\x93spam', 'test.cjktest')
49 def test_codingspec(self
):
51 for enc
in ALL_CJKENCODINGS
:
52 print >> open(TESTFN
, 'w'), '# coding:', enc
57 def test_init_segfault(self
):
58 # bug #3305: this used to segfault
59 self
.assertRaises(AttributeError,
60 _multibytecodec
.MultibyteStreamReader
, None)
61 self
.assertRaises(AttributeError,
62 _multibytecodec
.MultibyteStreamWriter
, None)
65 class Test_IncrementalEncoder(unittest
.TestCase
):
67 def test_stateless(self
):
68 # cp949 encoder isn't stateful at all.
69 encoder
= codecs
.getincrementalencoder('cp949')()
70 self
.assertEqual(encoder
.encode(u
'\ud30c\uc774\uc36c \ub9c8\uc744'),
71 '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
72 self
.assertEqual(encoder
.reset(), None)
73 self
.assertEqual(encoder
.encode(u
'\u2606\u223c\u2606', True),
74 '\xa1\xd9\xa1\xad\xa1\xd9')
75 self
.assertEqual(encoder
.reset(), None)
76 self
.assertEqual(encoder
.encode(u
'', True), '')
77 self
.assertEqual(encoder
.encode(u
'', False), '')
78 self
.assertEqual(encoder
.reset(), None)
80 def test_stateful(self
):
81 # jisx0213 encoder is stateful for a few codepoints. eg)
83 # U+00E6 U+0300 => ABC4
86 encoder
= codecs
.getincrementalencoder('jisx0213')()
87 self
.assertEqual(encoder
.encode(u
'\u00e6\u0300'), '\xab\xc4')
88 self
.assertEqual(encoder
.encode(u
'\u00e6'), '')
89 self
.assertEqual(encoder
.encode(u
'\u0300'), '\xab\xc4')
90 self
.assertEqual(encoder
.encode(u
'\u00e6', True), '\xa9\xdc')
92 self
.assertEqual(encoder
.reset(), None)
93 self
.assertEqual(encoder
.encode(u
'\u0300'), '\xab\xdc')
95 self
.assertEqual(encoder
.encode(u
'\u00e6'), '')
96 self
.assertEqual(encoder
.encode('', True), '\xa9\xdc')
97 self
.assertEqual(encoder
.encode('', True), '')
99 def test_stateful_keep_buffer(self
):
100 encoder
= codecs
.getincrementalencoder('jisx0213')()
101 self
.assertEqual(encoder
.encode(u
'\u00e6'), '')
102 self
.assertRaises(UnicodeEncodeError, encoder
.encode
, u
'\u0123')
103 self
.assertEqual(encoder
.encode(u
'\u0300\u00e6'), '\xab\xc4')
104 self
.assertRaises(UnicodeEncodeError, encoder
.encode
, u
'\u0123')
105 self
.assertEqual(encoder
.reset(), None)
106 self
.assertEqual(encoder
.encode(u
'\u0300'), '\xab\xdc')
107 self
.assertEqual(encoder
.encode(u
'\u00e6'), '')
108 self
.assertRaises(UnicodeEncodeError, encoder
.encode
, u
'\u0123')
109 self
.assertEqual(encoder
.encode(u
'', True), '\xa9\xdc')
112 class Test_IncrementalDecoder(unittest
.TestCase
):
115 # cp949 decoder is simple with only 1 or 2 bytes sequences.
116 decoder
= codecs
.getincrementaldecoder('cp949')()
117 self
.assertEqual(decoder
.decode('\xc6\xc4\xc0\xcc\xbd'),
119 self
.assertEqual(decoder
.decode('\xe3 \xb8\xb6\xc0\xbb'),
120 u
'\uc36c \ub9c8\uc744')
121 self
.assertEqual(decoder
.decode(''), u
'')
123 def test_dbcs_keep_buffer(self
):
124 decoder
= codecs
.getincrementaldecoder('cp949')()
125 self
.assertEqual(decoder
.decode('\xc6\xc4\xc0'), u
'\ud30c')
126 self
.assertRaises(UnicodeDecodeError, decoder
.decode
, '', True)
127 self
.assertEqual(decoder
.decode('\xcc'), u
'\uc774')
129 self
.assertEqual(decoder
.decode('\xc6\xc4\xc0'), u
'\ud30c')
130 self
.assertRaises(UnicodeDecodeError, decoder
.decode
, '\xcc\xbd', True)
131 self
.assertEqual(decoder
.decode('\xcc'), u
'\uc774')
133 def test_iso2022(self
):
134 decoder
= codecs
.getincrementaldecoder('iso2022-jp')()
136 self
.assertEqual(decoder
.decode(ESC
+ '('), u
'')
137 self
.assertEqual(decoder
.decode('B', True), u
'')
138 self
.assertEqual(decoder
.decode(ESC
+ '$'), u
'')
139 self
.assertEqual(decoder
.decode('B@$'), u
'\u4e16')
140 self
.assertEqual(decoder
.decode('@$@'), u
'\u4e16')
141 self
.assertEqual(decoder
.decode('$', True), u
'\u4e16')
142 self
.assertEqual(decoder
.reset(), None)
143 self
.assertEqual(decoder
.decode('@$'), u
'@$')
144 self
.assertEqual(decoder
.decode(ESC
+ '$'), u
'')
145 self
.assertRaises(UnicodeDecodeError, decoder
.decode
, '', True)
146 self
.assertEqual(decoder
.decode('B@$'), u
'\u4e16')
148 class Test_StreamReader(unittest
.TestCase
):
149 def test_bug1728403(self
):
151 open(TESTFN
, 'w').write('\xa1')
152 f
= codecs
.open(TESTFN
, encoding
='cp949')
153 self
.assertRaises(UnicodeDecodeError, f
.read
, 2)
159 class Test_StreamWriter(unittest
.TestCase
):
160 if len(u
'\U00012345') == 2: # UCS2
161 def test_gb18030(self
):
162 s
= StringIO
.StringIO()
163 c
= codecs
.getwriter('gb18030')(s
)
165 self
.assertEqual(s
.getvalue(), '123')
166 c
.write(u
'\U00012345')
167 self
.assertEqual(s
.getvalue(), '123\x907\x959')
168 c
.write(u
'\U00012345'[0])
169 self
.assertEqual(s
.getvalue(), '123\x907\x959')
170 c
.write(u
'\U00012345'[1] + u
'\U00012345' + u
'\uac00\u00ac')
171 self
.assertEqual(s
.getvalue(),
172 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
173 c
.write(u
'\U00012345'[0])
174 self
.assertEqual(s
.getvalue(),
175 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
176 self
.assertRaises(UnicodeError, c
.reset
)
177 self
.assertEqual(s
.getvalue(),
178 '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
180 def test_utf_8(self
):
181 s
= StringIO
.StringIO()
182 c
= codecs
.getwriter('utf-8')(s
)
184 self
.assertEqual(s
.getvalue(), '123')
185 c
.write(u
'\U00012345')
186 self
.assertEqual(s
.getvalue(), '123\xf0\x92\x8d\x85')
188 # Python utf-8 codec can't buffer surrogate pairs yet.
190 c
.write(u
'\U00012345'[0])
191 self
.assertEqual(s
.getvalue(), '123\xf0\x92\x8d\x85')
192 c
.write(u
'\U00012345'[1] + u
'\U00012345' + u
'\uac00\u00ac')
193 self
.assertEqual(s
.getvalue(),
194 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
195 '\xea\xb0\x80\xc2\xac')
196 c
.write(u
'\U00012345'[0])
197 self
.assertEqual(s
.getvalue(),
198 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
199 '\xea\xb0\x80\xc2\xac')
201 self
.assertEqual(s
.getvalue(),
202 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
203 '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
204 c
.write(u
'\U00012345'[1])
205 self
.assertEqual(s
.getvalue(),
206 '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
207 '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')
212 def test_streamwriter_strwrite(self
):
213 s
= StringIO
.StringIO()
214 wr
= codecs
.getwriter('gb18030')(s
)
216 self
.assertEqual(s
.getvalue(), 'abcd')
218 class Test_ISO2022(unittest
.TestCase
):
220 iso2022jp2
= '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
221 uni
= u
':hu4:unit\xe9 de famille'
222 self
.assertEqual(iso2022jp2
.decode('iso2022-jp-2'), uni
)
224 def test_iso2022_jp_g0(self
):
225 self
.assertFalse('\x0e' in u
'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
226 for encoding
in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
227 e
= u
'\u3406'.encode(encoding
)
228 self
.assertFalse(filter(lambda x
: x
>= '\x80', e
))
230 def test_bug1572832(self
):
231 if sys
.maxunicode
>= 0x10000:
234 myunichr
= lambda x
: unichr(0xD7C0+(x
>>10)) + unichr(0xDC00+(x
&0x3FF))
236 for x
in xrange(0x10000, 0x110000):
237 # Any ISO 2022 codec will cause the segfault
238 myunichr(x
).encode('iso_2022_jp', 'ignore')
def test_main():
    """Run every test case defined in this module."""
    test_support.run_unittest(__name__)

if __name__ == "__main__":
    test_main()