Fix the transient refleaks in test_zipimport_support.
[python.git] / Modules / cjkcodecs / _codecs_cn.c
blob4542ce62f06a4db77eb838549f76141b750178cc
1 /*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
7 #include "cjkcodecs.h"
8 #include "mappings_cn.h"
/*
 * AIX predefines `hz` as 100.  Undefine it here so that it cannot clash
 * with the identifiers of the hz codec defined later in this file.
 */
#if defined(_AIX)
#  undef hz
#endif
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
/*
 * GBK_DECODE(dc1, dc2, assi)
 *
 * Opens an if / else-if chain that decodes the two-byte GBK sequence
 * (dc1, dc2) and stores the resulting code point in `assi`.
 *
 * The three code points that GBK assigns differently from GB2312 are
 * handled first.  Everything else is looked up in the gb2312 table (with
 * the high bit of both bytes flipped off via ^ 0x80) and then in the
 * gbkext table, which is indexed with the raw bytes.
 *
 * The expansion deliberately ends in an `if` with an empty body so that
 * the caller can append its own `else` clause for "no mapping found".
 *
 * dc1 and dc2 are expanded several times, so they must be free of side
 * effects.  They are parenthesized everywhere so that expression
 * arguments (e.g. `a | b`) keep their intended value next to `^`.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
/*
 * GBK_ENCODE(code, assi)
 *
 * Opens an if / else-if chain that encodes the code point `code` and
 * stores the resulting double-byte value in `assi`.
 *
 * The three code points that GBK assigns differently from GB2312 are
 * special-cased.  U+30FB is excluded from the table lookup: per the table
 * of differences it belongs to GB2312's A1A4, which GBK gives to U+00B7.
 *
 * The expansion ends in an `if` with an empty body so that the caller can
 * append its own `else` clause for "no mapping found".  `code` is expanded
 * several times and must be free of side effects.
 */
#define GBK_ENCODE(code, assi)                                          \
    if ((code) == 0x2014)                                               \
        (assi) = 0xa1aa;                                                \
    else if ((code) == 0x2015)                                          \
        (assi) = 0xa844;                                                \
    else if ((code) == 0x00b7)                                          \
        (assi) = 0xa1a4;                                                \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code)) \
        ;
/*
 * GB2312 codec
 */
43 ENCODER(gb2312)
45 while (inleft > 0) {
46 Py_UNICODE c = IN1;
47 DBCHAR code;
49 if (c < 0x80) {
50 WRITE1((unsigned char)c)
51 NEXT(1, 1)
52 continue;
54 UCS4INVALID(c)
56 REQUIRE_OUTBUF(2)
57 TRYMAP_ENC(gbcommon, code, c);
58 else return 1;
60 if (code & 0x8000) /* MSB set: GBK */
61 return 1;
63 OUT1((code >> 8) | 0x80)
64 OUT2((code & 0xFF) | 0x80)
65 NEXT(1, 2)
68 return 0;
71 DECODER(gb2312)
73 while (inleft > 0) {
74 unsigned char c = **inbuf;
76 REQUIRE_OUTBUF(1)
78 if (c < 0x80) {
79 OUT1(c)
80 NEXT(1, 1)
81 continue;
84 REQUIRE_INBUF(2)
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86 NEXT(2, 1)
88 else return 2;
91 return 0;
/*
 * GBK codec
 */
99 ENCODER(gbk)
101 while (inleft > 0) {
102 Py_UNICODE c = IN1;
103 DBCHAR code;
105 if (c < 0x80) {
106 WRITE1((unsigned char)c)
107 NEXT(1, 1)
108 continue;
110 UCS4INVALID(c)
112 REQUIRE_OUTBUF(2)
114 GBK_ENCODE(c, code)
115 else return 1;
117 OUT1((code >> 8) | 0x80)
118 if (code & 0x8000)
119 OUT2((code & 0xFF)) /* MSB set: GBK */
120 else
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122 NEXT(1, 2)
125 return 0;
128 DECODER(gbk)
130 while (inleft > 0) {
131 unsigned char c = IN1;
133 REQUIRE_OUTBUF(1)
135 if (c < 0x80) {
136 OUT1(c)
137 NEXT(1, 1)
138 continue;
141 REQUIRE_INBUF(2)
143 GBK_DECODE(c, IN2, **outbuf)
144 else return 2;
146 NEXT(2, 1)
149 return 0;
/*
 * GB18030 codec
 */
157 ENCODER(gb18030)
159 while (inleft > 0) {
160 ucs4_t c = IN1;
161 DBCHAR code;
163 if (c < 0x80) {
164 WRITE1(c)
165 NEXT(1, 1)
166 continue;
169 DECODE_SURROGATE(c)
170 if (c > 0x10FFFF)
171 #if Py_UNICODE_SIZE == 2
172 return 2; /* surrogates pair */
173 #else
174 return 1;
175 #endif
176 else if (c >= 0x10000) {
177 ucs4_t tc = c - 0x10000;
179 REQUIRE_OUTBUF(4)
181 OUT4((unsigned char)(tc % 10) + 0x30)
182 tc /= 10;
183 OUT3((unsigned char)(tc % 126) + 0x81)
184 tc /= 126;
185 OUT2((unsigned char)(tc % 10) + 0x30)
186 tc /= 10;
187 OUT1((unsigned char)(tc + 0x90))
189 #if Py_UNICODE_SIZE == 2
190 NEXT(2, 4) /* surrogates pair */
191 #else
192 NEXT(1, 4)
193 #endif
194 continue;
197 REQUIRE_OUTBUF(2)
199 GBK_ENCODE(c, code)
200 else TRYMAP_ENC(gb18030ext, code, c);
201 else {
202 const struct _gb18030_to_unibmp_ranges *utrrange;
204 REQUIRE_OUTBUF(4)
206 for (utrrange = gb18030_to_unibmp_ranges;
207 utrrange->first != 0;
208 utrrange++)
209 if (utrrange->first <= c &&
210 c <= utrrange->last) {
211 Py_UNICODE tc;
213 tc = c - utrrange->first +
214 utrrange->base;
216 OUT4((unsigned char)(tc % 10) + 0x30)
217 tc /= 10;
218 OUT3((unsigned char)(tc % 126) + 0x81)
219 tc /= 126;
220 OUT2((unsigned char)(tc % 10) + 0x30)
221 tc /= 10;
222 OUT1((unsigned char)tc + 0x81)
224 NEXT(1, 4)
225 break;
228 if (utrrange->first == 0)
229 return 1;
230 continue;
233 OUT1((code >> 8) | 0x80)
234 if (code & 0x8000)
235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
236 else
237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
239 NEXT(1, 2)
242 return 0;
245 DECODER(gb18030)
247 while (inleft > 0) {
248 unsigned char c = IN1, c2;
250 REQUIRE_OUTBUF(1)
252 if (c < 0x80) {
253 OUT1(c)
254 NEXT(1, 1)
255 continue;
258 REQUIRE_INBUF(2)
260 c2 = IN2;
261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262 const struct _gb18030_to_unibmp_ranges *utr;
263 unsigned char c3, c4;
264 ucs4_t lseq;
266 REQUIRE_INBUF(4)
267 c3 = IN3;
268 c4 = IN4;
269 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
270 return 4;
271 c -= 0x81; c2 -= 0x30;
272 c3 -= 0x81; c4 -= 0x30;
274 if (c < 4) { /* U+0080 - U+FFFF */
275 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276 (ucs4_t)c3 * 10 + c4;
277 if (lseq < 39420) {
278 for (utr = gb18030_to_unibmp_ranges;
279 lseq >= (utr + 1)->base;
280 utr++) ;
281 OUT1(utr->first - utr->base + lseq)
282 NEXT(4, 1)
283 continue;
286 else if (c >= 15) { /* U+10000 - U+10FFFF */
287 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288 * 1260 + (ucs4_t)c3 * 10 + c4;
289 if (lseq <= 0x10FFFF) {
290 WRITEUCS4(lseq);
291 NEXT_IN(4)
292 continue;
295 return 4;
298 GBK_DECODE(c, c2, **outbuf)
299 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
300 else return 2;
302 NEXT(2, 1)
305 return 0;
/*
 * HZ codec
 */
313 ENCODER_INIT(hz)
315 state->i = 0;
316 return 0;
319 ENCODER_RESET(hz)
321 if (state->i != 0) {
322 WRITE2('~', '}')
323 state->i = 0;
324 NEXT_OUT(2)
326 return 0;
329 ENCODER(hz)
331 while (inleft > 0) {
332 Py_UNICODE c = IN1;
333 DBCHAR code;
335 if (c < 0x80) {
336 if (state->i == 0) {
337 WRITE1((unsigned char)c)
338 NEXT(1, 1)
340 else {
341 WRITE3('~', '}', (unsigned char)c)
342 NEXT(1, 3)
343 state->i = 0;
345 continue;
348 UCS4INVALID(c)
350 TRYMAP_ENC(gbcommon, code, c);
351 else return 1;
353 if (code & 0x8000) /* MSB set: GBK */
354 return 1;
356 if (state->i == 0) {
357 WRITE4('~', '{', code >> 8, code & 0xff)
358 NEXT(1, 4)
359 state->i = 1;
361 else {
362 WRITE2(code >> 8, code & 0xff)
363 NEXT(1, 2)
367 return 0;
370 DECODER_INIT(hz)
372 state->i = 0;
373 return 0;
376 DECODER_RESET(hz)
378 state->i = 0;
379 return 0;
382 DECODER(hz)
384 while (inleft > 0) {
385 unsigned char c = IN1;
387 if (c == '~') {
388 unsigned char c2 = IN2;
390 REQUIRE_INBUF(2)
391 if (c2 == '~') {
392 WRITE1('~')
393 NEXT(2, 1)
394 continue;
396 else if (c2 == '{' && state->i == 0)
397 state->i = 1; /* set GB */
398 else if (c2 == '}' && state->i == 1)
399 state->i = 0; /* set ASCII */
400 else if (c2 == '\n')
401 ; /* line-continuation */
402 else
403 return 2;
404 NEXT(2, 0);
405 continue;
408 if (c & 0x80)
409 return 1;
411 if (state->i == 0) { /* ASCII mode */
412 WRITE1(c)
413 NEXT(1, 1)
415 else { /* GB mode */
416 REQUIRE_INBUF(2)
417 REQUIRE_OUTBUF(1)
418 TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
419 NEXT(2, 1)
421 else
422 return 2;
426 return 0;
430 BEGIN_MAPPINGS_LIST
431 MAPPING_DECONLY(gb2312)
432 MAPPING_DECONLY(gbkext)
433 MAPPING_ENCONLY(gbcommon)
434 MAPPING_ENCDEC(gb18030ext)
435 END_MAPPINGS_LIST
437 BEGIN_CODECS_LIST
438 CODEC_STATELESS(gb2312)
439 CODEC_STATELESS(gbk)
440 CODEC_STATELESS(gb18030)
441 CODEC_STATEFUL(hz)
442 END_CODECS_LIST
444 I_AM_A_MODULE_FOR(cn)