Change to flush and close logic to fix #1760556.
[python.git] / Modules / cjkcodecs / _codecs_iso2022.c
blob55196a9ea4cdf449158a686f3db5cb3c5ea5451a
1 /*
2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
7 #define USING_IMPORTED_MAPS
8 #define USING_BINARY_PAIR_SEARCH
9 #define EXTERN_JISX0213_PAIR
10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
13 #include "cjkcodecs.h"
14 #include "alg_jisx0201.h"
15 #include "emu_jisx0213_2000.h"
16 #include "mappings_jisx0213_pair.h"
/* STATE

   state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

   state->c[4]

        00000000
              ||
              |+----  Locked-Shift?
              +-----  ESC Throughout
*/
/* C0 control bytes that the codec reacts to. */
#define ESC                 0x1B
#define SO                  0x0E
#define SI                  0x0F
#define LF                  0x0A

/* Upper bound on the bytes scanned while looking for the end of one
 * escape sequence. */
#define MAX_ESCSEQLEN       16

/* Bit OR-ed into a charset mark to flag a double-byte set, and the
 * inverse operation that recovers the escape-sequence final byte. */
#define CHARSET_DBCS        0x80
#define ESCMARK(mark)       ((mark) & 0x7f)

/* Single-byte character sets, named by their final byte. */
#define CHARSET_ISO8859_1   'A'
#define CHARSET_ASCII       'B'
#define CHARSET_ISO8859_7   'F'
#define CHARSET_JISX0201_K  'I'
#define CHARSET_JISX0201_R  'J'

/* Double-byte character sets: final byte plus CHARSET_DBCS. */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

/* True for a byte that terminates an escape sequence ('A'-'Z' or '@').
 * Note: evaluates its argument more than once. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
/* True when c2, the byte following ESC, introduces a sequence handled
 * by this codec. */
#define IS_ISO2022ESC(c2)                                       \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' ||           \
         (c2) == '.' || (c2) == '&')
        /* this is not a complete list of ISO-2022 escape sequence headers.
         * but, it's enough to implement CJK instances of iso-2022. */

/* Special return values of the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE  /* for JIS X 0213 */

/* Bits stored in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the G0..G3 slots kept in state->c[0..3].  The setters
 * expand to complete statements (trailing ';' included) and are written
 * at their call sites without a semicolon. */
#define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Flag helpers for state->c[4]; like the setters above, the mutating
 * forms carry their own trailing ';'. */
#define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;

/* View of the opaque `config` pointer as this codec's configuration. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
102 /*-*- internal data structures -*-*/
104 typedef int (*iso2022_init_func)(void);
105 typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
106 typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);
108 struct iso2022_designation {
109 unsigned char mark;
110 unsigned char plane;
111 unsigned char width;
112 iso2022_init_func initializer;
113 iso2022_decode_func decoder;
114 iso2022_encode_func encoder;
/* Per-variant configuration handed to the codec as its `config`. */
struct iso2022_config {
    int flags;      /* bitwise OR of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
122 /*-*- iso-2022 codec implementation -*-*/
124 CODEC_INIT(iso2022)
126 const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
127 for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128 if (desig->initializer != NULL && desig->initializer() != 0)
129 return -1;
130 return 0;
133 ENCODER_INIT(iso2022)
135 STATE_CLEARFLAGS()
136 STATE_SETG0(CHARSET_ASCII)
137 STATE_SETG1(CHARSET_ASCII)
138 return 0;
141 ENCODER_RESET(iso2022)
143 if (STATE_GETFLAG(F_SHIFTED)) {
144 WRITE1(SI)
145 NEXT_OUT(1)
146 STATE_CLEARFLAG(F_SHIFTED)
148 if (STATE_G0 != CHARSET_ASCII) {
149 WRITE3(ESC, '(', 'B')
150 NEXT_OUT(3)
151 STATE_SETG0(CHARSET_ASCII)
153 return 0;
156 ENCODER(iso2022)
158 while (inleft > 0) {
159 const struct iso2022_designation *dsg;
160 DBCHAR encoded;
161 ucs4_t c = **inbuf;
162 Py_ssize_t insize;
164 if (c < 0x80) {
165 if (STATE_G0 != CHARSET_ASCII) {
166 WRITE3(ESC, '(', 'B')
167 STATE_SETG0(CHARSET_ASCII)
168 NEXT_OUT(3)
170 if (STATE_GETFLAG(F_SHIFTED)) {
171 WRITE1(SI)
172 STATE_CLEARFLAG(F_SHIFTED)
173 NEXT_OUT(1)
175 WRITE1((unsigned char)c)
176 NEXT(1, 1)
177 continue;
180 DECODE_SURROGATE(c)
181 insize = GET_INSIZE(c);
183 encoded = MAP_UNMAPPABLE;
184 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
185 Py_ssize_t length = 1;
186 encoded = dsg->encoder(&c, &length);
187 if (encoded == MAP_MULTIPLE_AVAIL) {
188 /* this implementation won't work for pair
189 * of non-bmp characters. */
190 if (inleft < 2) {
191 if (!(flags & MBENC_FLUSH))
192 return MBERR_TOOFEW;
193 length = -1;
195 else
196 length = 2;
197 #if Py_UNICODE_SIZE == 2
198 if (length == 2) {
199 ucs4_t u4in[2];
200 u4in[0] = (ucs4_t)IN1;
201 u4in[1] = (ucs4_t)IN2;
202 encoded = dsg->encoder(u4in, &length);
203 } else
204 encoded = dsg->encoder(&c, &length);
205 #else
206 encoded = dsg->encoder(*inbuf, &length);
207 #endif
208 if (encoded != MAP_UNMAPPABLE) {
209 insize = length;
210 break;
213 else if (encoded != MAP_UNMAPPABLE)
214 break;
217 if (!dsg->mark)
218 return 1;
219 assert(dsg->width == 1 || dsg->width == 2);
221 switch (dsg->plane) {
222 case 0: /* G0 */
223 if (STATE_GETFLAG(F_SHIFTED)) {
224 WRITE1(SI)
225 STATE_CLEARFLAG(F_SHIFTED)
226 NEXT_OUT(1)
228 if (STATE_G0 != dsg->mark) {
229 if (dsg->width == 1) {
230 WRITE3(ESC, '(', ESCMARK(dsg->mark))
231 STATE_SETG0(dsg->mark)
232 NEXT_OUT(3)
234 else if (dsg->mark == CHARSET_JISX0208) {
235 WRITE3(ESC, '$', ESCMARK(dsg->mark))
236 STATE_SETG0(dsg->mark)
237 NEXT_OUT(3)
239 else {
240 WRITE4(ESC, '$', '(',
241 ESCMARK(dsg->mark))
242 STATE_SETG0(dsg->mark)
243 NEXT_OUT(4)
246 break;
247 case 1: /* G1 */
248 if (STATE_G1 != dsg->mark) {
249 if (dsg->width == 1) {
250 WRITE3(ESC, ')', ESCMARK(dsg->mark))
251 STATE_SETG1(dsg->mark)
252 NEXT_OUT(3)
254 else {
255 WRITE4(ESC, '$', ')',
256 ESCMARK(dsg->mark))
257 STATE_SETG1(dsg->mark)
258 NEXT_OUT(4)
261 if (!STATE_GETFLAG(F_SHIFTED)) {
262 WRITE1(SO)
263 STATE_SETFLAG(F_SHIFTED)
264 NEXT_OUT(1)
266 break;
267 default: /* G2 and G3 is not supported: no encoding in
268 * CJKCodecs are using them yet */
269 return MBERR_INTERNAL;
272 if (dsg->width == 1) {
273 WRITE1((unsigned char)encoded)
274 NEXT_OUT(1)
276 else {
277 WRITE2(encoded >> 8, encoded & 0xff)
278 NEXT_OUT(2)
280 NEXT_IN(insize)
283 return 0;
286 DECODER_INIT(iso2022)
288 STATE_CLEARFLAGS()
289 STATE_SETG0(CHARSET_ASCII)
290 STATE_SETG1(CHARSET_ASCII)
291 STATE_SETG2(CHARSET_ASCII)
292 return 0;
295 DECODER_RESET(iso2022)
297 STATE_SETG0(CHARSET_ASCII)
298 STATE_CLEARFLAG(F_SHIFTED)
299 return 0;
302 static Py_ssize_t
303 iso2022processesc(const void *config, MultibyteCodec_State *state,
304 const unsigned char **inbuf, Py_ssize_t *inleft)
306 unsigned char charset, designation;
307 Py_ssize_t i, esclen;
309 for (i = 1;i < MAX_ESCSEQLEN;i++) {
310 if (i >= *inleft)
311 return MBERR_TOOFEW;
312 if (IS_ESCEND((*inbuf)[i])) {
313 esclen = i + 1;
314 break;
316 else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
317 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
318 i += 2;
321 if (i >= MAX_ESCSEQLEN)
322 return 1; /* unterminated escape sequence */
324 switch (esclen) {
325 case 3:
326 if (IN2 == '$') {
327 charset = IN3 | CHARSET_DBCS;
328 designation = 0;
330 else {
331 charset = IN3;
332 if (IN2 == '(') designation = 0;
333 else if (IN2 == ')') designation = 1;
334 else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
335 designation = 2;
336 else return 3;
338 break;
339 case 4:
340 if (IN2 != '$')
341 return 4;
343 charset = IN4 | CHARSET_DBCS;
344 if (IN3 == '(') designation = 0;
345 else if (IN3 == ')') designation = 1;
346 else return 4;
347 break;
348 case 6: /* designation with prefix */
349 if (CONFIG_ISSET(USE_JISX0208_EXT) &&
350 (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
351 (*inbuf)[5] == 'B') {
352 charset = 'B' | CHARSET_DBCS;
353 designation = 0;
355 else
356 return 6;
357 break;
358 default:
359 return esclen;
362 /* raise error when the charset is not designated for this encoding */
363 if (charset != CHARSET_ASCII) {
364 const struct iso2022_designation *dsg;
366 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
367 if (dsg->mark == charset)
368 break;
369 if (!dsg->mark)
370 return esclen;
373 STATE_SETG(designation, charset)
374 *inleft -= esclen;
375 (*inbuf) += esclen;
376 return 0;
/* Decode the ISO-8859-7 byte value `c` into `assi`.
 *
 * Expands to an if / else-if chain WITHOUT a final else: the caller
 * appends its own `else` branch, which runs for bytes that have no
 * mapping.  `c` is evaluated several times.
 *
 *   - below 0xA0:           copied unchanged
 *   - 0xA0..0xBF:           copied unchanged when its bit is set in the
 *                           first mask (bit n stands for 0xA0 + n)
 *   - 0xB4..0xFE:           mapped to 0x02D0 + c when c >= 0xD4 or its
 *                           bit is set in the second mask (bit n stands
 *                           for 0xB4 + n)
 *   - 0xA1, 0xA2, 0xAF:     mapped to U+2018, U+2019, U+2015
 *
 * The masks are tested with unsigned long operands because the shift
 * count reaches 31 (for 0xBF and 0xD3); `1L << 31` is undefined where
 * long is 32 bits wide. */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
390 static Py_ssize_t
391 iso2022processg2(const void *config, MultibyteCodec_State *state,
392 const unsigned char **inbuf, Py_ssize_t *inleft,
393 Py_UNICODE **outbuf, Py_ssize_t *outleft)
395 /* not written to use encoder, decoder functions because only few
396 * encodings use G2 designations in CJKCodecs */
397 if (STATE_G2 == CHARSET_ISO8859_1) {
398 if (IN3 < 0x80)
399 OUT1(IN3 + 0x80)
400 else
401 return 3;
403 else if (STATE_G2 == CHARSET_ISO8859_7) {
404 ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
405 else return 3;
407 else if (STATE_G2 == CHARSET_ASCII) {
408 if (IN3 & 0x80) return 3;
409 else **outbuf = IN3;
411 else
412 return MBERR_INTERNAL;
414 (*inbuf) += 3;
415 *inleft -= 3;
416 (*outbuf) += 1;
417 *outleft -= 1;
418 return 0;
421 DECODER(iso2022)
423 const struct iso2022_designation *dsgcache = NULL;
425 while (inleft > 0) {
426 unsigned char c = IN1;
427 Py_ssize_t err;
429 if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
430 /* ESC throughout mode:
431 * for non-iso2022 escape sequences */
432 WRITE1(c) /* assume as ISO-8859-1 */
433 NEXT(1, 1)
434 if (IS_ESCEND(c)) {
435 STATE_CLEARFLAG(F_ESCTHROUGHOUT)
437 continue;
440 switch (c) {
441 case ESC:
442 REQUIRE_INBUF(2)
443 if (IS_ISO2022ESC(IN2)) {
444 err = iso2022processesc(config, state,
445 inbuf, &inleft);
446 if (err != 0)
447 return err;
449 else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
450 REQUIRE_INBUF(3)
451 err = iso2022processg2(config, state,
452 inbuf, &inleft, outbuf, &outleft);
453 if (err != 0)
454 return err;
456 else {
457 WRITE1(ESC)
458 STATE_SETFLAG(F_ESCTHROUGHOUT)
459 NEXT(1, 1)
461 break;
462 case SI:
463 if (CONFIG_ISSET(NO_SHIFT))
464 goto bypass;
465 STATE_CLEARFLAG(F_SHIFTED)
466 NEXT_IN(1)
467 break;
468 case SO:
469 if (CONFIG_ISSET(NO_SHIFT))
470 goto bypass;
471 STATE_SETFLAG(F_SHIFTED)
472 NEXT_IN(1)
473 break;
474 case LF:
475 STATE_CLEARFLAG(F_SHIFTED)
476 WRITE1(LF)
477 NEXT(1, 1)
478 break;
479 default:
480 if (c < 0x20) /* C0 */
481 goto bypass;
482 else if (c >= 0x80)
483 return 1;
484 else {
485 const struct iso2022_designation *dsg;
486 unsigned char charset;
487 ucs4_t decoded;
489 if (STATE_GETFLAG(F_SHIFTED))
490 charset = STATE_G1;
491 else
492 charset = STATE_G0;
494 if (charset == CHARSET_ASCII) {
495 bypass: WRITE1(c)
496 NEXT(1, 1)
497 break;
500 if (dsgcache != NULL &&
501 dsgcache->mark == charset)
502 dsg = dsgcache;
503 else {
504 for (dsg = CONFIG_DESIGNATIONS;
505 dsg->mark != charset
506 #ifdef Py_DEBUG
507 && dsg->mark != '\0'
508 #endif
509 ;dsg++)
510 /* noop */;
511 assert(dsg->mark != '\0');
512 dsgcache = dsg;
515 REQUIRE_INBUF(dsg->width)
516 decoded = dsg->decoder(*inbuf);
517 if (decoded == MAP_UNMAPPABLE)
518 return dsg->width;
520 if (decoded < 0x10000) {
521 WRITE1(decoded)
522 NEXT_OUT(1)
524 else if (decoded < 0x30000) {
525 WRITEUCS4(decoded)
527 else { /* JIS X 0213 pairs */
528 WRITE2(decoded >> 16, decoded & 0xffff)
529 NEXT_OUT(2)
531 NEXT_IN(dsg->width)
533 break;
536 return 0;
539 /*-*- mapping table holders -*-*/
541 #define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
542 #define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
544 /* kr */
545 ENCMAP(cp949)
546 DECMAP(ksx1001)
548 /* jp */
549 ENCMAP(jisxcommon)
550 DECMAP(jisx0208)
551 DECMAP(jisx0212)
552 ENCMAP(jisx0213_bmp)
553 DECMAP(jisx0213_1_bmp)
554 DECMAP(jisx0213_2_bmp)
555 ENCMAP(jisx0213_emp)
556 DECMAP(jisx0213_1_emp)
557 DECMAP(jisx0213_2_emp)
559 /* cn */
560 ENCMAP(gbcommon)
561 DECMAP(gb2312)
563 /* tw */
565 /*-*- mapping access functions -*-*/
567 static int
568 ksx1001_init(void)
570 static int initialized = 0;
572 if (!initialized && (
573 IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
574 IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
575 return -1;
576 initialized = 1;
577 return 0;
580 static ucs4_t
581 ksx1001_decoder(const unsigned char *data)
583 ucs4_t u;
584 TRYMAP_DEC(ksx1001, u, data[0], data[1])
585 return u;
586 else
587 return MAP_UNMAPPABLE;
590 static DBCHAR
591 ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
593 DBCHAR coded;
594 assert(*length == 1);
595 if (*data < 0x10000) {
596 TRYMAP_ENC(cp949, coded, *data)
597 if (!(coded & 0x8000))
598 return coded;
600 return MAP_UNMAPPABLE;
603 static int
604 jisx0208_init(void)
606 static int initialized = 0;
608 if (!initialized && (
609 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
610 IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
611 return -1;
612 initialized = 1;
613 return 0;
616 static ucs4_t
617 jisx0208_decoder(const unsigned char *data)
619 ucs4_t u;
620 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
621 return 0xff3c;
622 else TRYMAP_DEC(jisx0208, u, data[0], data[1])
623 return u;
624 else
625 return MAP_UNMAPPABLE;
628 static DBCHAR
629 jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
631 DBCHAR coded;
632 assert(*length == 1);
633 if (*data < 0x10000) {
634 if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
635 return 0x2140;
636 else TRYMAP_ENC(jisxcommon, coded, *data) {
637 if (!(coded & 0x8000))
638 return coded;
641 return MAP_UNMAPPABLE;
644 static int
645 jisx0212_init(void)
647 static int initialized = 0;
649 if (!initialized && (
650 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
651 IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
652 return -1;
653 initialized = 1;
654 return 0;
657 static ucs4_t
658 jisx0212_decoder(const unsigned char *data)
660 ucs4_t u;
661 TRYMAP_DEC(jisx0212, u, data[0], data[1])
662 return u;
663 else
664 return MAP_UNMAPPABLE;
667 static DBCHAR
668 jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
670 DBCHAR coded;
671 assert(*length == 1);
672 if (*data < 0x10000) {
673 TRYMAP_ENC(jisxcommon, coded, *data) {
674 if (coded & 0x8000)
675 return coded & 0x7fff;
678 return MAP_UNMAPPABLE;
681 static int
682 jisx0213_init(void)
684 static int initialized = 0;
686 if (!initialized && (
687 jisx0208_init() ||
688 IMPORT_MAP(jp, jisx0213_bmp,
689 &jisx0213_bmp_encmap, NULL) ||
690 IMPORT_MAP(jp, jisx0213_1_bmp,
691 NULL, &jisx0213_1_bmp_decmap) ||
692 IMPORT_MAP(jp, jisx0213_2_bmp,
693 NULL, &jisx0213_2_bmp_decmap) ||
694 IMPORT_MAP(jp, jisx0213_emp,
695 &jisx0213_emp_encmap, NULL) ||
696 IMPORT_MAP(jp, jisx0213_1_emp,
697 NULL, &jisx0213_1_emp_decmap) ||
698 IMPORT_MAP(jp, jisx0213_2_emp,
699 NULL, &jisx0213_2_emp_decmap) ||
700 IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
701 &jisx0213_pair_decmap)))
702 return -1;
703 initialized = 1;
704 return 0;
707 #define config ((void *)2000)
708 static ucs4_t
709 jisx0213_2000_1_decoder(const unsigned char *data)
711 ucs4_t u;
712 EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
713 else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
714 return 0xff3c;
715 else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
716 else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
717 else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
718 u |= 0x20000;
719 else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
720 else
721 return MAP_UNMAPPABLE;
722 return u;
725 static ucs4_t
726 jisx0213_2000_2_decoder(const unsigned char *data)
728 ucs4_t u;
729 EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
730 TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
731 else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
732 u |= 0x20000;
733 else
734 return MAP_UNMAPPABLE;
735 return u;
737 #undef config
739 static ucs4_t
740 jisx0213_2004_1_decoder(const unsigned char *data)
742 ucs4_t u;
743 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
744 return 0xff3c;
745 else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
746 else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
747 else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
748 u |= 0x20000;
749 else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
750 else
751 return MAP_UNMAPPABLE;
752 return u;
755 static ucs4_t
756 jisx0213_2004_2_decoder(const unsigned char *data)
758 ucs4_t u;
759 TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
760 else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
761 u |= 0x20000;
762 else
763 return MAP_UNMAPPABLE;
764 return u;
767 static DBCHAR
768 jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
770 DBCHAR coded;
772 switch (*length) {
773 case 1: /* first character */
774 if (*data >= 0x10000) {
775 if ((*data) >> 16 == 0x20000 >> 16) {
776 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
777 else TRYMAP_ENC(jisx0213_emp, coded,
778 (*data) & 0xffff)
779 return coded;
781 return MAP_UNMAPPABLE;
784 EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
785 else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
786 if (coded == MULTIC)
787 return MAP_MULTIPLE_AVAIL;
789 else TRYMAP_ENC(jisxcommon, coded, *data) {
790 if (coded & 0x8000)
791 return MAP_UNMAPPABLE;
793 else
794 return MAP_UNMAPPABLE;
795 return coded;
796 case 2: /* second character of unicode pair */
797 coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
798 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
799 if (coded == DBCINV) {
800 *length = 1;
801 coded = find_pairencmap((ucs2_t)data[0], 0,
802 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
803 if (coded == DBCINV)
804 return MAP_UNMAPPABLE;
806 else
807 return coded;
808 case -1: /* flush unterminated */
809 *length = 1;
810 coded = find_pairencmap((ucs2_t)data[0], 0,
811 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
812 if (coded == DBCINV)
813 return MAP_UNMAPPABLE;
814 else
815 return coded;
816 default:
817 return MAP_UNMAPPABLE;
821 static DBCHAR
822 jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
824 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
825 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
826 return coded;
827 else if (coded & 0x8000)
828 return MAP_UNMAPPABLE;
829 else
830 return coded;
833 static DBCHAR
834 jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
836 DBCHAR coded;
837 Py_ssize_t ilength = *length;
839 coded = jisx0213_encoder(data, length, (void *)2000);
840 switch (ilength) {
841 case 1:
842 if (coded == MAP_MULTIPLE_AVAIL)
843 return MAP_MULTIPLE_AVAIL;
844 else
845 return MAP_UNMAPPABLE;
846 case 2:
847 if (*length != 2)
848 return MAP_UNMAPPABLE;
849 else
850 return coded;
851 default:
852 return MAP_UNMAPPABLE;
856 static DBCHAR
857 jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
859 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
860 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
861 return coded;
862 else if (coded & 0x8000)
863 return coded & 0x7fff;
864 else
865 return MAP_UNMAPPABLE;
868 static DBCHAR
869 jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
871 DBCHAR coded = jisx0213_encoder(data, length, NULL);
872 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
873 return coded;
874 else if (coded & 0x8000)
875 return MAP_UNMAPPABLE;
876 else
877 return coded;
880 static DBCHAR
881 jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
883 DBCHAR coded;
884 Py_ssize_t ilength = *length;
886 coded = jisx0213_encoder(data, length, NULL);
887 switch (ilength) {
888 case 1:
889 if (coded == MAP_MULTIPLE_AVAIL)
890 return MAP_MULTIPLE_AVAIL;
891 else
892 return MAP_UNMAPPABLE;
893 case 2:
894 if (*length != 2)
895 return MAP_UNMAPPABLE;
896 else
897 return coded;
898 default:
899 return MAP_UNMAPPABLE;
903 static DBCHAR
904 jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
906 DBCHAR coded = jisx0213_encoder(data, length, NULL);
907 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
908 return coded;
909 else if (coded & 0x8000)
910 return coded & 0x7fff;
911 else
912 return MAP_UNMAPPABLE;
915 static ucs4_t
916 jisx0201_r_decoder(const unsigned char *data)
918 ucs4_t u;
919 JISX0201_R_DECODE(*data, u)
920 else return MAP_UNMAPPABLE;
921 return u;
924 static DBCHAR
925 jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
927 DBCHAR coded;
928 JISX0201_R_ENCODE(*data, coded)
929 else return MAP_UNMAPPABLE;
930 return coded;
933 static ucs4_t
934 jisx0201_k_decoder(const unsigned char *data)
936 ucs4_t u;
937 JISX0201_K_DECODE(*data ^ 0x80, u)
938 else return MAP_UNMAPPABLE;
939 return u;
942 static DBCHAR
943 jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
945 DBCHAR coded;
946 JISX0201_K_ENCODE(*data, coded)
947 else return MAP_UNMAPPABLE;
948 return coded - 0x80;
951 static int
952 gb2312_init(void)
954 static int initialized = 0;
956 if (!initialized && (
957 IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
958 IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
959 return -1;
960 initialized = 1;
961 return 0;
964 static ucs4_t
965 gb2312_decoder(const unsigned char *data)
967 ucs4_t u;
968 TRYMAP_DEC(gb2312, u, data[0], data[1])
969 return u;
970 else
971 return MAP_UNMAPPABLE;
974 static DBCHAR
975 gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
977 DBCHAR coded;
978 assert(*length == 1);
979 if (*data < 0x10000) {
980 TRYMAP_ENC(gbcommon, coded, *data) {
981 if (!(coded & 0x8000))
982 return coded;
985 return MAP_UNMAPPABLE;
989 static ucs4_t
990 dummy_decoder(const unsigned char *data)
992 return MAP_UNMAPPABLE;
995 static DBCHAR
996 dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
998 return MAP_UNMAPPABLE;
1001 /*-*- registry tables -*-*/
/* Initializers for struct iso2022_designation, in field order:
 *   { mark, plane, width, initializer, decoder, encoder }            */

#define REGISTRY_KSX1001_G0                                             \
    { CHARSET_KSX1001, 0, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                             \
    { CHARSET_KSX1001, 1, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R                                             \
    { CHARSET_JISX0201_R, 0, 1,                                         \
      NULL, jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                             \
    { CHARSET_JISX0201_K, 0, 1,                                         \
      NULL, jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208                                               \
    { CHARSET_JISX0208, 0, 2,                                           \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                             \
    { CHARSET_JISX0208_O, 0, 2,                                         \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212                                               \
    { CHARSET_JISX0212, 0, 2,                                           \
      jisx0212_init, jisx0212_decoder, jisx0212_encoder }
#define REGISTRY_JISX0213_2000_1                                        \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder,                                          \
      jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                               \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder,                                          \
      jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2000_2_decoder,                                          \
      jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1                                        \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder,                                          \
      jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                               \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder,                                          \
      jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2004_2_decoder,                                          \
      jisx0213_2004_2_encoder }
#define REGISTRY_GB2312                                                 \
    { CHARSET_GB2312, 0, 2,                                             \
      gb2312_init, gb2312_decoder, gb2312_encoder }
/* NOTE(review): the cns11643_* helpers named by the next two entries
 * are not defined in the visible part of this file and no designation
 * table here uses these macros. */
#define REGISTRY_CNS11643_1                                             \
    { CHARSET_CNS11643_1, 1, 2,                                         \
      cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                             \
    { CHARSET_CNS11643_2, 2, 2,                                         \
      cns11643_init, cns11643_2_decoder, cns11643_2_encoder }
#define REGISTRY_ISO8859_1                                              \
    { CHARSET_ISO8859_1, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                              \
    { CHARSET_ISO8859_7, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }
/* All-zero entry; its zero `mark` terminates a designation table. */
#define REGISTRY_SENTINEL       { 0, }

/* Defines iso2022_<var>_config from the given flags and the table named
 * iso2022_<var>_designations, which must already be declared. */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {       \
        attrs, iso2022_##var##_designations                             \
    };
1067 static const struct iso2022_designation iso2022_kr_designations[] = {
1068 REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1070 CONFIGDEF(kr, 0)
1072 static const struct iso2022_designation iso2022_jp_designations[] = {
1073 REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1074 REGISTRY_SENTINEL
1076 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1078 static const struct iso2022_designation iso2022_jp_1_designations[] = {
1079 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1080 REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1082 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1084 static const struct iso2022_designation iso2022_jp_2_designations[] = {
1085 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1086 REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1087 REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1089 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1091 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1092 REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1093 REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1095 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1097 static const struct iso2022_designation iso2022_jp_3_designations[] = {
1098 REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1099 REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1101 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1103 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1104 REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1105 REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1107 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1110 BEGIN_MAPPINGS_LIST
1111 /* no mapping table here */
1112 END_MAPPINGS_LIST
1114 #define ISO2022_CODEC(variation) { \
1115 "iso2022_" #variation, \
1116 &iso2022_##variation##_config, \
1117 iso2022_codec_init, \
1118 _STATEFUL_METHODS(iso2022) \
1121 BEGIN_CODECS_LIST
1122 ISO2022_CODEC(kr)
1123 ISO2022_CODEC(jp)
1124 ISO2022_CODEC(jp_1)
1125 ISO2022_CODEC(jp_2)
1126 ISO2022_CODEC(jp_2004)
1127 ISO2022_CODEC(jp_3)
1128 ISO2022_CODEC(jp_ext)
1129 END_CODECS_LIST
1131 I_AM_A_MODULE_FOR(iso2022)