Update.
[glibc.git] / iconvdata / iso-2022-cn.c
blob038c4485b9d9ab9f2171a9252477e8d63d7f5650
1 /* Conversion module for ISO-2022-CN.
2 Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <dlfcn.h>
22 #include <gconv.h>
23 #include <stdint.h>
24 #include <string.h>
25 #include "gb2312.h"
26 #include "cns11643l1.h"
27 #include "cns11643l2.h"
29 #include <assert.h>
/* ESC (0x1b) starts the four-byte announcement sequences and is also the
   first byte of the two-byte SS2 sequence.  */
#define ESC 0x1b

/* Single-byte shift-in (SI, back to ASCII) and shift-out (SO, to the SO
   set) controls, and the two bytes of the single shift sequence SS2,
   which applies to the next two bytes only.  */
#define SI 0x0f
#define SO 0x0e
#define SS2_0 ESC
#define SS2_1 0x4e
/* Definitions used in the body of the `gconv' function.  The FROM side is
   the ISO-2022-CN byte stream, the TO side consists of 4-byte units.  */
#define CHARSET_NAME "ISO-2022-CN//"
#define DEFINE_INIT 1
#define DEFINE_FINI 1
#define FROM_LOOP from_iso2022cn_loop
#define TO_LOOP to_iso2022cn_loop
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define MAX_NEEDED_TO 4
/* SAVE_SET is the scratch slot used by SAVE_RESET_STATE; SETP points at
   the COUNT element of the conversion state and is handed to both loop
   functions through EXTRA_LOOP_ARGS.  */
#define PREPARE_LOOP \
  int save_set; \
  int *setp = &data->__statep->__count;
#define EXTRA_LOOP_ARGS , setp
/* The COUNT element of the state keeps track of the currently selected
   character set (the bits under CURRENT_SEL_MASK) and of the character
   sets announced so far (the bits under CURRENT_ANN_MASK).  The possible
   values are:  */
enum
{
  ASCII_set = 0,
  GB2312_set = 8,
  CNS11643_1_set = 16,
  CNS11643_2_set = 24,
  CURRENT_SEL_MASK = 24,
  GB2312_ann = 32,
  CNS11643_1_ann = 64,
  CNS11643_2_ann = 128,
  CURRENT_ANN_MASK = 224
};
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   Nothing happens if the state is already ASCII_set.  Otherwise, when
   reading ISO-2022-CN the state is simply reset; when writing it, a single
   SI byte is stored through OUTBUF and the state reset, or STATUS is set
   to __GCONV_FULL_OUTPUT (state untouched) if OUTBUF has reached OUTEND.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != ASCII_set) \
    { \
      if (FROM_DIRECTION) \
        /* It's easy, we don't have to emit anything, we just reset the \
           state for the input.  */ \
        data->__statep->__count = ASCII_set; \
      else \
        { \
          /* We are not in the initial state.  To switch back we have \
             to emit `SI'.  */ \
          if (__builtin_expect (outbuf == outend, 0)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              *outbuf++ = SI; \
              data->__statep->__count = ASCII_set; \
            } \
        } \
    }
/* Since we might have to reset the input pointer we must be able to save
   and restore the state.  A nonzero SAVE copies the state behind SETP into
   SAVE_SET; a zero SAVE writes SAVE_SET back through SETP.  */
#define SAVE_RESET_STATE(Save) \
  if (Save) \
    save_set = *setp; \
  else \
    *setp = save_set
/* First define the conversion function from ISO-2022-CN to UCS4.  Input
   sizes are those of the FROM side, output sizes those of the TO side.  */
#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Loop body converting one unit of ISO-2022-CN input to a 4-byte output
   unit.  It reads bytes at INPTR (bounded by INEND) and stores the result
   with put32 at OUTPTR.  SET holds the currently selected character set
   and ANN the last accepted SO announcement.  On a problem RESULT is set
   and the loop is left with `break'; shift and announcement sequences as
   well as skipped illegal input restart the loop with `continue'.  */
#define BODY \
  { \
    uint32_t ch = *inptr; \
 \
    /* This is a 7bit character set; 0x7f and everything above it is \
       rejected.  */ \
    if (__builtin_expect (ch >= 0x7f, 0)) \
      { \
        if (! ignore_errors_p ()) \
          { \
            result = __GCONV_ILLEGAL_INPUT; \
            break; \
          } \
 \
        /* Errors are ignored: skip the byte and count it.  */ \
        ++inptr; \
        ++*irreversible; \
        continue; \
      } \
 \
    /* Recognize escape sequences.  */ \
    if (__builtin_expect (ch == ESC, 0)) \
      { \
        /* There are two kinds of escape sequences we have to handle: \
           - those announcing the use of GB and CNS characters on the \
             line; only the SO announcements are remembered \
           - the initial byte of the SS2 sequence. \
           First make sure the whole sequence is available.  */ \
        if (__builtin_expect (inptr + 2 > inend, 0) \
            || (inptr[1] == '$' \
                && (__builtin_expect (inptr + 3 > inend, 0) \
                    || (inptr[2] == ')' \
                        && __builtin_expect (inptr + 4 > inend, 0)) \
                    || (inptr[2] == '*' \
                        && __builtin_expect (inptr + 4 > inend, 0)))) \
            || (inptr[1] == SS2_1 \
                && __builtin_expect (inptr + 4 > inend, 0))) \
          { \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        if (inptr[1] == '$' \
            && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
                || (inptr[2] == '*' && inptr[3] == 'H'))) \
          { \
            /* OK, we accept those character sets.  The `*H' announcement \
               is consumed without being recorded.  */ \
            if (inptr[3] == 'A') \
              ann = GB2312_ann; \
            else if (inptr[3] == 'G') \
              ann = CNS11643_1_ann; \
            inptr += 4; \
            continue; \
          } \
      } \
    else if (__builtin_expect (ch == SO, 0)) \
      { \
        /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
           SO designation came last.  The only problem is what to do with \
           faulty input files where no designator came. \
           XXX For now I'll default to use GB2312.  If this is not the \
           best behaviour (e.g., we should flag an error) let me know.  */ \
        ++inptr; \
        set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
        continue; \
      } \
    else if (__builtin_expect (ch == SI, 0)) \
      { \
        /* Switch to use ASCII.  */ \
        ++inptr; \
        set = ASCII_set; \
        continue; \
      } \
 \
    if (__builtin_expect (ch == ESC, 0) && inptr[1] == SS2_1) \
      { \
        /* This is a character from CNS 11643 plane 2. \
           XXX We could test here whether the use of this character \
           set was announced.  */ \
        inptr += 2; \
        ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
        if (__builtin_expect (ch == __UNKNOWN_10646_CHAR, 0)) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  Point back at the \
                   start of the SS2 sequence.  */ \
                inptr -= 2; \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
 \
            ++*irreversible; \
            continue; \
          } \
      } \
    else if (set == ASCII_set) \
      { \
        /* Almost done, just advance the input pointer.  */ \
        ++inptr; \
      } \
    else \
      { \
        /* That's pretty easy, we have dedicated functions for this.  */ \
        if (set == GB2312_set) \
          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
        else \
          { \
            assert (set == CNS11643_1_set); \
            ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
          } \
 \
        if (__builtin_expect (ch == 0, 0)) \
          { \
            /* The helper reported that more input is needed.  */ \
            result = __GCONV_INCOMPLETE_INPUT; \
            break; \
          } \
        else if (__builtin_expect (ch == __UNKNOWN_10646_CHAR, 0)) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  */ \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
 \
            ++inptr; \
            ++*irreversible; \
            continue; \
          } \
      } \
 \
    put32 (outptr, ch); \
    outptr += 4; \
  }
/* The loop function receives SETP as an extra argument.  On entry the
   state is split into the selected set (SET) and the announcement bits
   (ANN); UPDATE_PARAMS stores the recombined value back through SETP.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS , int *setp
#define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
                    int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS *setp = set | ann
251 #include <iconv/loop.c>
/* Next, define the other direction.  Input sizes are now those of the TO
   side and output sizes those of the FROM side.  */
#define MIN_NEEDED_INPUT MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
#define LOOPFCT TO_LOOP
/* Loop body converting one 4-byte input unit (read with get32 at INPTR)
   to ISO-2022-CN bytes stored at OUTPTR (bounded by OUTEND).  SET is the
   set currently selected with SI/SO: ASCII_set, GB2312_set or
   CNS11643_1_set.  ANN holds the announcements already written on the
   current line.  When the output buffer is too small RESULT is set to
   __GCONV_FULL_OUTPUT and the loop is left with `break' without consuming
   the input; everything written up to that point is reflected in SET and
   ANN, so a retry never repeats a sequence.

   CNS 11643 plane 2 is reached through the single shift SS2, which only
   covers the next two bytes.  Therefore SS2 is written in front of every
   plane 2 character and SET is never changed to CNS11643_2_set.  */
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
 \
    /* First see whether we can write the character using the currently \
       selected character set.  */ \
    if (ch < 0x80) \
      { \
        /* Plain ASCII.  Leave the SO set first if it is active.  */ \
        if (set != ASCII_set) \
          { \
            *outptr++ = SI; \
            set = ASCII_set; \
            if (__builtin_expect (outptr == outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
 \
        *outptr++ = ch; \
 \
        /* At the end of the line we have to clear the `ann' flags since \
           every line must contain this information again.  */ \
        if (ch == L'\n') \
          ann = 0; \
      } \
    else \
      { \
        char buf[2]; \
        int used; \
        size_t written = 0; \
 \
        /* Try the SO set which needs no new announcement first.  */ \
        if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
          { \
            written = ucs4_to_gb2312 (ch, buf, 2); \
            used = GB2312_set; \
          } \
        else \
          { \
            written = ucs4_to_cns11643l1 (ch, buf, 2); \
            used = CNS11643_1_set; \
          } \
 \
        if (written == __UNKNOWN_10646_CHAR) \
          { \
            /* Cannot convert it using the currently selected SO set. \
               Next try the SS2 set.  */ \
            written = ucs4_to_cns11643l2 (ch, buf, 2); \
            if (written != __UNKNOWN_10646_CHAR) \
              /* Yep, that worked.  */ \
              used = CNS11643_2_set; \
            else \
              { \
                /* Well, see whether we have to change the SO set.  */ \
                if (used == GB2312_set) \
                  written = ucs4_to_cns11643l1 (ch, buf, 2); \
                else \
                  written = ucs4_to_gb2312 (ch, buf, 2); \
 \
                if (__builtin_expect (written != __UNKNOWN_10646_CHAR, 1)) \
                  /* Oh well, then switch SO.  */ \
                  used = GB2312_set + CNS11643_1_set - used; \
                else \
                  { \
                    UNICODE_TAG_HANDLER (ch, 4); \
 \
                    /* Even this does not work.  Error.  */ \
                    STANDARD_ERR_HANDLER (4); \
                  } \
              } \
          } \
        assert (written == 2); \
 \
        /* First see whether we announced that we use this character \
           set.  The announcement bit for USED is 16 << (USED >> 3).  */ \
        if ((ann & (16 << (used >> 3))) == 0) \
          { \
            const char *escseq; \
 \
            if (__builtin_expect (outptr + 4 > outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
 \
            assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
            escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \
            *outptr++ = ESC; \
            *outptr++ = '$'; \
            *outptr++ = *escseq++; \
            *outptr++ = *escseq++; \
 \
            /* The two SO sets replace each other, the SS2 set is \
               tracked independently.  */ \
            if (used == GB2312_set) \
              ann = (ann & CNS11643_2_ann) | GB2312_ann; \
            else if (used == CNS11643_1_set) \
              ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
            else \
              ann |= CNS11643_2_ann; \
          } \
 \
        if (used == CNS11643_2_set) \
          { \
            /* SS2 and the character it applies to are written together \
               so that a full buffer can never separate them and a retry \
               can never emit SS2 twice.  SET stays as it is.  */ \
            if (__builtin_expect (outptr + 4 > outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            *outptr++ = SS2_0; \
            *outptr++ = SS2_1; \
          } \
        else \
          { \
            /* We only have to emit something if currently ASCII is \
               selected.  Otherwise we are switching within the \
               SO charset.  */ \
            if (set == ASCII_set) \
              { \
                if (__builtin_expect (outptr + 1 > outend, 0)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SO; \
              } \
 \
            /* Record the switch right away so that SO is not repeated \
               if the character itself does not fit any more.  */ \
            set = used; \
 \
            if (__builtin_expect (outptr + 2 > outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
 \
        *outptr++ = buf[0]; \
        *outptr++ = buf[1]; \
      } \
 \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
/* The loop function receives SETP as an extra argument.  On entry the
   state is split into the selected set (SET) and the announcement bits
   (ANN); UPDATE_PARAMS stores the recombined value back through SETP.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS , int *setp
#define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
                    int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS *setp = set | ann
415 #include <iconv/loop.c>
418 /* Now define the toplevel functions. */
419 #include <iconv/skeleton.c>