Update.
[glibc.git] / iconvdata / iso-2022-cn.c
blob5ffbfa565e0ab0a9ef66625f02828fc72c0ab2ac
1 /* Conversion module for ISO-2022-CN.
2 Copyright (C) 1999, 2000 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
21 #include <gconv.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include "gb2312.h"
25 #include "cns11643l1.h"
26 #include "cns11643l2.h"
28 #include <assert.h>
30 /* This makes obvious what everybody knows: 0x1b is the Esc character. */
31 #define ESC 0x1b
33 /* We have single-byte shift-in and shift-out sequences, and the single
34 shift sequence SS2, which selects the SS2-designated set for the next
35 two bytes only.
   SI makes the decoder switch back to ASCII, SO makes it switch to the
   two-byte set announced for SO (GB2312 or CNS 11643 plane 1), and the
   single shift is the byte pair SS2_0 SS2_1, i.e. ESC followed by
   0x4e ('N'), in front of one CNS 11643 plane 2 character. */
36 #define SI 0x0f
37 #define SO 0x0e
38 #define SS2_0 ESC
39 #define SS2_1 0x4e
41 /* Definitions used in the body of the `gconv' function.
   The *_FROM values describe the ISO-2022-CN side (between 1 and 4
   bytes are needed per step), the *_TO values describe the other side,
   which is read with get32 and written with put32 in 4-byte units.
   NOTE(review): these macros are presumably consumed by the included
   <iconv/skeleton.c>, which is not visible here -- confirm there. */
42 #define CHARSET_NAME "ISO-2022-CN//"
43 #define DEFINE_INIT 1
44 #define DEFINE_FINI 1
45 #define FROM_LOOP from_iso2022cn_loop
46 #define TO_LOOP to_iso2022cn_loop
47 #define MIN_NEEDED_FROM 1
48 #define MAX_NEEDED_FROM 4
49 #define MIN_NEEDED_TO 4
50 #define MAX_NEEDED_TO 4
/* PREPARE_LOOP declares the variables shared with the conversion loops:
   `save_set' is the scratch slot used by SAVE_RESET_STATE, and `setp'
   points at the __count member of the conversion state.
   EXTRA_LOOP_ARGS hands `setp' to both loop functions; it matches the
   `int *setp' parameter added through EXTRA_LOOP_DECLS. */
51 #define PREPARE_LOOP \
52 int save_set; \
53 int *setp = &data->__statep->__count;
54 #define EXTRA_LOOP_ARGS , setp
57 /* The COUNT element of the state keeps track of the currently selected
58 character set and of the announcements made so far.  Both loops store
   `set | ann' in it, so the two groups of values below use disjoint
   bits.  The possible values are: */
59 enum
/* Currently selected character set; exactly one of these values is
   stored in the bits covered by CURRENT_SEL_MASK. */
61 ASCII_set = 0,
62 GB2312_set = 8,
63 CNS11643_1_set = 16,
64 CNS11643_2_set = 24,
65 CURRENT_SEL_MASK = 24,
/* Announcement flags, one bit per character set, covered by
   CURRENT_ANN_MASK.  Each flag equals 16 << (corresponding *_set >> 3). */
66 GB2312_ann = 32,
67 CNS11643_1_ann = 64,
68 CNS11643_2_ann = 128,
69 CURRENT_ANN_MASK = 224
73 /* Since this is a stateful encoding we have to provide code which resets
74 the output state to the initial state. This has to be done during the
75 flushing.
   When decoding (FROM_DIRECTION) only the stored state is cleared.  When
   encoding, a single SI byte is appended to the output buffer, or
   `status' is set to __GCONV_FULL_OUTPUT if the buffer has no room, in
   which case the state is left untouched.
   NOTE(review): the test below compares the whole __count value, which
   also holds the announcement bits, against ASCII_set.  An SI is
   therefore written even when ASCII is already selected but an
   announcement flag is still set -- confirm whether masking with
   CURRENT_SEL_MASK was intended. */
76 #define EMIT_SHIFT_TO_INIT \
77 if (data->__statep->__count != ASCII_set) \
78 { \
79 if (FROM_DIRECTION) \
80 /* It's easy, we don't have to emit anything, we just reset the \
81 state for the input. */ \
82 data->__statep->__count = ASCII_set; \
83 else \
84 { \
85 unsigned char *outbuf = data->__outbuf; \
87 /* We are not in the initial state. To switch back we have \
88 to emit `SI'. */ \
89 if (outbuf == data->__outbufend) \
90 /* We don't have enough room in the output buffer. */ \
91 status = __GCONV_FULL_OUTPUT; \
92 else \
93 { \
94 /* Write out the shift sequence.  The byte is added to *written \
   only when data->__is_last is set; afterwards the new buffer \
   position and the initial state are stored back. */ \
95 *outbuf++ = SI; \
96 if (data->__is_last) \
97 *written += 1; \
98 data->__outbuf = outbuf; \
99 data->__statep->__count = ASCII_set; \
105 /* Since we might have to reset the input pointer we must be able to save
106 and restore the state.
   With a nonzero `Save' the current state word *setp is copied into
   `save_set'; otherwise the saved value is written back to *setp.
   Both variables are declared by PREPARE_LOOP. */
107 #define SAVE_RESET_STATE(Save) \
108 if (Save) \
109 save_set = *setp; \
110 else \
111 *setp = save_set
114 /* First define the conversion function from ISO-2022-CN to UCS4. */
115 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
116 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
117 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
118 #define LOOPFCT FROM_LOOP
119 #define BODY \
121 uint32_t ch = *inptr; \
123 /* This is a 7bit character set, disallow all 8bit characters. */ \
124 if (ch > 0x7f) \
126 result = __GCONV_ILLEGAL_INPUT; \
127 break; \
130 /* Recognize escape sequences. */ \
131 if (ch == ESC) \
133 /* There are two kinds of escape sequences we have to handle: \
134 - those announcing the use of GB and CNS characters on the \
135 line; we can simply ignore them \
136 - the initial byte of the SS2 sequence. \
137 */ \
138 if (NEED_LENGTH_TEST \
139 && (inptr + 1 > inend \
140 || (inptr[1] == '$' \
141 && (inptr + 2 > inend \
142 || (inptr[2] == ')' && inptr + 3 > inend) \
143 || (inptr[2] == '*' && inptr + 3 > inend))) \
144 || (inptr[1] == SS2_1 && inptr + 3 > inend))) \
146 result = __GCONV_EMPTY_INPUT; \
147 break; \
149 if (inptr[1] == '$' \
150 && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
151 || (inptr[2] == '*' && inptr[3] == 'H'))) \
153 /* OK, we accept those character sets. */ \
154 if (inptr[3] == 'A') \
155 ann = GB2312_ann; \
156 else if (inptr[3] == 'G') \
157 ann = CNS11643_1_ann; \
158 inptr += 4; \
159 continue; \
162 else if (ch == SO) \
164 /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
165 S0 designation came last. The only problem is what to do with \
166 faulty input files where no designator came. \
167 XXX For now I'll default to use GB2312. If this is not the \
168 best behaviour (e.g., we should flag an error) let me know. */ \
169 ++inptr; \
170 set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
171 continue; \
173 else if (ch == SI) \
175 /* Switch to use ASCII. */ \
176 ++inptr; \
177 set = ASCII_set; \
178 continue; \
181 if (ch == ESC && inptr[1] == SS2_1) \
183 /* This is a character from CNS 11643 plane 2. \
184 XXX We could test here whether the use of this character \
185 set was announced. */ \
186 inptr += 2; \
187 ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
188 if (ch == __UNKNOWN_10646_CHAR) \
190 inptr -= 2; \
191 result = __GCONV_ILLEGAL_INPUT; \
192 break; \
195 else if (set == ASCII_set) \
197 /* Almost done, just advance the input pointer. */ \
198 ++inptr; \
200 else \
202 /* That's pretty easy, we have a dedicated functions for this. */ \
203 if (set == GB2312_set) \
204 ch = gb2312_to_ucs4 (&inptr, \
205 NEED_LENGTH_TEST ? inend - inptr : 2, 0); \
206 else \
208 assert (set == CNS11643_1_set); \
209 ch = cns11643l1_to_ucs4 (&inptr, \
210 NEED_LENGTH_TEST ? inend - inptr : 2, 0);\
213 if (NEED_LENGTH_TEST && ch == 0) \
215 result = __GCONV_EMPTY_INPUT; \
216 break; \
218 else if (ch == __UNKNOWN_10646_CHAR) \
220 result = __GCONV_ILLEGAL_INPUT; \
221 break; \
225 put32 (outptr, ch); \
226 outptr += 4; \
228 #define EXTRA_LOOP_DECLS , int *setp
229 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
230 int ann = *setp & CURRENT_ANN_MASK
231 #define UPDATE_PARAMS *setp = set | ann
232 #include <iconv/loop.c>
235 /* Next, define the other direction. */
236 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
237 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
238 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
239 #define LOOPFCT TO_LOOP
240 #define BODY \
242 uint32_t ch; \
243 size_t written = 0; \
245 ch = get32 (inptr); \
247 /* First see whether we can write the character using the currently \
248 selected character set. */ \
249 if (ch < 0x80) \
251 if (set != ASCII_set) \
253 *outptr++ = SI; \
254 set = ASCII_set; \
255 if (NEED_LENGTH_TEST && outptr == outend) \
257 result = __GCONV_FULL_OUTPUT; \
258 break; \
262 *outptr++ = ch; \
263 written = 1; \
265 /* At the end of the line we have to clear the `ann' flags since \
266 every line must contain this information again. */ \
267 if (ch == L'\n') \
268 ann = 0; \
270 else \
272 char buf[2]; \
273 int used; \
275 if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
277 written = ucs4_to_gb2312 (ch, buf, 2); \
278 used = GB2312_set; \
280 else \
282 written = ucs4_to_cns11643l1 (ch, buf, 2); \
283 used = CNS11643_1_set; \
286 if (written == __UNKNOWN_10646_CHAR) \
288 /* Cannot convert it using the currently selected SO set. \
289 Next try the SS2 set. */ \
290 written = ucs4_to_cns11643l2 (ch, buf, 2); \
291 if (written != __UNKNOWN_10646_CHAR) \
292 /* Yep, that worked. */ \
293 used = CNS11643_2_set; \
294 else \
296 /* Well, see whether we have to change the SO set. */ \
297 if (set == GB2312_set) \
298 written = ucs4_to_cns11643l1 (ch, buf, 2); \
299 else \
300 written = ucs4_to_gb2312 (ch, buf, 2); \
302 if (written != __UNKNOWN_10646_CHAR) \
303 /* Oh well, then switch SO. */ \
304 used = GB2312_set + CNS11643_1_set - set; \
305 else \
307 /* Even this does not work. Error. */ \
308 result = __GCONV_ILLEGAL_INPUT; \
309 break; \
313 assert (written == 2); \
315 /* See whether we have to emit an escape sequence. */ \
316 if (set != used) \
318 /* First see whether we announced that we use this \
319 character set. */ \
320 if ((ann & (2 << used)) == 0) \
322 const char *escseq; \
324 if (NEED_LENGTH_TEST && outptr + 4 > outend) \
326 result = __GCONV_FULL_OUTPUT; \
327 break; \
330 assert (used >= 1 && used <= 3); \
331 escseq = "\e$)A\e$)G\e$*H" + (used - 1) * 4; \
332 *outptr++ = *escseq++; \
333 *outptr++ = *escseq++; \
334 *outptr++ = *escseq++; \
335 *outptr++ = *escseq++; \
337 if (used == GB2312_set) \
338 ann = (ann & CNS11643_2_ann) | GB2312_ann; \
339 else if (used == CNS11643_1_set) \
340 ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
341 else \
342 ann |= CNS11643_2_ann; \
345 if (used == CNS11643_2_set) \
347 if (outptr + 2 > outend) \
349 result = __GCONV_FULL_OUTPUT; \
350 break; \
352 *outptr++ = SS2_0; \
353 *outptr++ = SS2_1; \
355 else \
357 /* We only have to emit something is currently ASCII is \
358 selected. Otherwise we are switching within the \
359 SO charset. */ \
360 if (set == ASCII_set) \
362 if (outptr + 1 > outend) \
364 result = __GCONV_FULL_OUTPUT; \
365 break; \
367 *outptr++ = SO; \
371 /* Always test the length here since we have used up all the \
372 guaranteed output buffer slots. */ \
373 if (outptr + 2 > outend) \
375 result = __GCONV_FULL_OUTPUT; \
376 break; \
379 else if (NEED_LENGTH_TEST && outptr + 2 > outend) \
381 result = __GCONV_FULL_OUTPUT; \
382 break; \
385 *outptr++ = buf[0]; \
386 *outptr++ = buf[1]; \
389 /* Now that we wrote the output increment the input pointer. */ \
390 inptr += 4; \
392 #define EXTRA_LOOP_DECLS , int *setp
393 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
394 int ann = *setp & CURRENT_ANN_MASK
395 #define UPDATE_PARAMS *setp = set | ann
396 #include <iconv/loop.c>
399 /* Now define the toplevel functions. */
400 #include <iconv/skeleton.c>