1 /* Conversion module for ISO-2022-CN.
2 Copyright (C) 1999-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
24 #include "cns11643l1.h"
25 #include "cns11643l2.h"
29 /* This makes obvious what everybody knows: 0x1b is the Esc character. */
32 /* We have single-byte shift-in and shift-out sequences, and the single
33 shift sequence SS2 which replaces the SS2 designation for the next
40 /* Definitions used in the body of the `gconv' function. */
/* Identifier string of the character set this module converts. */
41 #define CHARSET_NAME "ISO-2022-CN//"
/* Names of the two conversion loops.  <iconv/loop.c> is included twice
   further down, once with LOOPFCT set to each of these names. */
44 #define FROM_LOOP from_iso2022cn_loop
45 #define TO_LOOP to_iso2022cn_loop
46 #define ONE_DIRECTION 0
/* Per-direction byte bounds.  They are handed on further down as
   MIN_NEEDED_INPUT, MAX_NEEDED_INPUT, MIN_NEEDED_OUTPUT and
   MAX_NEEDED_OUTPUT right before each inclusion of <iconv/loop.c>. */
47 #define FROM_LOOP_MIN_NEEDED_FROM 1
48 #define FROM_LOOP_MAX_NEEDED_FROM 4
49 #define FROM_LOOP_MIN_NEEDED_TO 4
50 #define FROM_LOOP_MAX_NEEDED_TO 4
51 #define TO_LOOP_MIN_NEEDED_FROM 4
52 #define TO_LOOP_MAX_NEEDED_FROM 4
53 #define TO_LOOP_MIN_NEEDED_TO 1
54 #define TO_LOOP_MAX_NEEDED_TO 6
/* Hand a pointer to the `__count' member of the conversion state to the
   loops as the extra argument SETP (both loops declare `int *setp'). */
55 #define PREPARE_LOOP \
57 int *setp = &data->__statep->__count;
58 #define EXTRA_LOOP_ARGS , setp
61 /* The COUNT element of the state keeps track of the currently selected
62 character set. The possible values are: */
/* Mask for the bits of the state word that hold the currently selected
   set: 24 covers bits 3 and 4.  The loops apply it as
   `*setp & CURRENT_SEL_MASK' to obtain SET. */
69 CURRENT_SEL_MASK
= 24,
/* Mask for the bits of the state word that hold the announcement flags:
   224 covers bits 5 to 7.  The loops apply it as
   `*setp & CURRENT_ANN_MASK' to obtain ANN. */
73 CURRENT_ANN_MASK
= 224
/* EMIT_SHIFT_TO_INIT brings the conversion state back to ASCII_set.  As far
   as the visible fragment shows, it only acts when `__count' differs from
   ASCII_set: one branch merely resets `__count' to ASCII_set, the other
   writes a shift sequence before resetting it and reports
   __GCONV_FULL_OUTPUT when OUTBUF has already reached OUTEND.
   NOTE(review): several continuation lines of this macro are missing from
   this copy, so the exact branch structure cannot be confirmed here. */
77 /* Since this is a stateful encoding we have to provide code which resets
78 the output state to the initial state. This has to be done during the
80 #define EMIT_SHIFT_TO_INIT \
81 if (data->__statep->__count != ASCII_set) \
84 /* It's easy, we don't have to emit anything, we just reset the \
85 state for the input. */ \
86 data->__statep->__count = ASCII_set; \
89 /* We are not in the initial state. To switch back we have \
91 if (__glibc_unlikely (outbuf == outend)) \
92 /* We don't have enough room in the output buffer. */ \
93 status = __GCONV_FULL_OUTPUT; \
96 /* Write out the shift sequence. */ \
98 data->__statep->__count = ASCII_set; \
104 /* Since we might have to reset the input pointer we must be able to save
105 and restore the state. */
106 #define SAVE_RESET_STATE(Save) \
113 /* First define the conversion function from ISO-2022-CN to UCS4.
   Each step looks at one input byte CH:
   - bytes >= 0x7f are rejected through STANDARD_FROM_LOOP_ERR_HANDLER
     (note that the comparison also rejects 0x7f itself);
   - ESC introduces either a designation (ESC $ ) A, ESC $ ) G and
     ESC $ * H are the ones accepted) or the SS2 sequence; when the input
     ends before the sequence is complete, __GCONV_INCOMPLETE_INPUT is
     reported;
   - SO selects CNS 11643 plane 1 when ANN equals CNS11643_1_ann and GB2312
     otherwise; SI selects ASCII;
   - ESC followed by SS2_1 is decoded with cns11643l2_to_ucs4; other bytes
     are decoded with gb2312_to_ucs4 or cns11643l1_to_ucs4 according to
     SET, or only advance the input when SET is ASCII_set.
   The resulting UCS4 value is stored with put32.  SET and ANN are unpacked
   from the state word by INIT_PARAMS and written back by UPDATE_PARAMS.
   NOTE(review): a number of continuation lines of the loop body are
   missing from this copy. */
114 #define MIN_NEEDED_INPUT FROM_LOOP_MIN_NEEDED_FROM
115 #define MAX_NEEDED_INPUT FROM_LOOP_MAX_NEEDED_FROM
116 #define MIN_NEEDED_OUTPUT FROM_LOOP_MIN_NEEDED_TO
117 #define MAX_NEEDED_OUTPUT FROM_LOOP_MAX_NEEDED_TO
118 #define LOOPFCT FROM_LOOP
121 uint32_t ch = *inptr; \
123 /* This is a 7bit character set, disallow all 8bit characters. */ \
124 if (__glibc_unlikely (ch >= 0x7f)) \
125 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
127 /* Recognize escape sequences. */ \
128 if (__builtin_expect (ch, 0) == ESC) \
130 /* There are two kinds of escape sequences we have to handle: \
131 - those announcing the use of GB and CNS characters on the \
132 line; we can simply ignore them \
133 - the initial byte of the SS2 sequence. \
135 if (__builtin_expect (inptr + 2 > inend, 0) \
136 || (inptr[1] == '$' \
137 && (__builtin_expect (inptr + 3 > inend, 0) \
138 || (inptr[2] == ')' \
139 && __builtin_expect (inptr + 4 > inend, 0)) \
140 || (inptr[2] == '*' \
141 && __builtin_expect (inptr + 4 > inend, 0)))) \
142 || (inptr[1] == SS2_1 \
143 && __builtin_expect (inptr + 4 > inend, 0))) \
145 result = __GCONV_INCOMPLETE_INPUT; \
148 if (inptr[1] == '$' \
149 && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
150 || (inptr[2] == '*' && inptr[3] == 'H'))) \
152 /* OK, we accept those character sets. */ \
153 if (inptr[3] == 'A') \
155 else if (inptr[3] == 'G') \
156 ann = CNS11643_1_ann; \
161 else if (__builtin_expect (ch, 0) == SO) \
163 /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
164 SO designation came last. The only problem is what to do with \
165 faulty input files where no designator came. \
166 XXX For now I'll default to use GB2312. If this is not the \
167 best behaviour (e.g., we should flag an error) let me know. */ \
169 set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
172 else if (__builtin_expect (ch, 0) == SI) \
174 /* Switch to use ASCII. */ \
180 if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1) \
182 /* This is a character from CNS 11643 plane 2. \
183 XXX We could test here whether the use of this character \
184 set was announced. */ \
186 ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
187 if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR) \
190 STANDARD_FROM_LOOP_ERR_HANDLER (2); \
193 else if (set == ASCII_set) \
195 /* Almost done, just advance the input pointer. */ \
200 /* That's pretty easy, we have dedicated functions for this. */ \
201 if (set == GB2312_set) \
202 ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
205 assert (set == CNS11643_1_set); \
206 ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
209 if (__builtin_expect (ch, 1) == 0) \
211 result = __GCONV_INCOMPLETE_INPUT; \
214 else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR) \
216 STANDARD_FROM_LOOP_ERR_HANDLER (1); \
220 put32 (outptr, ch); \
223 #define LOOP_NEED_FLAGS
224 #define EXTRA_LOOP_DECLS , int *setp
225 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
226 int ann = *setp & CURRENT_ANN_MASK
227 #define UPDATE_PARAMS *setp = set | ann
228 #include <iconv/loop.c>
231 /* Next, define the other direction: UCS4 to ISO-2022-CN.
   Each step reads one UCS4 value with get32.  On the path that fills the
   two-byte buffer BUF the code first tries ucs4_to_gb2312 (when GB2312 is
   the selected set or CNS 11643 plane 1 has not been announced) or
   ucs4_to_cns11643l1, then ucs4_to_cns11643l2, then the other SO set; if
   none of them can represent the character it is passed to
   UNICODE_TAG_HANDLER and STANDARD_TO_LOOP_ERR_HANDLER.
   When the chosen set has not been announced yet, its designation is
   taken from the string ")A)G*H" and ANN is updated: announcing GB2312 or
   CNS 11643 plane 1 replaces the other of the two while keeping
   CNS11643_2_ann, announcing plane 2 just adds CNS11643_2_ann.
   Every output step first checks the remaining space and reports
   __GCONV_FULL_OUTPUT when it is insufficient.  SET and ANN are unpacked
   from the state word by INIT_PARAMS, reloaded by REINIT_PARAMS and
   written back by UPDATE_PARAMS.
   NOTE(review): the conditions selecting between the single-byte and the
   two-byte path, and several other continuation lines, are missing from
   this copy. */
232 #define MIN_NEEDED_INPUT TO_LOOP_MIN_NEEDED_FROM
233 #define MAX_NEEDED_INPUT TO_LOOP_MAX_NEEDED_FROM
234 #define MIN_NEEDED_OUTPUT TO_LOOP_MIN_NEEDED_TO
235 #define MAX_NEEDED_OUTPUT TO_LOOP_MAX_NEEDED_TO
236 #define LOOPFCT TO_LOOP
239 uint32_t ch = get32 (inptr); \
241 /* First see whether we can write the character using the currently \
242 selected character set. */ \
245 if (set != ASCII_set) \
249 if (__glibc_unlikely (outptr == outend)) \
251 result = __GCONV_FULL_OUTPUT; \
258 /* At the end of the line we have to clear the `ann' flags since \
259 every line must contain this information again. */ \
265 unsigned char buf[2]; \
266 /* Fake initialization to keep gcc quiet. */ \
267 asm ("" : "=m" (buf)); \
270 size_t written = 0; \
272 if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
274 written = ucs4_to_gb2312 (ch, buf, 2); \
279 written = ucs4_to_cns11643l1 (ch, buf, 2); \
280 used = CNS11643_1_set; \
283 if (written == __UNKNOWN_10646_CHAR) \
285 /* Cannot convert it using the currently selected SO set. \
286 Next try the SS2 set. */ \
287 written = ucs4_to_cns11643l2 (ch, buf, 2); \
288 if (written != __UNKNOWN_10646_CHAR) \
289 /* Yep, that worked. */ \
290 used = CNS11643_2_set; \
293 /* Well, see whether we have to change the SO set. */ \
294 if (used == GB2312_set) \
295 written = ucs4_to_cns11643l1 (ch, buf, 2); \
297 written = ucs4_to_gb2312 (ch, buf, 2); \
299 if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR) \
300 /* Oh well, then switch SO. */ \
301 used = GB2312_set + CNS11643_1_set - used; \
304 UNICODE_TAG_HANDLER (ch, 4); \
306 /* Even this does not work. Error. */ \
307 STANDARD_TO_LOOP_ERR_HANDLER (4); \
311 assert (written == 2); \
313 /* See whether we have to emit an escape sequence. */ \
316 /* First see whether we announced that we use this \
318 if ((ann & (16 << (used >> 3))) == 0) \
320 const char *escseq; \
322 if (__glibc_unlikely (outptr + 4 > outend)) \
324 result = __GCONV_FULL_OUTPUT; \
328 assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
329 escseq = ")A)G*H" + ((used >> 3) - 1) * 2; \
332 *outptr++ = *escseq++; \
333 *outptr++ = *escseq++; \
335 if (used == GB2312_set) \
336 ann = (ann & CNS11643_2_ann) | GB2312_ann; \
337 else if (used == CNS11643_1_set) \
338 ann = (ann & CNS11643_2_ann) | CNS11643_1_ann; \
340 ann |= CNS11643_2_ann; \
343 if (used == CNS11643_2_set) \
345 if (__glibc_unlikely (outptr + 2 > outend)) \
347 result = __GCONV_FULL_OUTPUT; \
355 /* We only have to emit something if currently ASCII is \
356 selected. Otherwise we are switching within the \
358 if (set == ASCII_set) \
360 if (__glibc_unlikely (outptr + 1 > outend)) \
362 result = __GCONV_FULL_OUTPUT; \
369 /* Always test the length here since we have used up all the \
370 guaranteed output buffer slots. */ \
371 if (__glibc_unlikely (outptr + 2 > outend)) \
373 result = __GCONV_FULL_OUTPUT; \
377 else if (__glibc_unlikely (outptr + 2 > outend)) \
379 result = __GCONV_FULL_OUTPUT; \
383 *outptr++ = buf[0]; \
384 *outptr++ = buf[1]; \
388 /* Now that we wrote the output increment the input pointer. */ \
391 #define LOOP_NEED_FLAGS
392 #define EXTRA_LOOP_DECLS , int *setp
393 #define INIT_PARAMS int set = *setp & CURRENT_SEL_MASK; \
394 int ann = *setp & CURRENT_ANN_MASK
395 #define REINIT_PARAMS do \
397 set = *setp & CURRENT_SEL_MASK; \
398 ann = *setp & CURRENT_ANN_MASK; \
401 #define UPDATE_PARAMS *setp = set | ann
402 #include <iconv/loop.c>
405 /* Now define the toplevel functions. */
406 #include <iconv/skeleton.c>