2000-05-02 Jeff Sturm <jsturm@one-point.com>
[official-gcc.git] / gcc / mbchar.c
blobf5f7beca4c60dace2db1f3d2d81e4a25c85c50e7
1 /* Multibyte Character Functions.
2 Copyright (C) 1998 Free Software Foundation, Inc.
4 This file is part of GNU CC.
6 GNU CC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU CC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU CC; see the file COPYING. If not, write to
18 the Free Software Foundation, 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
21 /* Note regarding cross compilation:
23 In general, translation of multibyte characters to wide characters can
24 only work in a native compiler since the translation function (mbtowc)
25 needs to know about both the source and target character encoding. However,
26 this particular implementation for JIS, SJIS and EUCJP source characters
27 will work for any compiler with a newlib target. Other targets may also
28 work provided that their wchar_t implementation is 2 bytes and the encoding
29 leaves the source character values unchanged (except for removing the
30 state shifting markers). */
32 #include "config.h"
33 #ifdef MULTIBYTE_CHARS
34 #include "system.h"
35 #include "mbchar.h"
36 #include <locale.h>
/* Class of an input byte; selects the column of the tables below.
   local_mbtowc assigns the class in its C-JIS branch.  */
typedef enum
{
  ESCAPE,	/* JIS_ESC_CHAR */
  DOLLAR,	/* '$' */
  BRACKET,	/* '(' */
  AT,		/* '@' */
  B,		/* 'B' */
  J,		/* 'J' */
  NUL,		/* '\0' */
  JIS_CHAR,	/* any other byte accepted by ISJIS */
  OTHER,	/* everything else */
  JIS_C_NUM	/* number of classes (table width) */
} JIS_CHAR_TYPE;

/* State of the C-JIS scanner; selects the row of the tables below.  */
typedef enum
{
  ASCII,	/* single-byte mode */
  A_ESC,	/* single-byte mode, ESC seen */
  A_ESC_DL,	/* single-byte mode, ESC '$' seen */
  JIS,		/* two-byte mode, at a character boundary */
  JIS_1,	/* two-byte mode, first byte of a character read */
  JIS_2,	/* two-byte mode, both bytes read, looking at the next byte */
  J_ESC,	/* two-byte mode at a boundary, ESC seen */
  J_ESC_BR,	/* two-byte mode at a boundary, ESC '(' seen */
  J2_ESC,	/* after a complete two-byte character, ESC seen */
  J2_ESC_BR,	/* after a complete two-byte character, ESC '(' seen */
  INV,		/* invalid; only ever paired with the ERROR action */
  JIS_S_NUM	/* number of states (table height) */
} JIS_STATE;

/* What local_mbtowc does after classifying a byte.  */
typedef enum
{
  COPYA,	/* store one byte taken from the character start; done */
  COPYJ,	/* store a two-byte character, consuming the trailing
		   switch to single-byte mode as well; done */
  COPYJ2,	/* store a two-byte character without consuming the byte
		   that follows it; done */
  MAKE_A,	/* switched to single-byte mode: the character starts
		   after this byte */
  MAKE_J,	/* switched to two-byte mode: the character starts after
		   this byte */
  NOOP,		/* nothing to do yet, keep scanning */
  EMPTY,	/* null byte: store a null wide character; done */
  ERROR		/* invalid sequence */
} JIS_ACTION;

/* State/action tables for processing JIS encoding:

   Where possible, a switch to JIS is grouped with the JIS character
   that follows it, and a switch to ASCII is grouped with the JIS
   character that precedes it.  Both switches are three bytes long
   (ESC '$' then '@' or 'B'; ESC '(' then 'B' or 'J'), so for input
   without redundant switches the longest unit consumed by one call is:
   3 (switch to JIS) + 2 (JIS character) + 3 (switch back to ASCII) = 8,
   which is the value local_mb_cur_max reports for "C-JIS".

   Neither table spells out a row for INV, so that row is
   zero-initialized.  INV is only entered together with the ERROR
   action, which makes local_mbtowc return without saving the state.  */

static const JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
/*                ESCAPE  DOLLAR    BRACKET    AT     B      J      NUL    JIS_CHAR OTHER */
/* ASCII     */ { A_ESC,  ASCII,    ASCII,     ASCII, ASCII, ASCII, ASCII, ASCII,   ASCII },
/* A_ESC     */ { ASCII,  A_ESC_DL, ASCII,     ASCII, ASCII, ASCII, ASCII, ASCII,   ASCII },
/* A_ESC_DL  */ { ASCII,  ASCII,    ASCII,     JIS,   JIS,   ASCII, ASCII, ASCII,   ASCII },
/* JIS       */ { J_ESC,  JIS_1,    JIS_1,     JIS_1, JIS_1, JIS_1, INV,   JIS_1,   INV   },
/* JIS_1     */ { INV,    JIS_2,    JIS_2,     JIS_2, JIS_2, JIS_2, INV,   JIS_2,   INV   },
/* JIS_2     */ { J2_ESC, JIS,      JIS,       JIS,   JIS,   JIS,   INV,   JIS,     JIS   },
/* J_ESC     */ { INV,    INV,      J_ESC_BR,  INV,   INV,   INV,   INV,   INV,     INV   },
/* J_ESC_BR  */ { INV,    INV,      INV,       INV,   ASCII, ASCII, INV,   INV,     INV   },
/* J2_ESC    */ { INV,    INV,      J2_ESC_BR, INV,   INV,   INV,   INV,   INV,     INV   },
/* J2_ESC_BR */ { INV,    INV,      INV,       INV,   ASCII, ASCII, INV,   INV,     INV   },
};

static const JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
/*                ESCAPE DOLLAR  BRACKET AT      B       J       NUL    JIS_CHAR OTHER */
/* ASCII     */ { NOOP,  COPYA,  COPYA,  COPYA,  COPYA,  COPYA,  EMPTY, COPYA,   COPYA  },
/* A_ESC     */ { COPYA, NOOP,   COPYA,  COPYA,  COPYA,  COPYA,  COPYA, COPYA,   COPYA  },
/* A_ESC_DL  */ { COPYA, COPYA,  COPYA,  MAKE_J, MAKE_J, COPYA,  COPYA, COPYA,   COPYA  },
/* JIS       */ { NOOP,  NOOP,   NOOP,   NOOP,   NOOP,   NOOP,   ERROR, NOOP,    ERROR  },
/* JIS_1     */ { ERROR, NOOP,   NOOP,   NOOP,   NOOP,   NOOP,   ERROR, NOOP,    ERROR  },
/* JIS_2     */ { NOOP,  COPYJ2, COPYJ2, COPYJ2, COPYJ2, COPYJ2, ERROR, COPYJ2,  COPYJ2 },
/* J_ESC     */ { ERROR, ERROR,  NOOP,   ERROR,  ERROR,  ERROR,  ERROR, ERROR,   ERROR  },
  /* A switch to ASCII with no two-byte character pending must move the
     character start past the switch (MAKE_A); otherwise the next COPYA
     would store the ESC byte that began it.  */
/* J_ESC_BR  */ { ERROR, ERROR,  ERROR,  ERROR,  MAKE_A, MAKE_A, ERROR, ERROR,   ERROR  },
/* J2_ESC    */ { ERROR, ERROR,  NOOP,   ERROR,  ERROR,  ERROR,  ERROR, ERROR,   ERROR  },
/* J2_ESC_BR */ { ERROR, ERROR,  ERROR,  ERROR,  COPYJ,  COPYJ,  ERROR, ERROR,   ERROR  },
};
/* Name of the codeset used to interpret multibyte characters.
   local_mbtowc and local_mb_cur_max compare it against "C-SJIS",
   "C-EUCJP" and "C-JIS"; a null pointer, a name of at most one
   character, or any other name selects their default path.
   NOTE(review): nothing in this file assigns it -- presumably it is
   set by the option-handling code elsewhere; confirm.  */
83 const char *literal_codeset = NULL;
85 /* Store into *PWC (if PWC is not null) the wide character
86 corresponding to the multibyte character at the start of the
87 buffer S of size N. Return the number of bytes in the multibyte
88 character. Return -1 if the bytes do not form a valid character,
89 or 0 if S is null or points to a null byte.
91 This function behaves like the Standard C function mbtowc, except
92 it treats locale names of the form "C-..." specially. */
94 int
95 local_mbtowc (pwc, s, n)
96 wchar_t *pwc;
97 const char *s;
98 size_t n;
100 static JIS_STATE save_state = ASCII;
101 JIS_STATE curr_state = save_state;
102 const unsigned char *t = (const unsigned char *) s;
104 if (s != NULL && n == 0)
105 return -1;
107 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
108 /* This must be the "C" locale or unknown locale -- fall thru */
110 else if (! strcmp (literal_codeset, "C-SJIS"))
112 int char1;
113 if (s == NULL)
114 /* Not state-dependent. */
115 return 0;
117 char1 = *t;
118 if (ISSJIS1 (char1))
120 int char2 = t[1];
122 if (n <= 1)
123 return -1;
125 if (ISSJIS2 (char2))
127 if (pwc != NULL)
128 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
129 return 2;
132 return -1;
135 if (pwc != NULL)
136 *pwc = (wchar_t) *t;
138 if (*t == '\0')
139 return 0;
141 return 1;
143 else if (! strcmp (literal_codeset, "C-EUCJP"))
145 int char1;
147 if (s == NULL)
148 /* Not state-dependent. */
149 return 0;
151 char1 = *t;
152 if (ISEUCJP (char1))
154 int char2 = t[1];
156 if (n <= 1)
157 return -1;
159 if (ISEUCJP (char2))
161 if (pwc != NULL)
162 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
163 return 2;
166 return -1;
169 if (pwc != NULL)
170 *pwc = (wchar_t) *t;
172 if (*t == '\0')
173 return 0;
175 return 1;
177 else if (! strcmp (literal_codeset, "C-JIS"))
179 JIS_ACTION action;
180 JIS_CHAR_TYPE ch;
181 const unsigned char *ptr;
182 size_t i, curr_ch;
184 if (s == NULL)
186 save_state = ASCII;
187 /* State-dependent. */
188 return 1;
191 ptr = t;
193 for (i = 0; i < n; i++)
195 curr_ch = t[i];
196 switch (curr_ch)
198 case JIS_ESC_CHAR:
199 ch = ESCAPE;
200 break;
201 case '$':
202 ch = DOLLAR;
203 break;
204 case '@':
205 ch = AT;
206 break;
207 case '(':
208 ch = BRACKET;
209 break;
210 case 'B':
211 ch = B;
212 break;
213 case 'J':
214 ch = J;
215 break;
216 case '\0':
217 ch = NUL;
218 break;
219 default:
220 if (ISJIS (curr_ch))
221 ch = JIS_CHAR;
222 else
223 ch = OTHER;
226 action = JIS_action_table[curr_state][ch];
227 curr_state = JIS_state_table[curr_state][ch];
229 switch (action)
231 case NOOP:
232 break;
234 case EMPTY:
235 if (pwc != NULL)
236 *pwc = (wchar_t) 0;
238 save_state = curr_state;
239 return i;
241 case COPYA:
242 if (pwc != NULL)
243 *pwc = (wchar_t) *ptr;
244 save_state = curr_state;
245 return i + 1;
247 case COPYJ:
248 if (pwc != NULL)
249 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
251 save_state = curr_state;
252 return i + 1;
254 case COPYJ2:
255 if (pwc != NULL)
256 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
258 save_state = curr_state;
259 return ptr - t + 2;
261 case MAKE_A:
262 case MAKE_J:
263 ptr = (const unsigned char *) (t + i + 1);
264 break;
266 case ERROR:
267 default:
268 return -1;
272 /* More than n bytes needed. */
273 return -1;
276 #ifdef CROSS_COMPILE
277 if (s == NULL)
278 /* Not state-dependent. */
279 return 0;
281 if (pwc != NULL)
282 *pwc = *s;
283 return 1;
284 #else
286 /* This must be the "C" locale or unknown locale. */
287 return mbtowc (pwc, s, n);
288 #endif
/* Return the length in bytes of the multibyte character that starts
   the buffer S of size N, without converting it.  The result is
   whatever local_mbtowc returns for the same S and N when given no
   place to store the wide character.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  int length;

  /* A null destination makes local_mbtowc measure only.  */
  length = local_mbtowc (NULL, s, n);
  return length;
}
306 /* Return the maximum mumber of bytes in a multibyte character.
308 This function returns the same value as the Standard C macro MB_CUR_MAX,
309 except it treats locale names of the form "C-..." specially. */
312 local_mb_cur_max ()
314 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
316 else if (! strcmp (literal_codeset, "C-SJIS"))
317 return 2;
318 else if (! strcmp (literal_codeset, "C-EUCJP"))
319 return 2;
320 else if (! strcmp (literal_codeset, "C-JIS"))
321 return 8; /* 3 + 2 + 3 */
323 #ifdef CROSS_COMPILE
324 return 1;
325 #else
326 if (MB_CUR_MAX > 0)
327 return MB_CUR_MAX;
329 return 1; /* default */
330 #endif
332 #else /* MULTIBYTE_CHARS */
333 extern int dummy; /* silence 'ANSI C forbids an empty source file' warning */
334 #endif /* MULTIBYTE_CHARS */