* config/h8300/clzsi2.c: Remove.
[official-gcc.git] / gcc / mbchar.c
blob2767235457f0c49aaf108a8e079dd2cf3d1f6619
1 /* Multibyte Character Functions.
2 Copyright (C) 1998 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
21 /* Note regarding cross compilation:
23 In general, translation of multibyte characters to wide characters can
24 only work in a native compiler since the translation function (mbtowc)
25 needs to know about both the source and target character encoding. However,
26 this particular implementation for JIS, SJIS and EUCJP source characters
27 will work for any compiler with a newlib target. Other targets may also
28 work provided that their wchar_t implementation is 2 bytes and the encoding
29 leaves the source character values unchanged (except for removing the
30 state shifting markers). */
32 #include "config.h"
33 #ifdef MULTIBYTE_CHARS
34 #include "system.h"
35 #include "coretypes.h"
36 #include "tm.h"
37 #include "mbchar.h"
38 #include <locale.h>
/* Input classes used as the column index of the JIS state and action
   tables.  The order of the enumerators must match the table columns.  */
typedef enum
{
  ESCAPE,	/* JIS_ESC_CHAR */
  DOLLAR,	/* '$' */
  BRACKET,	/* '(' */
  AT,		/* '@' */
  B,		/* 'B' */
  J,		/* 'J' */
  NUL,		/* '\0' */
  JIS_CHAR,	/* Any other byte accepted by ISJIS.  */
  OTHER,	/* Any other byte rejected by ISJIS.  */
  JIS_C_NUM	/* Number of classes; second dimension of the tables.  */
} JIS_CHAR_TYPE;
/* Decoder states used as the row index of the JIS state and action
   tables.  The order of the enumerators must match the table rows.  */
typedef enum
{
  ASCII,	/* Single-byte mode; also the initial state.  */
  A_ESC,	/* In ASCII, an escape byte has been seen.  */
  A_ESC_DL,	/* In ASCII, escape followed by '$' has been seen.  */
  JIS,		/* Two-byte mode, between characters.  */
  JIS_1,	/* Two-byte mode, first byte of a character seen.  */
  JIS_2,	/* Two-byte mode, both bytes of a character seen.  */
  J_ESC,	/* In JIS, an escape byte has been seen.  */
  J_ESC_BR,	/* In JIS, escape followed by '(' has been seen.  */
  J2_ESC,	/* In JIS_2, an escape byte has been seen.  */
  J2_ESC_BR,	/* In JIS_2, escape followed by '(' has been seen.  */
  INV,		/* Invalid input was encountered.  */
  JIS_S_NUM	/* Number of states; first dimension of the tables.  */
} JIS_STATE;
/* Actions the JIS decoder performs after classifying one input byte.
   The order of the enumerators is significant: entries omitted from a
   table initializer are zero and therefore read as COPYA.  */
typedef enum
{
  COPYA,	/* Store one byte as the result; consume through this byte.  */
  COPYJ,	/* Store a two-byte result; consume through this byte.  */
  COPYJ2,	/* Store a two-byte result; leave this byte unconsumed.  */
  MAKE_A,	/* Restart the pending character after this byte.  */
  MAKE_J,	/* Restart the pending character after this byte.  */
  NOOP,		/* Nothing to do; keep scanning.  */
  EMPTY,	/* Store a null wide character and stop.  */
  ERROR		/* Reject the input.  */
} JIS_ACTION;
49 /* State/action tables for processing JIS encoding:
51 Where possible, switches to JIS are grouped with following JIS characters
52 and switches to ASCII are grouped with preceding JIS characters.
53 Thus, maximum returned length is:
54 3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8. */
56 static const JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
57 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/
58 /*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
59 /*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
60 /*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII},
61 /*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV },
62 /*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV },
63 /*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS },
64 /*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
65 /*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
66 /*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV },
67 /*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
70 static const JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
71 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */
72 /*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA},
73 /*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA},
74 /*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
75 /*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
76 /*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR},
77 /*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
78 /*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
79 /*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR},
80 /*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR},
81 /*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR},
/* Name of the codeset the functions in this file use to interpret
   multibyte characters.  They compare it against "C-SJIS", "C-EUCJP"
   and "C-JIS"; a null pointer or a name of at most one character selects
   the generic fallback.  Starts out null.
   NOTE(review): nothing in this file assigns it -- presumably set by
   the caller; confirm.  */
const char *literal_codeset = NULL;
87 /* Store into *PWC (if PWC is not null) the wide character
88 corresponding to the multibyte character at the start of the
89 buffer S of size N. Return the number of bytes in the multibyte
90 character. Return -1 if the bytes do not form a valid character,
91 or 0 if S is null or points to a null byte.
93 This function behaves like the Standard C function mbtowc, except
94 it treats locale names of the form "C-..." specially. */
96 int
97 local_mbtowc (pwc, s, n)
98 wchar_t *pwc;
99 const char *s;
100 size_t n;
102 static JIS_STATE save_state = ASCII;
103 JIS_STATE curr_state = save_state;
104 const unsigned char *t = (const unsigned char *) s;
106 if (s != NULL && n == 0)
107 return -1;
109 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
110 /* This must be the "C" locale or unknown locale -- fall thru */
112 else if (! strcmp (literal_codeset, "C-SJIS"))
114 int char1;
115 if (s == NULL)
116 /* Not state-dependent. */
117 return 0;
119 char1 = *t;
120 if (ISSJIS1 (char1))
122 int char2 = t[1];
124 if (n <= 1)
125 return -1;
127 if (ISSJIS2 (char2))
129 if (pwc != NULL)
130 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
131 return 2;
134 return -1;
137 if (pwc != NULL)
138 *pwc = (wchar_t) *t;
140 if (*t == '\0')
141 return 0;
143 return 1;
145 else if (! strcmp (literal_codeset, "C-EUCJP"))
147 int char1;
149 if (s == NULL)
150 /* Not state-dependent. */
151 return 0;
153 char1 = *t;
154 if (ISEUCJP (char1))
156 int char2 = t[1];
158 if (n <= 1)
159 return -1;
161 if (ISEUCJP (char2))
163 if (pwc != NULL)
164 *pwc = (((wchar_t) *t) << 8) + (wchar_t) (*(t + 1));
165 return 2;
168 return -1;
171 if (pwc != NULL)
172 *pwc = (wchar_t) *t;
174 if (*t == '\0')
175 return 0;
177 return 1;
179 else if (! strcmp (literal_codeset, "C-JIS"))
181 JIS_ACTION action;
182 JIS_CHAR_TYPE ch;
183 const unsigned char *ptr;
184 size_t i, curr_ch;
186 if (s == NULL)
188 save_state = ASCII;
189 /* State-dependent. */
190 return 1;
193 ptr = t;
195 for (i = 0; i < n; i++)
197 curr_ch = t[i];
198 switch (curr_ch)
200 case JIS_ESC_CHAR:
201 ch = ESCAPE;
202 break;
203 case '$':
204 ch = DOLLAR;
205 break;
206 case '@':
207 ch = AT;
208 break;
209 case '(':
210 ch = BRACKET;
211 break;
212 case 'B':
213 ch = B;
214 break;
215 case 'J':
216 ch = J;
217 break;
218 case '\0':
219 ch = NUL;
220 break;
221 default:
222 if (ISJIS (curr_ch))
223 ch = JIS_CHAR;
224 else
225 ch = OTHER;
228 action = JIS_action_table[curr_state][ch];
229 curr_state = JIS_state_table[curr_state][ch];
231 switch (action)
233 case NOOP:
234 break;
236 case EMPTY:
237 if (pwc != NULL)
238 *pwc = (wchar_t) 0;
240 save_state = curr_state;
241 return i;
243 case COPYA:
244 if (pwc != NULL)
245 *pwc = (wchar_t) *ptr;
246 save_state = curr_state;
247 return i + 1;
249 case COPYJ:
250 if (pwc != NULL)
251 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
253 save_state = curr_state;
254 return i + 1;
256 case COPYJ2:
257 if (pwc != NULL)
258 *pwc = (((wchar_t) *ptr) << 8) + (wchar_t) (*(ptr + 1));
260 save_state = curr_state;
261 return ptr - t + 2;
263 case MAKE_A:
264 case MAKE_J:
265 ptr = (const unsigned char *) (t + i + 1);
266 break;
268 case ERROR:
269 default:
270 return -1;
274 /* More than n bytes needed. */
275 return -1;
278 #ifdef CROSS_COMPILE
279 if (s == NULL)
280 /* Not state-dependent. */
281 return 0;
283 if (pwc != NULL)
284 *pwc = *s;
285 return 1;
286 #else
288 /* This must be the "C" locale or unknown locale. */
289 return mbtowc (pwc, s, n);
290 #endif
/* Measure the multibyte character at the start of the buffer S of
   size N without storing its wide-character value.  The result is
   whatever local_mbtowc returns for the same S and N: the byte count,
   -1 if the bytes do not form a valid character, or 0 if S points to
   a null byte.

   This function behaves like the Standard C function mblen, except
   it treats locale names of the form "C-..." specially.  */

int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  /* A null destination tells local_mbtowc to skip the store.  */
  wchar_t *const no_dest = NULL;

  return local_mbtowc (no_dest, s, n);
}
308 /* Return the maximum mumber of bytes in a multibyte character.
310 This function returns the same value as the Standard C macro MB_CUR_MAX,
311 except it treats locale names of the form "C-..." specially. */
314 local_mb_cur_max ()
316 if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
318 else if (! strcmp (literal_codeset, "C-SJIS"))
319 return 2;
320 else if (! strcmp (literal_codeset, "C-EUCJP"))
321 return 2;
322 else if (! strcmp (literal_codeset, "C-JIS"))
323 return 8; /* 3 + 2 + 3 */
325 #ifdef CROSS_COMPILE
326 return 1;
327 #else
328 if (MB_CUR_MAX > 0)
329 return MB_CUR_MAX;
331 return 1; /* default */
332 #endif
334 #else /* MULTIBYTE_CHARS */
335 extern int dummy; /* silence 'ANSI C forbids an empty source file' warning */
336 #endif /* MULTIBYTE_CHARS */