1 /* Multibyte Character Functions.
2 Copyright (C) 1998 Free Software Foundation, Inc.
4 This file is part of GNU CC.
6 GNU CC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU CC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU CC; see the file COPYING. If not, write to
18 the Free Software Foundation, 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
21 /* Note regarding cross compilation:
23 In general, translation of multibyte characters to wide characters can
24 only work in a native compiler since the translation function (mbtowc)
25 needs to know about both the source and target character encoding. However,
26 this particular implementation for JIS, SJIS and EUCJP source characters
27 will work for any compiler with a newlib target. Other targets may also
28 work provided that their wchar_t implementation is 2 bytes and the encoding
29 leaves the source character values unchanged (except for removing the
30 state shifting markers). */
32 #ifdef MULTIBYTE_CHARS
/* Character classes used as the column index of JIS_state_table and
   JIS_action_table.  The enumerator order is significant: it must match
   the column order of both tables.
   NOTE(review): the code that maps an input byte to one of these classes
   is not visible in this part of the file; the per-class notes below are
   read off the enumerator names and should be confirmed against it.  */
typedef enum
{
  ESCAPE,    /* presumably the ESC byte that starts a shift sequence */
  DOLLAR,    /* presumably '$' */
  BRACKET,   /* presumably the bracket/parenthesis byte of a shift sequence */
  AT,        /* presumably '@' */
  B,         /* presumably 'B' */
  J,         /* presumably 'J' */
  NUL,       /* presumably the terminating null byte */
  JIS_CHAR,  /* presumably any other byte valid inside a JIS character */
  OTHER,     /* everything else */
  JIS_C_NUM  /* number of character classes; second dimension of the tables */
} JIS_CHAR_TYPE;
/* States of the JIS decoding automaton; used as the row index of
   JIS_state_table and JIS_action_table.  The enumerator order is
   significant: it must match the row order of both tables.  */
typedef enum
{
  ASCII,      /* initial state */
  A_ESC,      /* reached from ASCII on ESCAPE */
  A_ESC_DL,   /* reached from A_ESC on DOLLAR */
  JIS,        /* reached from A_ESC_DL on AT or B */
  JIS_1,      /* reached from JIS after one character byte */
  JIS_2,      /* reached from JIS_1 after a second character byte */
  J_ESC,      /* reached from JIS on ESCAPE */
  J_ESC_BR,   /* reached from J_ESC on BRACKET */
  J2_ESC,     /* reached from JIS_2 on ESCAPE */
  J2_ESC_BR,  /* reached from J2_ESC on BRACKET */
  INV,        /* target of every transition the tables reject */
  JIS_S_NUM   /* number of states; first dimension of the tables */
} JIS_STATE;
/* Actions stored in JIS_action_table, selected by (state, character class).
   NOTE(review): the code that carries out each action is not fully visible
   in this part of the file, so the individual enumerators are left
   undescribed here; confirm their effect against the switch in
   local_mbtowc.  */
typedef enum
{
  COPYA,
  COPYJ,
  COPYJ2,
  MAKE_A,
  MAKE_J,
  NOOP,
  EMPTY,
  ERROR
} JIS_ACTION;
/* State/action tables for processing JIS encoding:

   Where possible, switches to JIS are grouped with following JIS characters
   and switches to ASCII are grouped with preceding JIS characters.
   Thus, maximum returned length is:
     3 (switch to JIS) + 2 (JIS characters) + 3 (switch back to ASCII) = 8.  */
54 static JIS_STATE JIS_state_table
[JIS_S_NUM
][JIS_C_NUM
] = {
55 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH*/
56 /*ASCII*/ { A_ESC
, ASCII
, ASCII
, ASCII
, ASCII
, ASCII
, ASCII
,ASCII
,ASCII
},
57 /*A_ESC*/ { ASCII
, A_ESC_DL
,ASCII
, ASCII
, ASCII
, ASCII
, ASCII
,ASCII
,ASCII
},
58 /*A_ESC_DL*/{ ASCII
, ASCII
, ASCII
, JIS
, JIS
, ASCII
, ASCII
,ASCII
,ASCII
},
59 /*JIS*/ { J_ESC
, JIS_1
, JIS_1
, JIS_1
, JIS_1
, JIS_1
, INV
, JIS_1
,INV
},
60 /*JIS_1*/ { INV
, JIS_2
, JIS_2
, JIS_2
, JIS_2
, JIS_2
, INV
, JIS_2
,INV
},
61 /*JIS_2*/ { J2_ESC
,JIS
, JIS
, JIS
, JIS
, JIS
, INV
, JIS
, JIS
},
62 /*J_ESC*/ { INV
, INV
, J_ESC_BR
, INV
, INV
, INV
, INV
, INV
, INV
},
63 /*J_ESC_BR*/{ INV
, INV
, INV
, INV
, ASCII
, ASCII
, INV
, INV
, INV
},
64 /*J2_ESC*/ { INV
, INV
, J2_ESC_BR
,INV
, INV
, INV
, INV
, INV
, INV
},
65 /*J2_ESC_BR*/{INV
, INV
, INV
, INV
, ASCII
, ASCII
, INV
, INV
, INV
},
68 static JIS_ACTION JIS_action_table
[JIS_S_NUM
][JIS_C_NUM
] = {
69 /* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTH */
70 /*ASCII */ {NOOP
, COPYA
, COPYA
, COPYA
, COPYA
, COPYA
, EMPTY
, COPYA
, COPYA
},
71 /*A_ESC */ {COPYA
, NOOP
, COPYA
, COPYA
, COPYA
, COPYA
, COPYA
, COPYA
, COPYA
},
72 /*A_ESC_DL */{COPYA
, COPYA
, COPYA
, MAKE_J
, MAKE_J
, COPYA
, COPYA
, COPYA
, COPYA
},
73 /*JIS */ {NOOP
, NOOP
, NOOP
, NOOP
, NOOP
, NOOP
, ERROR
, NOOP
, ERROR
},
74 /*JIS_1 */ {ERROR
, NOOP
, NOOP
, NOOP
, NOOP
, NOOP
, ERROR
, NOOP
, ERROR
},
75 /*JIS_2 */ {NOOP
, COPYJ2
,COPYJ2
,COPYJ2
, COPYJ2
, COPYJ2
,ERROR
, COPYJ2
,COPYJ2
},
76 /*J_ESC */ {ERROR
, ERROR
, NOOP
, ERROR
, ERROR
, ERROR
, ERROR
, ERROR
, ERROR
},
77 /*J_ESC_BR */{ERROR
, ERROR
, ERROR
, ERROR
, NOOP
, NOOP
, ERROR
, ERROR
, ERROR
},
78 /*J2_ESC */ {ERROR
, ERROR
, NOOP
, ERROR
, ERROR
, ERROR
, ERROR
, ERROR
, ERROR
},
79 /*J2_ESC_BR*/{ERROR
, ERROR
, ERROR
, ERROR
, COPYJ
, COPYJ
, ERROR
, ERROR
, ERROR
},
/* Codeset selector consulted by the multibyte routines in this file.
   They compare it against "C-SJIS", "C-EUCJP" and "C-JIS"; a null pointer
   or a name of at most one character is treated as the "C" locale or an
   unknown locale.  Defaults to null.  */
const char *literal_codeset = NULL;
85 /* Store into *PWC (if PWC is not null) the wide character
86 corresponding to the multibyte character at the start of the
87 buffer S of size N. Return the number of bytes in the multibyte
88 character. Return -1 if the bytes do not form a valid character,
89 or 0 if S is null or points to a null byte.
91 This function behaves like the Standard C function mbtowc, except
92 it treats locale names of the form "C-..." specially. */
/* NOTE(review): this copy of the function is fragmentary -- the return
   type, parameter declarations, braces, case labels and most return
   statements are not present here.  The comments below describe only the
   statements that are visible.  */
95 local_mbtowc (pwc
, s
, n
)
/* Shift state remembered between calls; it starts out as ASCII.  */
100 static JIS_STATE save_state
= ASCII
;
/* Work on a local copy; the remembered state is only replaced at the
   explicit assignments to save_state further down.  */
101 JIS_STATE curr_state
= save_state
;
/* View the input buffer as unsigned bytes.  */
102 const unsigned char *t
= (const unsigned char *) s
;
104 if (s
!= NULL
&& n
== 0)
/* Select the decoder from literal_codeset: a null pointer or a name of at
   most one character skips all of the "C-..." branches.  */
107 if (literal_codeset
== NULL
|| strlen (literal_codeset
) <= 1)
108 /* This must be the "C" locale or unknown locale -- fall thru */
110 else if (! strcmp (literal_codeset
, "C-SJIS"))
114 /* Not state-dependent. */
/* Two-byte character: the first byte is shifted into bits 8 and up and the
   second byte is added as the low part.  */
128 *pwc
= (((wchar_t) *t
) << 8) + (wchar_t) (*(t
+ 1));
143 else if (! strcmp (literal_codeset
, "C-EUCJP"))
148 /* Not state-dependent. */
/* Same two-byte packing as in the "C-SJIS" branch.  */
162 *pwc
= (((wchar_t) *t
) << 8) + (wchar_t) (*(t
+ 1));
177 else if (! strcmp (literal_codeset
, "C-JIS"))
181 const unsigned char *ptr
;
187 /* State-dependent. */
/* Examine at most N bytes of the input.  */
193 for (i
= 0; i
< n
; i
++)
/* Fetch the action first, then advance the state: both tables are indexed
   by the state in effect before this character.  */
226 action
= JIS_action_table
[curr_state
][ch
];
227 curr_state
= JIS_state_table
[curr_state
][ch
];
/* Each assignment to save_state below commits the state reached so far so
   that the next call resumes from it.
   NOTE(review): the case labels that tie each of these statements to a
   JIS_ACTION value are not visible here.  */
238 save_state
= curr_state
;
/* Single-byte result taken from the byte at ptr.  */
243 *pwc
= (wchar_t) *ptr
;
244 save_state
= curr_state
;
/* Two-byte result: the byte at ptr is shifted into bits 8 and up and the
   following byte is added as the low part.  */
249 *pwc
= (((wchar_t) *ptr
) << 8) + (wchar_t) (*(ptr
+ 1));
251 save_state
= curr_state
;
256 *pwc
= (((wchar_t) *ptr
) << 8) + (wchar_t) (*(ptr
+ 1));
258 save_state
= curr_state
;
/* Move ptr to the byte after the one just examined.  */
263 ptr
= (const unsigned char *) (t
+ i
+ 1);
272 /* More than n bytes needed. */
278 /* Not state-dependent. */
286 /* This must be the "C" locale or unknown locale. */
/* Defer to the C library's mbtowc.  */
287 return mbtowc (pwc
, s
, n
);
291 /* Return the number of bytes in the multibyte character at the start
292 of the buffer S of size N. Return -1 if the bytes do not form a
293 valid character, or 0 if S is null or points to a null byte.
295 This function behaves like the Standard C function mblen, except
296 it treats locale names of the form "C-..." specially. */
303 return local_mbtowc (NULL
, s
, n
);
/* Return the maximum number of bytes in a multibyte character.

   This function returns the same value as the Standard C macro MB_CUR_MAX,
   except it treats locale names of the form "C-..." specially.  */
314 if (literal_codeset
== NULL
|| strlen (literal_codeset
) <= 1)
316 else if (! strcmp (literal_codeset
, "C-SJIS"))
318 else if (! strcmp (literal_codeset
, "C-EUCJP"))
320 else if (! strcmp (literal_codeset
, "C-JIS"))
321 return 8; /* 3 + 2 + 3 */
329 return 1; /* default */
332 #else /* MULTIBYTE_CHARS */
333 extern int dummy
; /* silence 'ANSI C forbids an empty source file' warning */
334 #endif /* MULTIBYTE_CHARS */