1 /* ========================================================================
2 * Copyright 2008-2010 Mark Crispin
3 * ========================================================================
7 * Program: UTF-8 routines
12 * Last Edited: 28 May 2010
14 * Previous versions of this file were
16 * Copyright 1988-2008 University of Washington
18 * Licensed under the Apache License, Version 2.0 (the "License");
19 * you may not use this file except in compliance with the License.
20 * You may obtain a copy of the License at
22 * http://www.apache.org/licenses/LICENSE-2.0
33 * There is a very important difference between "character set" and "charset",
34 * and the comments in this file reflect these differences. A "character set"
35 * (also known as "coded character set") is a mapping between codepoints and
36 * characters. A "charset" is as defined in MIME, and incorporates one or more
37 * coded character sets in a character encoding scheme. See RFC 2130 for more
42 /* Character set conversion tables */
44 #include "iso_8859.c" /* 8-bit single-byte coded graphic */
45 #include "koi8_r.c" /* Cyrillic - Russia */
46 #include "koi8_u.c" /* Cyrillic - Ukraine */
47 #include "tis_620.c" /* Thai */
48 #include "viscii.c" /* Vietnamese */
49 #include "windows.c" /* Windows */
50 #include "ibm.c" /* IBM */
51 #include "gb_2312.c" /* Chinese (PRC) - simplified */
52 #include "gb_12345.c" /* Chinese (PRC) - traditional */
53 #include "jis_0208.c" /* Japanese - basic */
54 #include "jis_0212.c" /* Japanese - supplementary */
55 #include "ksc_5601.c" /* Korean */
56 #include "big5.c" /* Taiwanese (ROC) - industrial standard */
57 #include "cns11643.c" /* Taiwanese (ROC) - national standard */
60 #include "widths.c" /* Unicode character widths */
61 #include "tmap.c" /* Unicode titlecase mapping */
62 #include "decomtab.c" /* Unicode decompositions */
66 #ifdef GBTOUNICODE /* PRC simplified Chinese */
67 static const struct utf8_eucparam gb_param
= {
68 BASE_GB2312_KU
,BASE_GB2312_TEN
,MAX_GB2312_KU
,MAX_GB2312_TEN
,
73 #ifdef GB12345TOUNICODE /* PRC traditional Chinese */
74 static const struct utf8_eucparam gbt_param
= {
75 BASE_GB12345_KU
,BASE_GB12345_TEN
,MAX_GB12345_KU
,MAX_GB12345_TEN
,
80 #ifdef BIG5TOUNICODE /* ROC traditional Chinese */
81 static const struct utf8_eucparam big5_param
[] = {
82 {BASE_BIG5_KU
,BASE_BIG5_TEN_0
,MAX_BIG5_KU
,MAX_BIG5_TEN_0
,(void *) big5tab
},
83 {BASE_BIG5_KU
,BASE_BIG5_TEN_1
,MAX_BIG5_KU
,MAX_BIG5_TEN_1
,NIL
}
88 #ifdef JISTOUNICODE /* Japanese */
89 static const struct utf8_eucparam jis_param
[] = {
90 {BASE_JIS0208_KU
,BASE_JIS0208_TEN
,MAX_JIS0208_KU
,MAX_JIS0208_TEN
,
92 {MIN_KANA_8
,0,MAX_KANA_8
,0,(void *) KANA_8
},
93 #ifdef JIS0212TOUNICODE /* Japanese extended */
94 {BASE_JIS0212_KU
,BASE_JIS0212_TEN
,MAX_JIS0212_KU
,MAX_JIS0212_TEN
,
103 #ifdef KSCTOUNICODE /* Korean */
104 static const struct utf8_eucparam ksc_param
= {
105 BASE_KSC5601_KU
,BASE_KSC5601_TEN
,MAX_KSC5601_KU
,MAX_KSC5601_TEN
,
106 (void *) ksc5601tab
};
109 /* List of supported charsets */
111 static const CHARSET utf8_csvalid
[] = {
112 {"US-ASCII",CT_ASCII
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
114 {"UTF-8",CT_UTF8
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
116 {"UTF-7",CT_UTF7
,CF_PRIMARY
| CF_POSTING
| CF_UNSUPRT
,
117 NIL
,SC_UNICODE
,"UTF-8"},
118 {"ISO-8859-1",CT_1BYTE0
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
120 {"ISO-8859-2",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
121 (void *) iso8859_2tab
,SC_LATIN_2
,NIL
},
122 {"ISO-8859-3",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
123 (void *) iso8859_3tab
,SC_LATIN_3
,NIL
},
124 {"ISO-8859-4",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
125 (void *) iso8859_4tab
,SC_LATIN_4
,NIL
},
126 {"ISO-8859-5",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
127 (void *) iso8859_5tab
,SC_CYRILLIC
,"KOI8-R"},
128 {"ISO-8859-6",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
129 (void *) iso8859_6tab
,SC_ARABIC
,NIL
},
130 {"ISO-8859-7",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
131 (void *) iso8859_7tab
,SC_GREEK
,NIL
},
132 {"ISO-8859-8",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
133 (void *) iso8859_8tab
,SC_HEBREW
,NIL
},
134 {"ISO-8859-9",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
135 (void *) iso8859_9tab
,SC_LATIN_5
,NIL
},
136 {"ISO-8859-10",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
137 (void *) iso8859_10tab
,SC_LATIN_6
,NIL
},
138 {"ISO-8859-11",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
139 (void *) iso8859_11tab
,SC_THAI
,NIL
},
140 #if 0 /* ISO 8859-12 reserved for ISCII(?) */
141 {"ISO-8859-12",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
142 (void *) iso8859_12tab
,NIL
,NIL
},
144 {"ISO-8859-13",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
145 (void *) iso8859_13tab
,SC_LATIN_7
,NIL
},
146 {"ISO-8859-14",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
147 (void *) iso8859_14tab
,SC_LATIN_8
,NIL
},
148 {"ISO-8859-15",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
149 (void *) iso8859_15tab
,SC_LATIN_9
,NIL
},
150 {"ISO-8859-16",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
151 (void *) iso8859_16tab
,SC_LATIN_10
,NIL
},
152 {"KOI8-R",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
153 (void *) koi8rtab
,SC_CYRILLIC
,NIL
},
154 {"KOI8-U",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
155 (void *) koi8utab
,SC_CYRILLIC
| SC_UKRANIAN
,NIL
},
156 {"KOI8-RU",CT_1BYTE
,CF_DISPLAY
,
157 (void *) koi8utab
,SC_CYRILLIC
| SC_UKRANIAN
,"KOI8-U"},
158 {"TIS-620",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
159 (void *) tis620tab
,SC_THAI
,"ISO-8859-11"},
160 {"VISCII",CT_1BYTE8
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
161 (void *) visciitab
,SC_VIETNAMESE
,NIL
},
164 {"GBK",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
165 (void *) &gb_param
,SC_CHINESE_SIMPLIFIED
,NIL
},
166 {"GB2312",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
167 (void *) &gb_param
,SC_CHINESE_SIMPLIFIED
,"GBK"},
168 {"CN-GB",CT_DBYTE
,CF_DISPLAY
,
169 (void *) &gb_param
,SC_CHINESE_SIMPLIFIED
,"GBK"},
171 {"ISO-2022-CN",CT_2022
,CF_PRIMARY
| CF_UNSUPRT
,
172 NIL
,SC_CHINESE_SIMPLIFIED
| SC_CHINESE_TRADITIONAL
,
176 #ifdef GB12345TOUNICODE
177 {"CN-GB-12345",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
178 (void *) &gbt_param
,SC_CHINESE_TRADITIONAL
,"BIG5"},
181 {"BIG5",CT_DBYTE2
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
182 (void *) big5_param
,SC_CHINESE_TRADITIONAL
,NIL
},
183 {"CN-BIG5",CT_DBYTE2
,CF_DISPLAY
,
184 (void *) big5_param
,SC_CHINESE_TRADITIONAL
,"BIG5"},
185 {"BIG-5",CT_DBYTE2
,CF_DISPLAY
,
186 (void *) big5_param
,SC_CHINESE_TRADITIONAL
,"BIG5"},
189 {"ISO-2022-JP",CT_2022
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
190 NIL
,SC_JAPANESE
,NIL
},
191 {"EUC-JP",CT_EUC
,CF_PRIMARY
| CF_DISPLAY
,
192 (void *) jis_param
,SC_JAPANESE
,"ISO-2022-JP"},
193 {"SHIFT_JIS",CT_SJIS
,CF_PRIMARY
| CF_DISPLAY
,
194 NIL
,SC_JAPANESE
,"ISO-2022-JP"},
195 {"SHIFT-JIS",CT_SJIS
,CF_PRIMARY
| CF_DISPLAY
,
196 NIL
,SC_JAPANESE
,"ISO-2022-JP"},
197 #ifdef JIS0212TOUNICODE
198 {"ISO-2022-JP-1",CT_2022
,CF_UNSUPRT
,
199 NIL
,SC_JAPANESE
,"ISO-2022-JP"},
202 {"ISO-2022-JP-2",CT_2022
,CF_UNSUPRT
,
204 SC_LATIN_1
| SC_LATIN_2
| SC_LATIN_3
| SC_LATIN_4
| SC_LATIN_5
|
205 SC_LATIN_6
| SC_LATIN_7
| SC_LATIN_8
| SC_LATIN_9
| SC_LATIN_10
|
206 SC_ARABIC
| SC_CYRILLIC
| SC_GREEK
| SC_HEBREW
| SC_THAI
|
207 SC_VIETNAMESE
| SC_CHINESE_SIMPLIFIED
| SC_JAPANESE
| SC_KOREAN
209 | SC_CHINESE_TRADITIONAL
218 {"ISO-2022-KR",CT_2022
,CF_PRIMARY
| CF_DISPLAY
| CF_UNSUPRT
,
219 NIL
,SC_KOREAN
,"EUC-KR"},
220 {"EUC-KR",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
221 (void *) &ksc_param
,SC_KOREAN
,NIL
},
222 {"KSC5601",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
223 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
224 {"KSC_5601",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
225 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
226 {"KS_C_5601-1987",CT_DBYTE
,CF_DISPLAY
,
227 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
228 {"KS_C_5601-1989",CT_DBYTE
,CF_DISPLAY
,
229 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
230 {"KS_C_5601-1992",CT_DBYTE
,CF_DISPLAY
,
231 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
232 {"KS_C_5601-1997",CT_DBYTE
,CF_DISPLAY
,
233 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
237 {"WINDOWS-874",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
238 (void *) windows_874tab
,SC_THAI
,"ISO-8859-11"},
239 {"CP874",CT_1BYTE
,CF_DISPLAY
,
240 (void *) windows_874tab
,SC_THAI
,"ISO-8859-11"},
242 {"WINDOWS-936",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
243 (void *) &gb_param
,SC_CHINESE_SIMPLIFIED
,"GBK"},
244 {"CP936",CT_DBYTE
,CF_DISPLAY
,
245 (void *) &gb_param
,SC_CHINESE_SIMPLIFIED
,"GBK"},
248 {"WINDOWS-949",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
249 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
250 {"CP949",CT_DBYTE
,CF_DISPLAY
,
251 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
252 {"X-WINDOWS-949",CT_DBYTE
,CF_PRIMARY
| CF_DISPLAY
,
253 (void *) &ksc_param
,SC_KOREAN
,"EUC-KR"},
255 {"WINDOWS-1250",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
256 (void *) windows_1250tab
,SC_LATIN_2
,"ISO-8859-2"},
257 {"CP1250",CT_1BYTE
,CF_DISPLAY
,
258 (void *) windows_1250tab
,SC_LATIN_2
,"ISO-8859-2"},
259 {"WINDOWS-1251",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
| CF_POSTING
,
260 (void *) windows_1251tab
,SC_CYRILLIC
,"KOI8-R"},
261 {"CP1251",CT_1BYTE
,CF_DISPLAY
,
262 (void *) windows_1251tab
,SC_CYRILLIC
,"KOI8-R"},
263 {"WINDOWS-1252",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
264 (void *) windows_1252tab
,SC_LATIN_1
,"ISO-8859-1"},
265 {"CP1252",CT_1BYTE
,CF_DISPLAY
,
266 (void *) windows_1252tab
,SC_LATIN_1
,"ISO-8859-1"},
267 {"WINDOWS-1253",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
268 (void *) windows_1253tab
,SC_GREEK
,"ISO-8859-7"},
269 {"CP1253",CT_1BYTE
,CF_DISPLAY
,
270 (void *) windows_1253tab
,SC_GREEK
,"ISO-8859-7"},
271 {"WINDOWS-1254",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
272 (void *) windows_1254tab
,SC_LATIN_5
,"ISO-8859-9"},
273 {"CP1254",CT_1BYTE
,CF_DISPLAY
,
274 (void *) windows_1254tab
,SC_LATIN_5
,"ISO-8859-9"},
275 {"WINDOWS-1255",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
276 (void *) windows_1255tab
,SC_HEBREW
,"ISO-8859-8"},
277 {"CP1255",CT_1BYTE
,CF_DISPLAY
,
278 (void *) windows_1255tab
,SC_HEBREW
,"ISO-8859-8"},
279 {"WINDOWS-1256",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
280 (void *) windows_1256tab
,SC_ARABIC
,"ISO-8859-6"},
281 {"CP1256",CT_1BYTE
,CF_DISPLAY
,
282 (void *) windows_1256tab
,SC_ARABIC
,"ISO-8859-6"},
283 {"WINDOWS-1257",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
284 (void *) windows_1257tab
,SC_LATIN_7
,"ISO-8859-13"},
285 {"CP1257",CT_1BYTE
,CF_DISPLAY
,
286 (void *) windows_1257tab
,SC_LATIN_7
,"ISO-8859-13"},
287 {"WINDOWS-1258",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
288 (void *) windows_1258tab
,SC_VIETNAMESE
,"VISCII"},
289 {"CP1258",CT_1BYTE
,CF_DISPLAY
,
290 (void *) windows_1258tab
,SC_VIETNAMESE
,"VISCII"},
293 {"IBM367",CT_ASCII
,CF_PRIMARY
| CF_DISPLAY
,
295 {"IBM437",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
296 (void *) ibm_437tab
,SC_LATIN_1
,"ISO-8859-1"},
297 {"IBM737",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
298 (void *) ibm_737tab
,SC_GREEK
,"ISO-8859-7"},
299 {"IBM775",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
300 (void *) ibm_775tab
,SC_LATIN_7
,"ISO-8859-13"},
301 {"IBM850",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
302 (void *) ibm_850tab
,SC_LATIN_1
,"ISO-8859-1"},
303 {"IBM852",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
304 (void *) ibm_852tab
,SC_LATIN_2
,"ISO-8859-2"},
305 {"IBM855",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
306 (void *) ibm_855tab
,SC_CYRILLIC
,"ISO-8859-5"},
307 {"IBM857",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
308 (void *) ibm_857tab
,SC_LATIN_5
,"ISO-8859-9"},
309 {"IBM860",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
310 (void *) ibm_860tab
,SC_LATIN_1
,"ISO-8859-1"},
311 {"IBM861",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
312 (void *) ibm_861tab
,SC_LATIN_6
,"ISO-8859-10"},
313 {"IBM862",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
314 (void *) ibm_862tab
,SC_HEBREW
,"ISO-8859-8"},
315 {"IBM863",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
316 (void *) ibm_863tab
,SC_LATIN_1
,"ISO-8859-1"},
317 {"IBM864",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
318 (void *) ibm_864tab
,SC_ARABIC
,"ISO-8859-6"},
319 {"IBM865",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
320 (void *) ibm_865tab
,SC_LATIN_6
,"ISO-8859-10"},
321 {"IBM866",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
322 (void *) ibm_866tab
,SC_CYRILLIC
,"KOI8-R"},
323 {"IBM869",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
324 (void *) ibm_869tab
,SC_GREEK
,"ISO-8859-7"},
325 {"IBM874",CT_1BYTE
,CF_PRIMARY
| CF_DISPLAY
,
326 (void *) ibm_874tab
,SC_THAI
,"ISO-8859-11"},
328 {"ANSI_X3.4-1968",CT_ASCII
,CF_DISPLAY
,
330 {"UNICODE-1-1-UTF-7",CT_UTF7
,CF_UNSUPRT
,
331 NIL
,SC_UNICODE
,"UTF-8"},
332 /* these should never appear in email */
333 {"UCS-2",CT_UCS2
,CF_PRIMARY
| CF_DISPLAY
| CF_NOEMAIL
,
334 NIL
,SC_UNICODE
,"UTF-8"},
335 {"UCS-4",CT_UCS4
,CF_PRIMARY
| CF_DISPLAY
| CF_NOEMAIL
,
336 NIL
,SC_UNICODE
,"UTF-8"},
337 {"UTF-16",CT_UTF16
,CF_PRIMARY
| CF_DISPLAY
| CF_NOEMAIL
,
338 NIL
,SC_UNICODE
,"UTF-8"},
342 /* Non-Unicode Script table */
344 static const SCRIPT utf8_scvalid
[] = {
345 {"Arabic",NIL
,SC_ARABIC
},
346 {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED
},
347 {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL
},
348 {"Cyrillic",NIL
,SC_CYRILLIC
},
349 {"Cyrillic Ukranian",NIL
,SC_UKRANIAN
},
350 {"Greek",NIL
,SC_GREEK
},
351 {"Hebrew",NIL
,SC_HEBREW
},
352 {"Japanese",NIL
,SC_JAPANESE
},
353 {"Korean",NIL
,SC_KOREAN
},
354 {"Latin-1","Western Europe",SC_LATIN_1
},
355 {"Latin-2","Eastern Europe",SC_LATIN_2
},
356 {"Latin-3","Southern Europe",SC_LATIN_3
},
357 {"Latin-4","Northern Europe",SC_LATIN_4
},
358 {"Latin-5","Turkish",SC_LATIN_5
},
359 {"Latin-6","Nordic",SC_LATIN_6
},
360 {"Latin-7","Baltic",SC_LATIN_7
},
361 {"Latin-8","Celtic",SC_LATIN_8
},
362 {"Latin-9","Euro",SC_LATIN_9
},
363 {"Latin-10","Balkan",SC_LATIN_10
},
364 {"Thai",NIL
,SC_THAI
},
365 {"Vietnamese",NIL
,SC_VIETNAMESE
},
369 /* Look up script name or return entire table
370 * Accepts: script name or NIL
371 * Returns: pointer to script table entry or NIL if unknown
374 SCRIPT
*utf8_script (char *script
)
377 if (!script
) return (SCRIPT
*) &utf8_scvalid
[0];
378 else if (*script
&& (strlen (script
) < 128))
379 for (i
= 0; utf8_scvalid
[i
].name
; i
++)
380 if (!compare_cstring (script
,utf8_scvalid
[i
].name
))
381 return (SCRIPT
*) &utf8_scvalid
[i
];
382 return NIL
; /* failed */
386 /* Look up charset name or return entire table
387 * Accepts: charset name or NIL
388 * Returns: charset table entry or NIL if unknown
391 const CHARSET
*utf8_charset (char *charset
)
394 if (!charset
) return (CHARSET
*) &utf8_csvalid
[0];
395 else if (*charset
&& (strlen (charset
) < 128))
396 for (i
= 0; utf8_csvalid
[i
].name
; i
++)
397 if (!compare_cstring (charset
,utf8_csvalid
[i
].name
))
398 return (CHARSET
*) &utf8_csvalid
[i
];
399 return NIL
; /* failed */
402 /* Validate charset and generate error message if invalid
403 * Accepts: bad character set
404 * Returns: NIL if good charset, else error message string
407 #define BADCSS "[BADCHARSET ("
408 #define BADCSE ")] Unknown charset: "
410 char *utf8_badcharset (char *charset
)
413 if (!utf8_charset (charset
)) {
416 /* calculate size of header, trailer, and bad
417 * charset plus charset names */
418 for (i
= 0, j
= sizeof (BADCSS
) + sizeof (BADCSE
) + strlen (charset
) - 2;
419 utf8_csvalid
[i
].name
; i
++)
420 j
+= strlen (utf8_csvalid
[i
].name
) + 1;
421 /* not built right */
422 if (!i
) fatal ("No valid charsets!");
424 for (s
= msg
= (char *) fs_get (j
), t
= BADCSS
; *t
; *s
++ = *t
++);
426 for (i
= 0; utf8_csvalid
[i
].name
; *s
++ = ' ', i
++)
427 for (t
= utf8_csvalid
[i
].name
; *t
; *s
++ = *t
++);
428 /* back over last space, trailer */
429 for (t
= BADCSE
, --s
; *t
; *s
++ = *t
++);
430 /* finally bogus charset */
431 for (t
= charset
; *t
; *s
++ = *t
++);
432 *s
++ = '\0'; /* finally tie off string */
433 if (s
!= (msg
+ j
)) fatal ("charset msg botch");
438 /* Convert charset labelled sized text to UTF-8
439 * Accepts: source sized text
441 * pointer to returned sized text if non-NIL
443 * Returns: T if successful, NIL if failure
446 long utf8_text (SIZEDTEXT
*text
,char *charset
,SIZEDTEXT
*ret
,long flags
)
448 ucs4cn_t cv
= (flags
& U8T_CASECANON
) ? ucs4_titlecase
: NIL
;
449 ucs4de_t de
= (flags
& U8T_DECOMPOSE
) ? ucs4_decompose_recursive
: NIL
;
450 const CHARSET
*cs
= (charset
&& *charset
) ?
451 utf8_charset (charset
) : utf8_infercharset (text
);
452 if (cs
) return (text
&& ret
) ? utf8_text_cs (text
,cs
,ret
,cv
,de
) : LONGT
;
453 if (ret
) { /* no conversion possible */
454 ret
->data
= text
->data
; /* so return source */
455 ret
->size
= text
->size
;
457 return NIL
; /* failure */
/* Operations used in converting data
 *
 * Each macro optionally canonicalizes c through cv, then optionally feeds it
 * to the decomposition function de.  The decomposition function is re-invoked
 * with U8G_ERROR for as long as it leaves `more' set and returns non-zero,
 * and every character obtained that way is counted or written.
 *
 * The _BMP forms use UTF8_SIZE_BMP/UTF8_PUT_BMP; the plain forms use
 * utf8_size()/utf8_put().
 *
 * NOTE(review): `more' is declared as void * on the assumption that ucs4de_t
 * takes a void ** continuation argument - confirm against the typedef.
 * c is assigned to and evaluated several times, so it must be a plain lvalue.
 */

#define UTF8_COUNT_BMP(count,c,cv,de) {				\
  void *more = NIL;						\
  if (cv) c = (*cv) (c);					\
  if (de) c = (*de) (c,&more);					\
  do count += UTF8_SIZE_BMP(c);					\
  while (more && (c = (*de) (U8G_ERROR,&more)));		\
}

#define UTF8_WRITE_BMP(b,c,cv,de) {				\
  void *more = NIL;						\
  if (cv) c = (*cv) (c);					\
  if (de) c = (*de) (c,&more);					\
  do UTF8_PUT_BMP (b,c)						\
  while (more && (c = (*de) (U8G_ERROR,&more)));		\
}

#define UTF8_COUNT(count,c,cv,de) {				\
  void *more = NIL;						\
  if (cv) c = (*cv) (c);					\
  if (de) c = (*de) (c,&more);					\
  do count += utf8_size (c);					\
  while (more && (c = (*de) (U8G_ERROR,&more)));		\
}

#define UTF8_WRITE(b,c,cv,de) {					\
  void *more = NIL;						\
  if (cv) c = (*cv) (c);					\
  if (de) c = (*de) (c,&more);					\
  do b = utf8_put (b,c);					\
  while (more && (c = (*de) (U8G_ERROR,&more)));		\
}
495 /* Convert sized text to UTF-8 given CHARSET block
496 * Accepts: source sized text
498 * pointer to returned sized text
499 * canonicalization function
500 * decomposition function
501 * Returns: T if successful, NIL if failure
504 long utf8_text_cs (SIZEDTEXT
*text
,const CHARSET
*cs
,SIZEDTEXT
*ret
,
505 ucs4cn_t cv
,ucs4de_t de
)
507 ret
->data
= text
->data
; /* default to source */
508 ret
->size
= text
->size
;
509 switch (cs
->type
) { /* convert if type known */
510 case CT_ASCII
: /* 7-bit ASCII no table */
511 case CT_UTF8
: /* variable UTF-8 encoded Unicode no table */
512 if (cv
|| de
) utf8_text_utf8 (text
,ret
,cv
,de
);
514 case CT_1BYTE0
: /* 1 byte no table */
515 utf8_text_1byte0 (text
,ret
,cv
,de
);
517 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
518 utf8_text_1byte (text
,ret
,cs
->tab
,cv
,de
);
520 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
521 utf8_text_1byte8 (text
,ret
,cs
->tab
,cv
,de
);
523 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
524 utf8_text_euc (text
,ret
,cs
->tab
,cv
,de
);
526 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
527 utf8_text_dbyte (text
,ret
,cs
->tab
,cv
,de
);
529 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
530 utf8_text_dbyte2 (text
,ret
,cs
->tab
,cv
,de
);
532 case CT_UTF7
: /* variable UTF-7 encoded Unicode no table */
533 utf8_text_utf7 (text
,ret
,cv
,de
);
535 case CT_UCS2
: /* 2 byte 16-bit Unicode no table */
536 utf8_text_ucs2 (text
,ret
,cv
,de
);
538 case CT_UCS4
: /* 4 byte 32-bit Unicode no table */
539 utf8_text_ucs4 (text
,ret
,cv
,de
);
541 case CT_UTF16
: /* variable UTF-16 encoded Unicode no table */
542 utf8_text_utf16 (text
,ret
,cv
,de
);
544 case CT_2022
: /* variable ISO-2022 encoded no table*/
545 utf8_text_2022 (text
,ret
,cv
,de
);
547 case CT_SJIS
: /* 2 byte Shift-JIS encoded JIS no table */
548 utf8_text_sjis (text
,ret
,cv
,de
);
550 default: /* unknown character set type */
553 return LONGT
; /* return success */
/* Reverse mapping routines
 *
 * These routines only support character sets, not all possible charsets.  In
 * particular, they do not support any Unicode encodings or ISO 2022.
 *
 * As a special dispensation, utf8_cstext() and utf8_cstocstext() support
 * ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext() will
 * generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
 *
 * No attempt is made to map "equivalent" Unicode characters or Unicode
 * characters that have the same glyph; nor is there any attempt to handle
 * combining characters or otherwise do any stringprep.  Maybe later.
 */
571 /* Convert UTF-8 sized text to charset
572 * Accepts: source sized text
573 * destination charset
574 * pointer to returned sized text
575 * substitute character if not in cs, else NIL to return failure
576 * Returns: T if successful, NIL if failure
580 long utf8_cstext (SIZEDTEXT
*text
,char *charset
,SIZEDTEXT
*ret
,
583 short iso2022jp
= !compare_cstring (charset
,"ISO-2022-JP");
584 unsigned short *rmap
= utf8_rmap (iso2022jp
? "EUC-JP" : charset
);
585 return rmap
? utf8_rmaptext (text
,rmap
,ret
,errch
,iso2022jp
) : NIL
;
588 /* Convert charset labelled sized text to another charset
589 * Accepts: source sized text
591 * pointer to returned sized text
592 * destination charset
593 * substitute character if not in dest cs, else NIL to return failure
594 * Returns: T if successful, NIL if failure
596 * This routine has the same restricts as utf8_cstext().
599 long utf8_cstocstext (SIZEDTEXT
*src
,char *sc
,SIZEDTEXT
*dst
,char *dc
,
603 const CHARSET
*scs
,*dcs
;
604 unsigned short *rmap
;
607 /* lookup charsets and reverse map */
608 if ((dc
&& (dcs
= utf8_charset (dc
))) &&
609 (rmap
= (iso2022jp
= ((dcs
->type
== CT_2022
) &&
610 !compare_cstring (dcs
->name
,"ISO-2022-JP"))) ?
611 utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs
)) &&
612 (scs
= (sc
&& *sc
) ? utf8_charset (sc
) : utf8_infercharset (src
))) {
613 /* init temporary buffer */
614 memset (&utf8
,NIL
,sizeof (SIZEDTEXT
));
615 /* source cs equivalent to dest cs? */
616 if ((scs
->type
== dcs
->type
) && (scs
->tab
== dcs
->tab
)) {
617 dst
->data
= src
->data
; /* yes, just copy pointers */
618 dst
->size
= src
->size
;
621 /* otherwise do the conversion */
622 else ret
= (utf8_text_cs (src
,scs
,&utf8
,NIL
,NIL
) &&
623 utf8_rmaptext (&utf8
,rmap
,dst
,errch
,iso2022jp
));
624 /* flush temporary buffer */
625 if (utf8
.data
&& (utf8
.data
!= src
->data
) && (utf8
.data
!= dst
->data
))
626 fs_give ((void **) &utf8
.data
);
633 static const CHARSET
*currmapcs
= NIL
;
634 static unsigned short *currmap
= NIL
;
637 /* Cache and return map for UTF-8 -> character set
638 * Accepts: character set name
639 * Returns: cached map if character set found, else NIL
642 unsigned short *utf8_rmap (char *charset
)
644 return (currmapcs
&& !compare_cstring (charset
,currmapcs
->name
)) ? currmap
:
645 utf8_rmap_cs (utf8_charset (charset
));
649 /* Cache and return map for UTF-8 -> character set given CHARSET block
650 * Accepts: CHARSET block
651 * Returns: cached map if character set found, else NIL
654 unsigned short *utf8_rmap_cs (const CHARSET
*cs
)
656 unsigned short *ret
= NIL
;
657 if (!cs
); /* have charset? */
658 else if (cs
== currmapcs
) ret
= currmap
;
659 else if (ret
= utf8_rmap_gen (cs
,currmap
)) {
666 /* Return map for UTF-8 -> character set given CHARSET block
667 * Accepts: CHARSET block
669 * Returns: map if character set found, else NIL
672 unsigned short *utf8_rmap_gen (const CHARSET
*cs
,unsigned short *oldmap
)
674 unsigned short u
,*tab
,*rmap
;
675 unsigned int i
,m
,ku
,ten
;
676 struct utf8_eucparam
*param
,*p2
;
677 switch (cs
->type
) { /* is a character set? */
678 case CT_ASCII
: /* 7-bit ASCII no table */
679 case CT_1BYTE0
: /* 1 byte no table */
680 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
681 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
682 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
683 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
684 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
685 case CT_SJIS
: /* 2 byte Shift-JIS */
686 rmap
= oldmap
? oldmap
: /* recycle old map if supplied else make new */
687 (unsigned short *) fs_get (65536 * sizeof (unsigned short));
688 /* initialize table for ASCII */
689 for (i
= 0; i
< 128; i
++) rmap
[i
] = (unsigned short) i
;
690 /* populate remainder of table with NOCHAR */
691 #define NOCHARBYTE (NOCHAR & 0xff)
692 #if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
693 while (i
< 65536) rmap
[i
++] = NOCHAR
;
695 memset (rmap
+ 128,NOCHARBYTE
,(65536 - 128) * sizeof (unsigned short));
698 default: /* unsupported charset type */
699 rmap
= NIL
; /* no map possible */
701 if (rmap
) { /* have a map? */
702 switch (cs
->type
) { /* additional reverse map actions */
703 case CT_1BYTE0
: /* 1 byte no table */
704 for (i
= 128; i
< 256; i
++) rmap
[i
] = (unsigned short) i
;
706 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
707 for (tab
= (unsigned short *) cs
->tab
,i
= 128; i
< 256; i
++)
708 if (tab
[i
& BITS7
] != UBOGON
) rmap
[tab
[i
& BITS7
]] = (unsigned short)i
;
710 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
711 for (tab
= (unsigned short *) cs
->tab
,i
= 0; i
< 256; i
++)
712 if (tab
[i
] != UBOGON
) rmap
[tab
[i
]] = (unsigned short) i
;
714 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
715 for (param
= (struct utf8_eucparam
*) cs
->tab
,
716 tab
= (unsigned short *) param
->tab
, ku
= 0;
717 ku
< param
->max_ku
; ku
++)
718 for (ten
= 0; ten
< param
->max_ten
; ten
++)
719 if ((u
= tab
[(ku
* param
->max_ten
) + ten
]) != UBOGON
)
720 rmap
[u
] = ((ku
+ param
->base_ku
) << 8) +
721 (ten
+ param
->base_ten
) + 0x8080;
724 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
725 for (param
= (struct utf8_eucparam
*) cs
->tab
,
726 tab
= (unsigned short *) param
->tab
, ku
= 0;
727 ku
< param
->max_ku
; ku
++)
728 for (ten
= 0; ten
< param
->max_ten
; ten
++)
729 if ((u
= tab
[(ku
* param
->max_ten
) + ten
]) != UBOGON
)
730 rmap
[u
] = ((ku
+ param
->base_ku
) << 8) + (ten
+ param
->base_ten
);
732 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
733 param
= (struct utf8_eucparam
*) cs
->tab
;
734 p2
= param
+ 1; /* plane 2 parameters */
735 /* only ten parameters should differ */
736 if ((param
->base_ku
!= p2
->base_ku
) || (param
->max_ku
!= p2
->max_ku
))
737 fatal ("ku definition error for CT_DBYTE2 charset");
738 /* total codepoints in each ku */
739 m
= param
->max_ten
+ p2
->max_ten
;
740 tab
= (unsigned short *) param
->tab
;
741 for (ku
= 0; ku
< param
->max_ku
; ku
++) {
742 for (ten
= 0; ten
< param
->max_ten
; ten
++)
743 if ((u
= tab
[(ku
* m
) + ten
]) != UBOGON
)
744 rmap
[u
] = ((ku
+ param
->base_ku
) << 8) + (ten
+ param
->base_ten
);
745 for (ten
= 0; ten
< p2
->max_ten
; ten
++)
746 if ((u
= tab
[(ku
* m
) + param
->max_ten
+ ten
]) != UBOGON
)
747 rmap
[u
] = ((ku
+ param
->base_ku
) << 8) + (ten
+ p2
->base_ten
);
750 case CT_SJIS
: /* 2 byte Shift-JIS */
751 for (ku
= 0; ku
< MAX_JIS0208_KU
; ku
++)
752 for (ten
= 0; ten
< MAX_JIS0208_TEN
; ten
++)
753 if ((u
= jis0208tab
[ku
][ten
]) != UBOGON
) {
754 int sku
= ku
+ BASE_JIS0208_KU
;
755 int sten
= ten
+ BASE_JIS0208_TEN
;
756 rmap
[u
] = ((((sku
+ 1) >> 1) + ((sku
< 95) ? 112 : 176)) << 8) +
757 sten
+ ((sku
% 2) ? ((sten
> 95) ? 32 : 31) : 126);
760 rmap
[UCS2_YEN
] = JISROMAN_YEN
;
761 rmap
[UCS2_OVERLINE
] = JISROMAN_OVERLINE
;
762 /* JIS hankaku katakana */
763 for (u
= 0; u
< (MAX_KANA_8
- MIN_KANA_8
); u
++)
764 rmap
[UCS2_KATAKANA
+ u
] = MIN_KANA_8
+ u
;
767 /* hack: map NBSP to SP if otherwise no map */
768 if (rmap
[0x00a0] == NOCHAR
) rmap
[0x00a0] = rmap
[0x0020];
770 return rmap
; /* return map */
773 /* Convert UTF-8 sized text to charset using rmap
774 * Accepts: source sized text
776 * pointer to returned sized text
777 * substitute character if not in rmap, else NIL to return failure
778 * ISO-2022-JP conversion flag
779 * Returns T if successful, NIL if failure
781 * This routine doesn't try to convert to all possible charsets; in particular
782 * it doesn't support other Unicode encodings or any ISO 2022 other than
786 long utf8_rmaptext (SIZEDTEXT
*text
,unsigned short *rmap
,SIZEDTEXT
*ret
,
787 unsigned long errch
,long iso2022jp
)
790 /* get size of buffer */
791 if (i
= utf8_rmapsize (text
,rmap
,errch
,iso2022jp
)) {
792 unsigned char *s
= text
->data
;
793 unsigned char *t
= ret
->data
= (unsigned char *) fs_get (i
);
794 ret
->size
= i
- 1; /* number of octets in destination buffer */
795 /* start non-zero ISO-2022-JP state at 1 */
796 if (iso2022jp
) iso2022jp
= 1;
797 /* convert string, ignore BOM */
798 for (i
= text
->size
; i
;) if ((u
= utf8_get (&s
,&i
)) != UCS2_BOM
) {
799 /* substitute error character for NOCHAR */
800 if ((u
& U8GM_NONBMP
) || ((c
= rmap
[u
]) == NOCHAR
)) c
= errch
;
801 switch (iso2022jp
) { /* depends upon ISO 2022 mode */
802 case 0: /* ISO 2022 not in effect */
803 /* two-byte character */
804 if (c
> 0xff) *t
++ = (unsigned char) (c
>> 8);
805 /* single-byte or low-byte of two-byte */
806 *t
++ = (unsigned char) (c
& 0xff);
808 case 1: /* ISO 2022 Roman */
810 if (c
< 0x80) *t
++ = (unsigned char) c
;
811 else { /* JIS character */
812 *t
++ = I2C_ESC
; /* ESC $ B <hi> <lo> */
814 *t
++ = I2CS_94x94_JIS_NEW
;
815 *t
++ = (unsigned char) (c
>> 8) & 0x7f;
816 *t
++ = (unsigned char) c
& 0x7f;
817 iso2022jp
= 2; /* shift to ISO 2022 JIS */
820 case 2: /* ISO 2022 JIS */
821 if (c
> 0x7f) { /* <hi> <lo> */
822 *t
++ = (unsigned char) (c
>> 8) & 0x7f;
823 *t
++ = (unsigned char) c
& 0x7f;
825 else { /* ASCII character */
826 *t
++ = I2C_ESC
; /* ESC ( J <ch> */
828 *t
++ = I2CS_94_JIS_ROMAN
;
829 *t
++ = (unsigned char) c
;
830 iso2022jp
= 1; /* shift to ISO 2022 Roman */
835 if (iso2022jp
== 2) { /* ISO-2022-JP string must end in Roman */
836 *t
++ = I2C_ESC
; /* ESC ( J */
838 *t
++ = I2CS_94_JIS_ROMAN
;
840 *t
++ = NIL
; /* tie off returned data */
841 return LONGT
; /* return success */
845 return NIL
; /* failure */
848 /* Calculate size of convertsion of UTF-8 sized text to charset using rmap
849 * Accepts: source sized text
851 * pointer to returned sized text
852 * substitute character if not in rmap, else NIL to return failure
853 * ISO-2022-JP conversion flag
854 * Returns size+1 if successful, NIL if failure
856 * This routine doesn't try to handle to all possible charsets; in particular
857 * it doesn't support other Unicode encodings or any ISO 2022 other than
861 unsigned long utf8_rmapsize (SIZEDTEXT
*text
,unsigned short *rmap
,
862 unsigned long errch
,long iso2022jp
)
865 unsigned long ret
= 1; /* terminating NUL */
866 unsigned char *s
= text
->data
;
867 if (iso2022jp
) iso2022jp
= 1; /* start non-zero ISO-2022-JP state at 1 */
868 for (i
= text
->size
; i
;) if ((u
= utf8_get (&s
,&i
)) != UCS2_BOM
) {
869 if ((u
& U8GM_NONBMP
) || (((c
= rmap
[u
]) == NOCHAR
) && !(c
= errch
)))
870 return NIL
; /* not in BMP, or NOCHAR and no err char */
871 switch (iso2022jp
) { /* depends upon ISO 2022 mode */
872 case 0: /* ISO 2022 not in effect */
873 ret
+= (c
> 0xff) ? 2 : 1;
875 case 1: /* ISO 2022 Roman */
876 if (c
< 0x80) ret
+= 1; /* <ch> */
877 else { /* JIS character */
878 ret
+= 5; /* ESC $ B <hi> <lo> */
879 iso2022jp
= 2; /* shift to ISO 2022 JIS */
882 case 2: /* ISO 2022 JIS */
883 if (c
> 0x7f) ret
+= 2; /* <hi> <lo> */
884 else { /* ASCII character */
885 ret
+= 4; /* ESC ( J <ch> */
886 iso2022jp
= 1; /* shift to ISO 2022 Roman */
891 if (iso2022jp
== 2) { /* ISO-2022-JP string must end in Roman */
892 ret
+= 3; /* ESC ( J */
893 iso2022jp
= 1; /* reset state to Roman */
898 /* Convert UCS-4 to charset using rmap
899 * Accepts: source UCS-4 character(s)
900 * numver of UCS-4 characters
902 * pointer to returned sized text
903 * substitute character if not in rmap, else NIL to return failure
904 * Returns T if successful, NIL if failure
906 * Currently only supports BMP characters, and does not support ISO-2022
909 long ucs4_rmaptext (unsigned long *ucs4
,unsigned long len
,unsigned short *rmap
,
910 SIZEDTEXT
*ret
,unsigned long errch
)
912 long size
= ucs4_rmaplen (ucs4
,len
,rmap
,errch
);
913 return (size
>= 0) ? /* build in newly-created buffer */
914 ucs4_rmapbuf (ret
->data
= (unsigned char *) fs_get ((ret
->size
= size
) +1),
915 ucs4
,len
,rmap
,errch
) : NIL
;
918 /* Return size of UCS-4 string converted to other CS via rmap
919 * Accepts: source UCS-4 character(s)
920 * numver of UCS-4 characters
922 * substitute character if not in rmap, else NIL to return failure
923 * Returns: length if success, negative if failure (no-convert)
926 long ucs4_rmaplen (unsigned long *ucs4
,unsigned long len
,unsigned short *rmap
,
931 /* count non-BOM characters */
932 for (ret
= 0,i
= 0; i
< len
; ++i
) if ((u
= ucs4
[i
]) != UCS2_BOM
) {
933 if ((u
& U8GM_NONBMP
) || (((c
= rmap
[u
]) == NOCHAR
) && !(c
= errch
)))
934 return -1; /* not in BMP, or NOCHAR and no err char? */
935 ret
+= (c
> 0xff) ? 2 : 1;
941 /* Stuff buffer with UCS-4 string converted to other CS via rmap
942 * Accepts: destination buffer
943 * source UCS-4 character(s)
944 * number of UCS-4 characters
946 * substitute character if not in rmap, else NIL to return failure
950 long ucs4_rmapbuf (unsigned char *t
,unsigned long *ucs4
,unsigned long len
,
951 unsigned short *rmap
,unsigned long errch
)
954 /* convert non-BOM characters */
955 for (i
= 0; i
< len
; ++i
) if ((u
= ucs4
[i
]) != UCS2_BOM
) {
956 /* substitute error character for NOCHAR */
957 if ((u
& U8GM_NONBMP
) || ((c
= rmap
[u
]) == NOCHAR
)) c
= errch
;
958 /* two-byte character? */
959 if (c
> 0xff) *t
++ = (unsigned char) (c
>> 8);
960 /* single-byte or low-byte of two-byte */
961 *t
++ = (unsigned char) (c
& 0xff);
963 *t
++ = NIL
; /* tie off returned data */
967 /* Return UCS-4 Unicode character from UTF-8 string
968 * Accepts: pointer to string
969 * remaining octets in string
970 * Returns: UCS-4 character with pointer and count updated
971 * or error code with pointer and count unchanged
974 unsigned long utf8_get (unsigned char **s
,unsigned long *i
)
976 unsigned char *t
= *s
;
977 unsigned long j
= *i
;
978 /* decode raw UTF-8 string */
979 unsigned long ret
= utf8_get_raw (&t
,&j
);
980 if (ret
& U8G_ERROR
); /* invalid raw UTF-8 decoding? */
981 /* no, is it surrogate? */
982 else if ((ret
>= UTF16_SURR
) && (ret
<= UTF16_MAXSURR
)) ret
= U8G_SURROGA
;
983 /* or in non-Unicode ISO 10646 space? */
984 else if (ret
> UCS4_MAXUNICODE
) ret
= U8G_NOTUNIC
;
986 *s
= t
; /* all is well, update pointer */
987 *i
= j
; /* and counter */
989 return ret
; /* return value */
992 /* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
993 * Accepts: pointer to string
994 * remaining octets in string
995 * Returns: UCS-4 character with pointer and count updated
996 * or error code with pointer and count unchanged
/* utf8_get_raw: decode one UTF-8 sequence starting at *s, using at most *i
 * octets.  Decoding is done on local copies (t, j); *s and *i are written
 * back only when the result does not have U8G_ERROR set, so every error
 * return leaves the caller's pointer and count untouched.
 * NOTE(review): the declarations of c, c1 and more are not visible in this
 * excerpt; more is used below as the number of continuation octets owed.
 */
999 unsigned long utf8_get_raw (unsigned char **s
,unsigned long *i
)
1002 unsigned char *t
= *s
;
1003 unsigned long j
= *i
;
/* start from the error value; a lead octet that no branch below accepts
 * leaves ret at U8G_NOTUTF8
 */
1004 unsigned long ret
= U8G_NOTUTF8
;
1006 do { /* make sure have source octets available */
/* running dry mid-sequence (more != 0) and at a sequence start are
 * reported with different codes
 */
1007 if (!j
--) return more
? U8G_ENDSTRI
: U8G_ENDSTRG
;
1008 /* UTF-8 continuation? */
1009 else if (((c
= *t
++) > 0x7f) && (c
< 0xc0)) {
1010 /* continuation when not in progress */
1011 if (!more
) return U8G_BADCONT
;
1012 --more
; /* found a continuation octet */
1013 ret
<<= 6; /* shift current value by 6 bits */
1014 ret
|= c
& 0x3f; /* merge continuation octet */
1016 /* incomplete UTF-8 character */
1017 else if (more
) return U8G_INCMPLT
;
1018 else { /* start of sequence */
/* c1 peeks at the next octet without consuming it; when nothing is left
 * it is given the value 0xbf
 */
1019 c1
= j
? *t
: 0xbf; /* assume valid continuation if incomplete */
1020 if (c
< 0x80) ret
= c
; /* U+0000 - U+007f */
1021 else if (c
< 0xc2); /* c0 and c1 never valid */
1022 else if (c
< 0xe0) { /* U+0080 - U+07ff */
1024 if (c1
>= 0x80) more
= 1;
1026 else if (c
== 0xe0) { /* U+0800 - U+0fff */
/* lead e0 requires a second octet of at least 0xa0 */
1028 if (c1
>= 0xa0) more
= 2;
1030 else if (c
< 0xed) { /* U+1000 - U+cfff */
1032 if (c1
>= 0x80) more
= 2;
1034 else if (c
== 0xed) { /* U+d000 - U+d7ff */
/* lead ed accepts only second octets 0x80-0x9f, or no octet at all */
1036 if (j
== 0 || ((c1
>= 0x80) && (c1
<= 0x9f))) more
= 2;
1038 else if (c
< 0xf0) { /* U+e000 - U+ffff */
1040 if (c1
>= 0x80) more
= 2;
1042 else if (c
== 0xf0) { /* U+10000 - U+3ffff */
/* lead f0 requires a second octet of at least 0x90 */
1044 if (c1
>= 0x90) more
= 3;
/* NOTE(review): as far as this excerpt shows, lead octet 0xf3 is matched
 * neither here nor by the 0xf4 test and is handled by the c < 0xf8 branch
 */
1046 else if (c
< 0xf3) { /* U+40000 - U+fffff */
1048 if (c1
>= 0x80) more
= 3;
1051 else if (c
== 0xf4) { /* U+100000 - U+10ffff */
1053 if (((c1
>= 0x80) && (c1
<= 0x8f))) more
= 3;
1056 else if (c
< 0xf8) { /* U+100000 - U+10ffff (and 110000 - 1fffff) */
1058 if ((c1
>= 0x80)) more
= 3;
1060 else if (c
< 0xfc) { /* ISO 10646 200000 - 3ffffff */
1062 if ((c1
>= 0x80)) more
= 4;
1064 else if (c
< 0xfe) { /* ISO 10646 4000000 - 7fffffff */
1066 if ((c1
>= 0x80)) more
= 5;
1069 /* fe and ff never valid */
1070 if (more
) { /* multi-octet, make sure more to come */
1071 if (!j
) return U8G_ENDSTRI
;
1072 ret
= c
; /* continuation needed, save start bits */
/* commit the local pointer and count only for a non-error result */
1076 if (!(ret
& U8G_ERROR
)) { /* success return? */
1077 *s
= t
; /* yes, update pointer */
1078 *i
= j
; /* and counter */
1080 return ret
; /* return value */
1083 /* Return UCS-4 character from named charset string
1086 * remaining octets in string
1087 * Returns: UCS-4 character with pointer and count updated, negative if error
1089 * Error codes are the same as utf8_get().
1092 unsigned long ucs4_cs_get (CHARSET
*cs
,unsigned char **s
,unsigned long *i
)
1094 unsigned char c
,c1
,ku
,ten
;
1095 unsigned long ret
,d
;
1096 unsigned char *t
= *s
;
1097 unsigned long j
= *i
;
1098 struct utf8_eucparam
*p1
,*p2
,*p3
;
1099 if (j
--) c
= *t
++; /* get first octet */
1100 else return U8G_ENDSTRG
; /* empty string */
1101 switch (cs
->type
) { /* convert if type known */
1102 case CT_UTF8
: /* variable UTF-8 encoded Unicode no table */
1103 return utf8_get (s
,i
);
1104 case CT_ASCII
: /* 7-bit ASCII no table */
1105 if (c
>= 0x80) return U8G_NOTUTF8
;
1106 case CT_1BYTE0
: /* 1 byte no table */
1107 ret
= c
; /* identity */
1109 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
1110 ret
= (c
> 0x80) ? ((unsigned short *) cs
->tab
)[c
& BITS7
] : c
;
1112 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
1113 ret
= ((unsigned short *) cs
->tab
)[c
];
1116 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1118 p1
= (struct utf8_eucparam
*) cs
->tab
;
1121 if (j
--) c1
= *t
++; /* get second octet */
1122 else return U8G_ENDSTRI
;
1123 if (!(c1
& BIT8
)) return U8G_NOTUTF8
;
1124 switch (c
) { /* check 8bit code set */
1125 case EUC_CS2
: /* CS2 */
1126 if (p2
->base_ku
) { /* CS2 set up? */
1127 if (p2
->base_ten
) { /* yes, multibyte? */
1128 if (j
--) c
= *t
++; /* get second octet */
1129 else return U8G_ENDSTRI
;
1131 ((ku
= (c1
& BITS7
) - p2
->base_ku
) < p2
->max_ku
) &&
1132 ((ten
= (c
& BITS7
) - p2
->base_ten
) < p2
->max_ten
)) {
1133 ret
= ((unsigned short *) p2
->tab
)[(ku
*p2
->max_ten
) + ten
];
1137 else if ((c1
>= p2
->base_ku
) && (c1
< p2
->max_ku
)) {
1138 ret
= c1
+ ((unsigned long) p2
->tab
);
1142 return U8G_NOTUTF8
; /* CS2 not set up or bogus */
1143 case EUC_CS3
: /* CS3 */
1144 if (p3
->base_ku
) { /* CS3 set up? */
1145 if (p3
->base_ten
) { /* yes, multibyte? */
1146 if (j
--) c
= *t
++; /* get second octet */
1147 else return U8G_ENDSTRI
;
1149 ((ku
= (c1
& BITS7
) - p3
->base_ku
) < p3
->max_ku
) &&
1150 ((ten
= (c
& BITS7
) - p3
->base_ten
) < p3
->max_ten
)) {
1151 ret
= ((unsigned short *) p3
->tab
)[(ku
*p3
->max_ten
) + ten
];
1155 else if ((c1
>= p3
->base_ku
) && (c1
< p3
->max_ku
)) {
1156 ret
= c1
+ ((unsigned long) p3
->tab
);
1160 return U8G_NOTUTF8
; /* CS3 not set up or bogus */
1162 if (((ku
= (c
& BITS7
) - p1
->base_ku
) >= p1
->max_ku
) ||
1163 ((ten
= (c1
& BITS7
) - p1
->base_ten
) >= p1
->max_ten
))
1165 ret
= ((unsigned short *) p1
->tab
)[(ku
*p1
->max_ten
) + ten
];
1166 /* special hack for JIS X 0212: merge rows less than 10 */
1167 if ((ret
== UBOGON
) && ku
&& (ku
< 10) && p3
->tab
&& p3
->base_ten
)
1168 ret
= ((unsigned short *) p3
->tab
)
1169 [((ku
- (p3
->base_ku
- p1
->base_ku
))*p3
->max_ten
) + ten
];
1173 else ret
= c
; /* ASCII character */
1176 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
1177 if (c
& BIT8
) { /* double-byte character? */
1178 p1
= (struct utf8_eucparam
*) cs
->tab
;
1179 if (j
--) c1
= *t
++; /* get second octet */
1180 else return U8G_ENDSTRI
;
1181 if (((ku
= c
- p1
->base_ku
) < p1
->max_ku
) &&
1182 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
))
1183 ret
= ((unsigned short *) p1
->tab
)[(ku
*p1
->max_ten
) + ten
];
1184 else return U8G_NOTUTF8
;
1186 else ret
= c
; /* ASCII character */
1188 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1189 if (c
& BIT8
) { /* double-byte character? */
1190 p1
= (struct utf8_eucparam
*) cs
->tab
;
1192 if (j
--) c1
= *t
++; /* get second octet */
1193 else return U8G_ENDSTRI
;
1194 if (c1
& BIT8
) { /* high vs. low plane */
1195 if ((ku
= c
- p2
->base_ku
) < p2
->max_ku
&&
1196 ((ten
= c1
- p2
->base_ten
) < p2
->max_ten
))
1197 ret
= ((unsigned short *) p1
->tab
)
1198 [(ku
*(p1
->max_ten
+ p2
->max_ten
)) + p1
->max_ten
+ ten
];
1199 else return U8G_NOTUTF8
;
1201 else if ((ku
= c
- p1
->base_ku
) < p1
->max_ku
&&
1202 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
))
1203 ret
= ((unsigned short *) p1
->tab
)
1204 [(ku
*(p1
->max_ten
+ p2
->max_ten
)) + ten
];
1205 else return U8G_NOTUTF8
;
1207 else ret
= c
; /* ASCII character */
1209 case CT_SJIS
: /* 2 byte Shift-JIS encoded JIS no table */
1210 /* compromise - do yen sign but not overline */
1211 if (!(c
& BIT8
)) ret
= (c
== JISROMAN_YEN
) ? UCS2_YEN
: c
;
1212 /* half-width katakana? */
1213 else if ((c
>= MIN_KANA_8
) && (c
< MAX_KANA_8
)) ret
= c
+ KANA_8
;
1214 else { /* Shift-JIS */
1215 if (j
--) c1
= *t
++; /* get second octet */
1216 else return U8G_ENDSTRI
;
1218 ret
= JISTOUNICODE (c
,c1
,ku
,ten
);
1222 case CT_UCS2
: /* 2 byte 16-bit Unicode no table */
1224 if (j
--) c
= *t
++; /* get second octet */
1225 else return U8G_ENDSTRI
; /* empty string */
1228 case CT_UCS4
: /* 4 byte 32-bit Unicode no table */
1229 if (c
& 0x80) return U8G_NOTUTF8
;
1230 if (j
< 3) return U8G_ENDSTRI
;
1231 j
-= 3; /* count three octets */
1233 ret
|= (*t
++) << 16;
1237 case CT_UTF16
: /* variable UTF-16 encoded Unicode no table */
1239 if (j
--) c
= *t
++; /* get second octet */
1240 else return U8G_ENDSTRI
; /* empty string */
1243 if ((ret
>= UTF16_SURR
) && (ret
<= UTF16_MAXSURR
)) {
1244 /* invalid first surrogate */
1245 if ((ret
> UTF16_SURRHEND
) || (j
< 2)) return U8G_NOTUTF8
;
1246 j
-= 2; /* count two octets */
1247 d
= (*t
++) << 8; /* first octet of second surrogate */
1248 d
|= *t
++; /* second octet of second surrogate */
1249 if ((d
< UTF16_SURRL
) || (d
> UTF16_SURRLEND
)) return U8G_NOTUTF8
;
1250 ret
= UTF16_BASE
+ ((ret
& UTF16_MASK
) << UTF16_SHIFT
) +
1254 default: /* unknown/unsupported character set type */
1257 *s
= t
; /* update pointer and counter */
1262 /* Produce charset validity map for BMP
1263 * Accepts: list of charsets to map
1264 * Returns: validity map, indexed by BMP codepoint
1266 * Bit 0x1 is the "not-CJK" character bit
1269 unsigned long *utf8_csvalidmap (char *charsets
[])
1271 unsigned short u
,*tab
;
1272 unsigned int m
,ku
,ten
;
1273 unsigned long i
,csi
,csb
;
1274 struct utf8_eucparam
*param
,*p2
;
1277 unsigned long *ret
= (unsigned long *)
1278 fs_get (i
= 0x10000 * sizeof (unsigned long));
1279 memset (ret
,0,i
); /* zero the entire vector */
1280 /* mark all the non-CJK codepoints */
1281 /* U+0000 - U+2E7F non-CJK */
1282 for (i
= 0; i
< 0x2E7F; ++i
) ret
[i
] = 0x1;
1283 /* U+2E80 - U+2EFF CJK Radicals Supplement
1284 * U+2F00 - U+2FDF Kangxi Radicals
1285 * U+2FE0 - U+2FEF unassigned
1286 * U+2FF0 - U+2FFF Ideographic Description Characters
1287 * U+3000 - U+303F CJK Symbols and Punctuation
1288 * U+3040 - U+309F Hiragana
1289 * U+30A0 - U+30FF Katakana
1290 * U+3100 - U+312F BoPoMoFo
1291 * U+3130 - U+318F Hangul Compatibility Jamo
1292 * U+3190 - U+319F Kanbun
1293 * U+31A0 - U+31BF BoPoMoFo Extended
1294 * U+31C0 - U+31EF CJK Strokes
1295 * U+31F0 - U+31FF Katakana Phonetic Extensions
1296 * U+3200 - U+32FF Enclosed CJK Letters and Months
1297 * U+3300 - U+33FF CJK Compatibility
1298 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
1299 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
1300 * U+4E00 - U+9FFF CJK Unified Ideographs
1301 * U+A000 - U+A48F Yi Syllables
1302 * U+A490 - U+A4CF Yi Radicals
1303 * U+A700 - U+A71F Modifier Tone Letters
1305 for (i
= 0xa720; i
< 0xabff; ++i
) ret
[i
] = 0x1;
1306 /* U+AC00 - U+D7FF Hangul Syllables */
1307 for (i
= 0xd800; i
< 0xf8ff; ++i
) ret
[i
] = 0x1;
1308 /* U+F900 - U+FAFF CJK Compatibility Ideographs */
1309 for (i
= 0xfb00; i
< 0xfe2f; ++i
) ret
[i
] = 0x1;
1310 /* U+FE30 - U+FE4F CJK Compatibility Forms
1311 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
1313 for (i
= 0xfe70; i
< 0xfeff; ++i
) ret
[i
] = 0x1;
1314 /* U+FF00 - U+FFEF CJK Compatibility Ideographs */
1315 for (i
= 0xfff0; i
< 0x10000; ++i
) ret
[i
] = 0x1;
1317 /* for each supplied charset */
1318 for (csi
= 1; ret
&& charsets
&& (s
= charsets
[csi
- 1]); ++csi
) {
1319 /* substitute EUC-JP for ISO-2022-JP */
1320 if (!compare_cstring (s
,"ISO-2022-JP")) s
= "EUC-JP";
1321 /* look up charset */
1322 if (cs
= utf8_charset (s
)) {
1323 csb
= 1 << csi
; /* charset bit */
1325 case CT_ASCII
: /* 7-bit ASCII no table */
1326 case CT_1BYTE0
: /* 1 byte no table */
1327 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
1328 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
1329 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1330 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
1331 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1332 case CT_SJIS
: /* 2 byte Shift-JIS */
1333 /* supported charset type, all ASCII is OK */
1334 for (i
= 0; i
< 128; ++i
) ret
[i
] |= csb
;
1336 default: /* unsupported charset type */
1337 fs_give ((void **) &ret
);
1340 /* now do additional operations */
1341 if (ret
) switch (cs
->type
) {
1342 case CT_1BYTE0
: /* 1 byte no table */
1343 for (i
= 128; i
< 256; i
++) ret
[i
] |= csb
;
1345 case CT_1BYTE
: /* 1 byte ASCII + table 0x80-0xff */
1346 for (tab
= (unsigned short *) cs
->tab
,i
= 128; i
< 256; i
++)
1347 if (tab
[i
& BITS7
] != UBOGON
) ret
[tab
[i
& BITS7
]] |= csb
;
1349 case CT_1BYTE8
: /* 1 byte table 0x00 - 0xff */
1350 for (tab
= (unsigned short *) cs
->tab
,i
= 0; i
< 256; i
++)
1351 if (tab
[i
] != UBOGON
) ret
[tab
[i
]] |= csb
;
1353 case CT_EUC
: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1354 for (param
= (struct utf8_eucparam
*) cs
->tab
,
1355 tab
= (unsigned short *) param
->tab
, ku
= 0;
1356 ku
< param
->max_ku
; ku
++)
1357 for (ten
= 0; ten
< param
->max_ten
; ten
++)
1358 if ((u
= tab
[(ku
* param
->max_ten
) + ten
]) != UBOGON
)
1362 case CT_DBYTE
: /* 2 byte ASCII + utf8_eucparam */
1363 for (param
= (struct utf8_eucparam
*) cs
->tab
,
1364 tab
= (unsigned short *) param
->tab
, ku
= 0;
1365 ku
< param
->max_ku
; ku
++)
1366 for (ten
= 0; ten
< param
->max_ten
; ten
++)
1367 if ((u
= tab
[(ku
* param
->max_ten
) + ten
]) != UBOGON
)
1370 case CT_DBYTE2
: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1371 param
= (struct utf8_eucparam
*) cs
->tab
;
1372 p2
= param
+ 1; /* plane 2 parameters */
1373 /* only ten parameters should differ */
1374 if ((param
->base_ku
!= p2
->base_ku
) || (param
->max_ku
!= p2
->max_ku
))
1375 fatal ("ku definition error for CT_DBYTE2 charset");
1376 /* total codepoints in each ku */
1377 m
= param
->max_ten
+ p2
->max_ten
;
1378 tab
= (unsigned short *) param
->tab
;
1379 for (ku
= 0; ku
< param
->max_ku
; ku
++) {
1380 for (ten
= 0; ten
< param
->max_ten
; ten
++)
1381 if ((u
= tab
[(ku
* m
) + ten
]) != UBOGON
)
1383 for (ten
= 0; ten
< p2
->max_ten
; ten
++)
1384 if ((u
= tab
[(ku
* m
) + param
->max_ten
+ ten
]) != UBOGON
)
1388 case CT_SJIS
: /* 2 byte Shift-JIS */
1389 for (ku
= 0; ku
< MAX_JIS0208_KU
; ku
++)
1390 for (ten
= 0; ten
< MAX_JIS0208_TEN
; ten
++)
1391 if ((u
= jis0208tab
[ku
][ten
]) != UBOGON
) ret
[u
] |= csb
;
1392 /* JIS hankaku katakana */
1393 for (u
= 0; u
< (MAX_KANA_8
- MIN_KANA_8
); u
++)
1394 ret
[UCS2_KATAKANA
+ u
] |= csb
;
1398 /* invalid charset, punt */
1399 else fs_give ((void **) &ret
);
1404 /* Infer charset from unlabelled sized text
1405 * Accepts: sized text
1406 * Returns: charset if one inferred, or NIL if unknown
1409 const CHARSET
*utf8_infercharset (SIZEDTEXT
*src
)
1411 long iso2022jp
= NIL
;
1412 long eightbit
= NIL
;
1414 /* look for ISO 2022 */
1415 if (src
) for (i
= 0; i
< src
->size
; i
++) {
1417 if ((src
->data
[i
] == I2C_ESC
) && (++i
< src
->size
)) switch (src
->data
[i
]) {
1418 case I2C_MULTI
: /* yes, multibyte? */
1419 if (++i
< src
->size
) switch (src
->data
[i
]) {
1420 case I2CS_94x94_JIS_OLD
: /* JIS X 0208-1978 */
1421 case I2CS_94x94_JIS_NEW
: /* JIS X 0208-1983 */
1422 case I2CS_94x94_JIS_EXT
: /* JIS X 0212-1990 (kludge...) */
1423 iso2022jp
= T
; /* found an ISO-2022-JP sequence */
1425 default: /* other multibyte */
1426 return NIL
; /* definitely invalid */
1429 case I2C_G0_94
: /* single byte */
1430 if (++i
< src
->size
) switch (src
->data
[i
]) {
1431 case I2CS_94_JIS_BUGROM
: /* in case old buggy software */
1432 case I2CS_94_JIS_ROMAN
: /* JIS X 0201-1976 left half */
1433 case I2CS_94_ASCII
: /* ASCII */
1434 case I2CS_94_BRITISH
: /* good enough for gov't work */
1436 default: /* other 94 single byte */
1437 return NIL
; /* definitely invalid */
1440 /* if possible UTF-8 and not ISO-2022-JP */
1441 else if (!iso2022jp
&& (eightbit
>= 0) && (src
->data
[i
] & BIT8
) &&
1442 (eightbit
= utf8_validate (src
->data
+ i
,src
->size
- i
)) > 0)
1443 i
+= eightbit
- 1; /* skip past all but last of UTF-8 char */
1445 /* ISO-2022-JP overrides other guesses */
1446 if (iso2022jp
) return utf8_charset ("ISO-2022-JP");
1447 if (eightbit
> 0) return utf8_charset ("UTF-8");
1448 return eightbit
? NIL
: utf8_charset ("US-ASCII");
1452 /* Validate that character at this position is UTF-8
1453 * Accepts: string pointer
1454 * size of remaining string
1455 * Returns: size of UTF-8 character in octets or -1 if not UTF-8
1458 long utf8_validate (unsigned char *s
,unsigned long i
)
1460 unsigned long j
= i
;
1461 return (utf8_get (&s
,&i
) & U8G_ERROR
) ? -1 : j
- i
;
1464 /* Convert ISO 8859-1 to UTF-8
1465 * Accepts: source sized text
1466 * pointer to return sized text
1467 * canonicalization function
1470 void utf8_text_1byte0 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
1475 for (ret
->size
= i
= 0; i
< text
->size
;) {
1476 c
= text
->data
[i
++];
1477 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
)
1479 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] =NIL
;
1480 for (i
= 0; i
< text
->size
;) {
1481 c
= text
->data
[i
++];
1482 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
1487 /* Convert single byte ASCII+8bit character set sized text to UTF-8
1488 * Accepts: source sized text
1489 * pointer to return sized text
1491 * canonicalization function
1494 void utf8_text_1byte (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,void *tab
,ucs4cn_t cv
,
1500 unsigned short *tbl
= (unsigned short *) tab
;
1501 for (ret
->size
= i
= 0; i
< text
->size
;) {
1502 if ((c
= text
->data
[i
++]) & BIT8
) c
= tbl
[c
& BITS7
];
1503 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
)
1505 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] =NIL
;
1506 for (i
= 0; i
< text
->size
;) {
1507 if ((c
= text
->data
[i
++]) & BIT8
) c
= tbl
[c
& BITS7
];
1508 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
1512 /* Convert single byte 8bit character set sized text to UTF-8
1513 * Accepts: source sized text
1514 * pointer to return sized text
1516 * canonicalization function
1519 void utf8_text_1byte8 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,void *tab
,ucs4cn_t cv
,
1525 unsigned short *tbl
= (unsigned short *) tab
;
1526 for (ret
->size
= i
= 0; i
< text
->size
;) {
1527 c
= tbl
[text
->data
[i
++]];
1528 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
)
1530 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] =NIL
;
1531 for (i
= 0; i
< text
->size
;) {
1532 c
= tbl
[text
->data
[i
++]];
1533 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
1537 /* Convert EUC sized text to UTF-8
1538 * Accepts: source sized text
1539 * pointer to return sized text
1540 * EUC parameter table
1541 * canonicalization function
1544 void utf8_text_euc (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,void *tab
,ucs4cn_t cv
,
1549 unsigned int pass
,c
,c1
,ku
,ten
;
1550 struct utf8_eucparam
*p1
= (struct utf8_eucparam
*) tab
;
1551 struct utf8_eucparam
*p2
= p1
+ 1;
1552 struct utf8_eucparam
*p3
= p1
+ 2;
1553 unsigned short *t1
= (unsigned short *) p1
->tab
;
1554 unsigned short *t2
= (unsigned short *) p2
->tab
;
1555 unsigned short *t3
= (unsigned short *) p3
->tab
;
1556 for (pass
= 0,s
= NIL
,ret
->size
= 0; pass
<= 1; pass
++) {
1557 for (i
= 0; i
< text
->size
;) {
1559 if ((c
= text
->data
[i
++]) & BIT8
) {
1560 /* yes, must have another high byte */
1561 if ((i
>= text
->size
) || !((c1
= text
->data
[i
++]) & BIT8
))
1562 c
= UBOGON
; /* out of space or bogon */
1563 else switch (c
) { /* check 8bit code set */
1564 case EUC_CS2
: /* CS2 */
1565 if (p2
->base_ku
) { /* CS2 set up? */
1566 if (p2
->base_ten
) /* yes, multibyte? */
1567 c
= ((i
< text
->size
) && ((c
= text
->data
[i
++]) & BIT8
) &&
1568 ((ku
= (c1
& BITS7
) - p2
->base_ku
) < p2
->max_ku
) &&
1569 ((ten
= (c
& BITS7
) - p2
->base_ten
) < p2
->max_ten
)) ?
1570 t2
[(ku
*p2
->max_ten
) + ten
] : UBOGON
;
1571 else c
= ((c1
>= p2
->base_ku
) && (c1
< p2
->max_ku
)) ?
1572 c1
+ ((unsigned long) p2
->tab
) : UBOGON
;
1574 else { /* CS2 not set up */
1575 c
= UBOGON
; /* swallow byte, say bogon */
1576 if (i
< text
->size
) i
++;
1579 case EUC_CS3
: /* CS3 */
1580 if (p3
->base_ku
) { /* CS3 set up? */
1581 if (p3
->base_ten
) /* yes, multibyte? */
1582 c
= ((i
< text
->size
) && ((c
= text
->data
[i
++]) & BIT8
) &&
1583 ((ku
= (c1
& BITS7
) - p3
->base_ku
) < p3
->max_ku
) &&
1584 ((ten
= (c
& BITS7
) - p3
->base_ten
) < p3
->max_ten
)) ?
1585 t3
[(ku
*p3
->max_ten
) + ten
] : UBOGON
;
1586 else c
= ((c1
>= p3
->base_ku
) && (c1
< p3
->max_ku
)) ?
1587 c1
+ ((unsigned long) p3
->tab
) : UBOGON
;
1589 else { /* CS3 not set up */
1590 c
= UBOGON
; /* swallow byte, say bogon */
1591 if (i
< text
->size
) i
++;
1596 if (((ku
= (c
& BITS7
) - p1
->base_ku
) >= p1
->max_ku
) ||
1597 ((ten
= (c1
& BITS7
) - p1
->base_ten
) >= p1
->max_ten
)) c
= UBOGON
;
1598 else if (((c
= t1
[(ku
*p1
->max_ten
) + ten
]) == UBOGON
) &&
1599 /* special hack for JIS X 0212: merge rows less than 10 */
1600 ku
&& (ku
< 10) && t3
&& p3
->base_ten
)
1601 c
= t3
[((ku
- (p3
->base_ku
- p1
->base_ku
))*p3
->max_ten
) + ten
];
1604 /* convert if second pass */
1605 if (pass
) UTF8_WRITE_BMP (s
,c
,cv
,de
)
1606 else UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
);
1608 if (!pass
) (s
= ret
->data
= (unsigned char *)
1609 fs_get (ret
->size
+ 1))[ret
->size
] =NIL
;
1614 /* Convert ASCII + double-byte sized text to UTF-8
1615 * Accepts: source sized text
1616 * pointer to return sized text
1618 * canonicalization function
1621 void utf8_text_dbyte (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,void *tab
,ucs4cn_t cv
,
1626 unsigned int c
,c1
,ku
,ten
;
1627 struct utf8_eucparam
*p1
= (struct utf8_eucparam
*) tab
;
1628 unsigned short *t1
= (unsigned short *) p1
->tab
;
1629 for (ret
->size
= i
= 0; i
< text
->size
;) {
1630 if ((c
= text
->data
[i
++]) & BIT8
) {
1631 /* special hack for GBK: 0x80 is Euro */
1632 if ((c
== 0x80) && (t1
== (unsigned short *) gb2312tab
)) c
= UCS2_EURO
;
1633 else c
= ((i
< text
->size
) && (c1
= text
->data
[i
++]) &&
1634 ((ku
= c
- p1
->base_ku
) < p1
->max_ku
) &&
1635 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
)) ?
1636 t1
[(ku
*p1
->max_ten
) + ten
] : UBOGON
;
1638 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
)
1640 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
1641 for (i
= 0; i
< text
->size
;) {
1642 if ((c
= text
->data
[i
++]) & BIT8
) {
1643 /* special hack for GBK: 0x80 is Euro */
1644 if ((c
== 0x80) && (t1
== (unsigned short *) gb2312tab
)) c
= UCS2_EURO
;
1645 else c
= ((i
< text
->size
) && (c1
= text
->data
[i
++]) &&
1646 ((ku
= c
- p1
->base_ku
) < p1
->max_ku
) &&
1647 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
)) ?
1648 t1
[(ku
*p1
->max_ten
) + ten
] : UBOGON
;
1650 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
1654 /* Convert ASCII + double byte 2 plane sized text to UTF-8
1655 * Accepts: source sized text
1656 * pointer to return sized text
1658 * canonicalization function
1661 void utf8_text_dbyte2 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,void *tab
,ucs4cn_t cv
,
1666 unsigned int c
,c1
,ku
,ten
;
1667 struct utf8_eucparam
*p1
= (struct utf8_eucparam
*) tab
;
1668 struct utf8_eucparam
*p2
= p1
+ 1;
1669 unsigned short *t
= (unsigned short *) p1
->tab
;
1670 for (ret
->size
= i
= 0; i
< text
->size
;) {
1671 if ((c
= text
->data
[i
++]) & BIT8
) {
1672 if ((i
>= text
->size
) || !(c1
= text
->data
[i
++]))
1673 c
= UBOGON
; /* out of space or bogon */
1674 else if (c1
& BIT8
) /* high vs. low plane */
1675 c
= ((ku
= c
- p2
->base_ku
) < p2
->max_ku
&&
1676 ((ten
= c1
- p2
->base_ten
) < p2
->max_ten
)) ?
1677 t
[(ku
*(p1
->max_ten
+ p2
->max_ten
)) + p1
->max_ten
+ ten
] :UBOGON
;
1678 else c
= ((ku
= c
- p1
->base_ku
) < p1
->max_ku
&&
1679 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
)) ?
1680 t
[(ku
*(p1
->max_ten
+ p2
->max_ten
)) + ten
] : UBOGON
;
1682 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
)
1684 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
1685 for (i
= 0; i
< text
->size
;) {
1686 if ((c
= text
->data
[i
++]) & BIT8
) {
1687 if ((i
>= text
->size
) || !(c1
= text
->data
[i
++]))
1688 c
= UBOGON
; /* out of space or bogon */
1689 else if (c1
& BIT8
) /* high vs. low plane */
1690 c
= ((ku
= c
- p2
->base_ku
) < p2
->max_ku
&&
1691 ((ten
= c1
- p2
->base_ten
) < p2
->max_ten
)) ?
1692 t
[(ku
*(p1
->max_ten
+ p2
->max_ten
)) + p1
->max_ten
+ ten
] :UBOGON
;
1693 else c
= ((ku
= c
- p1
->base_ku
) < p1
->max_ku
&&
1694 ((ten
= c1
- p1
->base_ten
) < p1
->max_ten
)) ?
1695 t
[(ku
*(p1
->max_ten
+ p2
->max_ten
)) + ten
] : UBOGON
;
1697 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
#ifdef JISTOUNICODE             /* Japanese */
/* Convert Shift JIS sized text to UTF-8
 * Accepts: source sized text
 *          pointer to return sized text
 *          canonicalization function
 *          decomposition function
 *
 * Runs two passes over the text with the same decoding code: pass 0 counts
 * the output octets and allocates the buffer, pass 1 writes into it.  Since
 * both passes share one decoder they can not disagree about the output
 * size, and a lead octet that is the last octet of the text becomes UBOGON
 * in both passes instead of reading past the end of the source.
 */

void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
                     ucs4de_t de)
{
  unsigned long i;
  unsigned char *s = NIL;
  unsigned int pass,c,c1,ku,ten;
  ret->size = 0;
  for (pass = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
      if ((c = text->data[i++]) & BIT8) {
                                /* half-width katakana */
        if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
                                /* lead octet with nothing after it */
        else if (i >= text->size) c = UBOGON;
        else {                  /* Shift-JIS */
          c1 = text->data[i++];
          SJISTOJIS (c,c1);
          c = JISTOUNICODE (c,c1,ku,ten);
        }
      }
                                /* compromise - do yen sign but not overline */
      else if (c == JISROMAN_YEN) c = UCS2_YEN;
                                /* convert if second pass */
      if (pass) { UTF8_WRITE_BMP (s,c,cv,de) }
      else { UTF8_COUNT_BMP (ret->size,c,cv,de) }
    }
    if (!pass) {                /* counting done, get buffer and tie it off */
      s = ret->data = (unsigned char *) fs_get (ret->size + 1);
      s[ret->size] = NIL;
    }
  }
}
#endif
1747 /* Convert ISO-2022 sized text to UTF-8
1748 * Accepts: source sized text
1749 * pointer to returned sized text
1750 * canonicalization function
1753 void utf8_text_2022 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
1757 unsigned int pass
,state
,c
,co
,gi
,gl
,gr
,g
[4],ku
,ten
;
1758 for (pass
= 0,s
= NIL
,ret
->size
= 0; pass
<= 1; pass
++) {
1759 gi
= 0; /* quell compiler warnings */
1760 state
= I2S_CHAR
; /* initialize engine */
1761 g
[0]= g
[2] = I2CS_ASCII
; /* G0 and G2 are ASCII */
1762 g
[1]= g
[3] = I2CS_ISO8859_1
;/* G1 and G3 are ISO-8850-1 */
1763 gl
= I2C_G0
; gr
= I2C_G1
; /* left is G0, right is G1 */
1764 for (i
= 0; i
< text
->size
;) {
1765 c
= text
->data
[i
++];
1766 switch (state
) { /* dispatch based upon engine state */
1767 case I2S_ESC
: /* ESC seen */
1768 switch (c
) { /* process intermediate character */
1769 case I2C_MULTI
: /* multibyte character? */
1770 state
= I2S_MUL
; /* mark multibyte flag seen */
1772 case I2C_SS2
: /* single shift GL to G2 */
1773 case I2C_SS2_ALT
: /* Taiwan SeedNet */
1776 case I2C_SS3
: /* single shift GL to G3 */
1777 case I2C_SS3_ALT
: /* Taiwan SeedNet */
1780 case I2C_LS2
: /* shift GL to G2 */
1783 case I2C_LS3
: /* shift GL to G3 */
1786 case I2C_LS1R
: /* shift GR to G1 */
1789 case I2C_LS2R
: /* shift GR to G2 */
1792 case I2C_LS3R
: /* shift GR to G3 */
1795 case I2C_G0_94
: case I2C_G1_94
: case I2C_G2_94
: case I2C_G3_94
:
1796 g
[gi
= c
- I2C_G0_94
] = (state
== I2S_MUL
) ? I2CS_94x94
: I2CS_94
;
1797 state
= I2S_INT
; /* ready for character set */
1799 case I2C_G0_96
: case I2C_G1_96
: case I2C_G2_96
: case I2C_G3_96
:
1800 g
[gi
= c
- I2C_G0_96
] = (state
== I2S_MUL
) ? I2CS_96x96
: I2CS_96
;
1801 state
= I2S_INT
; /* ready for character set */
1803 default: /* bogon */
1804 if (pass
) *s
++ = I2C_ESC
,*s
++ = c
;
1805 else ret
->size
+= 2;
1806 state
= I2S_CHAR
; /* return to previous state */
1810 case I2S_MUL
: /* ESC $ */
1811 switch (c
) { /* process multibyte intermediate character */
1812 case I2C_G0_94
: case I2C_G1_94
: case I2C_G2_94
: case I2C_G3_94
:
1813 g
[gi
= c
- I2C_G0_94
] = I2CS_94x94
;
1814 state
= I2S_INT
; /* ready for character set */
1816 case I2C_G0_96
: case I2C_G1_96
: case I2C_G2_96
: case I2C_G3_96
:
1817 g
[gi
= c
- I2C_G0_96
] = I2CS_96x96
;
1818 state
= I2S_INT
; /* ready for character set */
1820 default: /* probably omitted I2CS_94x94 */
1821 g
[gi
= I2C_G0
] = I2CS_94x94
| c
;
1822 state
= I2S_CHAR
; /* return to character state */
1826 state
= I2S_CHAR
; /* return to character state */
1827 g
[gi
] |= c
; /* set character set */
1830 case I2S_CHAR
: /* character data */
1832 case I2C_ESC
: /* ESC character */
1833 state
= I2S_ESC
; /* see if ISO-2022 prefix */
1835 case I2C_SI
: /* shift GL to G0 */
1838 case I2C_SO
: /* shift GL to G1 */
1841 case I2C_SS2_ALT
: /* single shift GL to G2 */
1845 case I2C_SS3_ALT
: /* single shift GL to G3 */
1850 default: /* ordinary character */
1851 co
= c
; /* note original character */
1852 if (gl
& (3 << 2)) { /* single shifted? */
1853 gi
= g
[gl
>> 2]; /* get shifted character set */
1854 gl
&= 0x3; /* cancel shift */
1856 /* select left or right half */
1857 else gi
= (c
& BIT8
) ? g
[gr
] : g
[gl
];
1858 c
&= BITS7
; /* make 7-bit */
1859 switch (gi
) { /* interpret in character set */
1860 case I2CS_ASCII
: /* ASCII */
1862 case I2CS_BRITISH
: /* British ASCII */
1863 /* Pound sterling sign */
1864 if (c
== BRITISH_POUNDSTERLING
) c
= UCS2_POUNDSTERLING
;
1866 case I2CS_JIS_ROMAN
: /* JIS Roman */
1867 case I2CS_JIS_BUGROM
: /* old bugs */
1868 switch (c
) { /* two exceptions to ASCII */
1869 case JISROMAN_YEN
: /* Yen sign */
1873 case JISROMAN_OVERLINE
:
1878 case I2CS_JIS_KANA
: /* JIS hankaku katakana */
1879 if ((c
>= MIN_KANA_7
) && (c
< MAX_KANA_7
)) c
+= KANA_7
;
1882 case I2CS_ISO8859_1
: /* Latin-1 (West European) */
1883 c
|= BIT8
; /* just turn on high bit */
1885 case I2CS_ISO8859_2
: /* Latin-2 (Czech, Slovak) */
1886 c
= iso8859_2tab
[c
];
1888 case I2CS_ISO8859_3
: /* Latin-3 (Dutch, Turkish) */
1889 c
= iso8859_3tab
[c
];
1891 case I2CS_ISO8859_4
: /* Latin-4 (Scandinavian) */
1892 c
= iso8859_4tab
[c
];
1894 case I2CS_ISO8859_5
: /* Cyrillic */
1895 c
= iso8859_5tab
[c
];
1897 case I2CS_ISO8859_6
: /* Arabic */
1898 c
= iso8859_6tab
[c
];
1900 case I2CS_ISO8859_7
: /* Greek */
1901 c
= iso8859_7tab
[c
];
1903 case I2CS_ISO8859_8
: /* Hebrew */
1904 c
= iso8859_8tab
[c
];
1906 case I2CS_ISO8859_9
: /* Latin-5 (Finnish, Portuguese) */
1907 c
= iso8859_9tab
[c
];
1909 case I2CS_TIS620
: /* Thai */
1912 case I2CS_ISO8859_10
: /* Latin-6 (Northern Europe) */
1913 c
= iso8859_10tab
[c
];
1915 case I2CS_ISO8859_13
: /* Latin-7 (Baltic) */
1916 c
= iso8859_13tab
[c
];
1918 case I2CS_VSCII
: /* Vietnamese */
1921 case I2CS_ISO8859_14
: /* Latin-8 (Celtic) */
1922 c
= iso8859_14tab
[c
];
1924 case I2CS_ISO8859_15
: /* Latin-9 (Euro) */
1925 c
= iso8859_15tab
[c
];
1927 case I2CS_ISO8859_16
: /* Latin-10 (Baltic) */
1928 c
= iso8859_16tab
[c
];
1931 default: /* all other character sets */
1932 /* multibyte character set */
1933 if ((gi
& I2CS_MUL
) && !(c
& BIT8
) && isgraph (c
)) {
1934 c
= (i
< text
->size
) ? text
->data
[i
++] : 0;
1937 case I2CS_GB
: /* GB 2312 */
1938 co
|= BIT8
; /* make into EUC */
1940 c
= GBTOUNICODE (co
,c
,ku
,ten
);
1944 case I2CS_JIS_OLD
:/* JIS X 0208-1978 */
1945 case I2CS_JIS_NEW
:/* JIS X 0208-1983 */
1946 c
= JISTOUNICODE (co
,c
,ku
,ten
);
1949 #ifdef JIS0212TOUNICODE
1950 case I2CS_JIS_EXT
:/* JIS X 0212-1990 */
1951 c
= JIS0212TOUNICODE (co
,c
,ku
,ten
);
1955 case I2CS_KSC
: /* KSC 5601 */
1956 co
|= BIT8
; /* make into EUC */
1958 c
= KSCTOUNICODE (co
,c
,ku
,ten
);
1961 #ifdef CNS1TOUNICODE
1962 case I2CS_CNS1
: /* CNS 11643 plane 1 */
1963 c
= CNS1TOUNICODE (co
,c
,ku
,ten
);
1966 #ifdef CNS2TOUNICODE
1967 case I2CS_CNS2
: /* CNS 11643 plane 2 */
1968 c
= CNS2TOUNICODE (co
,c
,ku
,ten
);
1971 #ifdef CNS3TOUNICODE
1972 case I2CS_CNS3
: /* CNS 11643 plane 3 */
1973 c
= CNS3TOUNICODE (co
,c
,ku
,ten
);
1976 #ifdef CNS4TOUNICODE
1977 case I2CS_CNS4
: /* CNS 11643 plane 4 */
1978 c
= CNS4TOUNICODE (co
,c
,ku
,ten
);
1981 #ifdef CNS5TOUNICODE
1982 case I2CS_CNS5
: /* CNS 11643 plane 5 */
1983 c
= CNS5TOUNICODE (co
,c
,ku
,ten
);
1986 #ifdef CNS6TOUNICODE
1987 case I2CS_CNS6
: /* CNS 11643 plane 6 */
1988 c
= CNS6TOUNICODE (co
,c
,ku
,ten
);
1991 #ifdef CNS7TOUNICODE
1992 case I2CS_CNS7
: /* CNS 11643 plane 7 */
1993 c
= CNS7TOUNICODE (co
,c
,ku
,ten
);
1996 default: /* unknown multibyte, treat as UCS-2 */
1997 c
|= (co
<< 8); /* wrong, but nothing else to do */
2001 else c
= co
; /* unknown single byte, treat as 8859-1 */
2003 /* convert if second pass */
2004 if (pass
) UTF8_WRITE_BMP (s
,c
,cv
,de
)
2005 else UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
);
2009 if (!pass
) (s
= ret
->data
= (unsigned char *)
2010 fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
2011 else if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2012 fatal ("ISO-2022 to UTF-8 botch");
2016 /* Convert UTF-7 sized text to UTF-8
2017 * Accepts: source sized text
2018 * pointer to returned sized text
2019 * canonicalization function
2022 void utf8_text_utf7 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
2026 unsigned int c
,c1
,d
,uc
,pass
,e
,e1
,state
,surrh
;
2027 for (pass
= 0,s
= NIL
,ret
->size
= 0; pass
<= 1; pass
++) {
2028 c1
= d
= uc
= e
= e1
= 0;
2029 for (i
= 0,state
= NIL
; i
< text
->size
;) {
2030 c
= text
->data
[i
++]; /* get next byte */
2032 case U7_PLUS
: /* previous character was + */
2033 if (c
== '-') { /* +- means textual + */
2035 state
= U7_ASCII
; /* revert to ASCII */
2038 state
= U7_UNICODE
; /* enter Unicode state */
2039 e
= e1
= 0; /* initialize Unicode quantum position */
2040 case U7_UNICODE
: /* Unicode state */
2041 if (c
== '-') state
= U7_MINUS
;
2042 else { /* decode Unicode */
2043 /* don't use isupper/islower since this is ASCII only */
2044 if ((c
>= 'A') && (c
<= 'Z')) c
-= 'A';
2045 else if ((c
>= 'a') && (c
<= 'z')) c
-= 'a' - 26;
2046 else if (isdigit (c
)) c
-= '0' - 52;
2047 else if (c
== '+') c
= 62;
2048 else if (c
== '/') c
= 63;
2049 else state
= U7_ASCII
;/* end of modified BASE64 */
2052 case U7_MINUS
: /* previous character was absorbed - */
2053 state
= U7_ASCII
; /* revert to ASCII */
2054 case U7_ASCII
: /* ASCII state */
2055 if (c
== '+') state
= U7_PLUS
;
2059 switch (state
) { /* store character if in character mode */
2060 case U7_UNICODE
: /* Unicode */
2061 switch (e
++) { /* install based on BASE64 state */
2063 c1
= c
<< 2; /* byte 1: high 6 bits */
2066 d
= c1
| (c
>> 4); /* byte 1: low 2 bits */
2067 c1
= c
<< 4; /* byte 2: high 4 bits */
2070 d
= c1
| (c
>> 2); /* byte 2: low 4 bits */
2071 c1
= c
<< 6; /* byte 3: high 2 bits */
2074 d
= c
| c1
; /* byte 3: low 6 bits */
2075 e
= 0; /* reinitialize mechanism */
2078 if (e
== 1) break; /* done if first BASE64 state */
2079 if (!e1
) { /* first byte of UCS-2 character */
2080 uc
= (d
& 0xff) << 8; /* note first byte */
2081 e1
= T
; /* enter second UCS-2 state */
2084 c
= uc
| (d
& 0xff); /* build UCS-2 character */
2085 e1
= NIL
; /* back to first UCS-2 state, drop in */
2086 /* surrogate pair? */
2087 if ((c
>= UTF16_SURR
) && (c
<= UTF16_MAXSURR
)) {
2088 /* save high surrogate for later */
2089 if (c
< UTF16_SURRL
) surrh
= c
;
2090 else c
= UTF16_BASE
+ ((surrh
& UTF16_MASK
) << UTF16_SHIFT
) +
2092 break; /* either way with surrogates, we're done */
2094 case U7_ASCII
: /* just install if ASCII */
2095 /* convert if second pass */
2096 if (pass
) UTF8_WRITE_BMP (s
,c
,cv
,de
)
2097 else UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
);
2100 if (!pass
) (s
= ret
->data
= (unsigned char *)
2101 fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
2102 else if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2103 fatal ("UTF-7 to UTF-8 botch");
2108 /* Convert UTF-8 sized text to UTF-8
2109 * Accepts: source sized text
2110 * pointer to returned sized text
2111 * canonicalization function
2114 void utf8_text_utf8 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
2117 unsigned char *s
,*t
;
2118 for (ret
->size
= 0, t
= text
->data
, i
= text
->size
; i
;) {
2119 if ((c
= utf8_get (&t
,&i
)) & U8G_ERROR
) {
2120 ret
->data
= text
->data
; /* conversion failed */
2121 ret
->size
= text
->size
;
2124 UTF8_COUNT (ret
->size
,c
,cv
,de
)
2126 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] =NIL
;
2127 for (t
= text
->data
, i
= text
->size
; i
;) {
2128 c
= utf8_get (&t
,&i
);
2129 UTF8_WRITE (s
,c
,cv
,de
) /* convert UCS-4 to UTF-8 */
2131 if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2132 fatal ("UTF-8 to UTF-8 botch");
2135 /* Convert UCS-2 sized text to UTF-8
2136 * Accepts: source sized text
2137 * pointer to returned sized text
2138 * canonicalization function
2141 void utf8_text_ucs2 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
2144 unsigned char *s
,*t
;
2146 for (ret
->size
= 0, t
= text
->data
, i
= text
->size
/ 2; i
; --i
) {
2149 UTF8_COUNT_BMP (ret
->size
,c
,cv
,de
);
2151 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
2152 for (t
= text
->data
, i
= text
->size
/ 2; i
; --i
) {
2155 UTF8_WRITE_BMP (s
,c
,cv
,de
) /* convert UCS-2 to UTF-8 */
2157 if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2158 fatal ("UCS-2 to UTF-8 botch");
2162 /* Convert UCS-4 sized text to UTF-8
2163 * Accepts: source sized text
2164 * pointer to returned sized text
2165 * canonicalization function
2168 void utf8_text_ucs4 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
2171 unsigned char *s
,*t
;
2173 for (ret
->size
= 0, t
= text
->data
, i
= text
->size
/ 4; i
; --i
) {
2174 c
= *t
++ << 24; c
|= *t
++ << 16; c
|= *t
++ << 8; c
|= *t
++;
2175 UTF8_COUNT (ret
->size
,c
,cv
,de
);
2177 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
2178 for (t
= text
->data
, i
= text
->size
/ 2; i
; --i
) {
2179 c
= *t
++ << 24; c
|= *t
++ << 16; c
|= *t
++ << 8; c
|= *t
++;
2180 UTF8_WRITE (s
,c
,cv
,de
) /* convert UCS-4 to UTF-8 */
2182 if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2183 fatal ("UCS-4 to UTF-8 botch");
2186 /* Convert UTF-16 sized text to UTF-8
2187 * Accepts: source sized text
2188 * pointer to returned sized text
2189 * canonicalization function
2192 void utf8_text_utf16 (SIZEDTEXT
*text
,SIZEDTEXT
*ret
,ucs4cn_t cv
,ucs4de_t de
)
2195 unsigned char *s
,*t
;
2197 for (ret
->size
= 0, t
= text
->data
, i
= text
->size
/ 2; i
; --i
) {
2200 /* possible surrogate? */
2201 if ((c
>= UTF16_SURR
) && (c
<= UTF16_MAXSURR
)) {
2202 /* invalid first surrogate */
2203 if ((c
> UTF16_SURRHEND
) || !i
) c
= UBOGON
;
2204 else { /* get second surrogate */
2207 --i
; /* swallowed another 16-bits */
2208 /* invalid second surrogate */
2209 if ((d
< UTF16_SURRL
) || (d
> UTF16_SURRLEND
)) c
= UBOGON
;
2210 else c
= UTF16_BASE
+ ((c
& UTF16_MASK
) << UTF16_SHIFT
) +
2214 UTF8_COUNT (ret
->size
,c
,cv
,de
);
2216 (s
= ret
->data
= (unsigned char *) fs_get (ret
->size
+ 1))[ret
->size
] = NIL
;
2217 for (t
= text
->data
, i
= text
->size
/ 2; i
; --i
) {
2220 /* possible surrogate? */
2221 if ((c
>= UTF16_SURR
) && (c
<= UTF16_MAXSURR
)) {
2222 /* invalid first surrogate */
2223 if ((c
> UTF16_SURRHEND
) || !i
) c
= UBOGON
;
2224 else { /* get second surrogate */
2227 --i
; /* swallowed another 16-bits */
2228 /* invalid second surrogate */
2229 if ((d
< UTF16_SURRL
) || (d
> UTF16_SURRLEND
)) c
= UBOGON
;
2230 else c
= UTF16_BASE
+ ((c
& UTF16_MASK
) << UTF16_SHIFT
) +
2234 UTF8_WRITE (s
,c
,cv
,de
) /* convert UCS-4 to UTF-8 */
2236 if (((unsigned long) (s
- ret
->data
)) != ret
->size
)
2237 fatal ("UTF-16 to UTF-8 botch");
/* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: character
 * Returns: size (0 means bogon)
 *
 * Use UTF8_SIZE macro if known to be in the BMP
 *
 * Each additional octet extends the range that can be represented; values
 * of 0x80000000 and above fit none of the six forms and yield 0.
 */

unsigned long utf8_size (unsigned long c)
{
  if (c < 0x80) return 1;		/* 7 bits */
  if (c < 0x800) return 2;		/* 11 bits */
  if (c < 0x10000) return 3;		/* 16 bits */
  if (c < 0x200000) return 4;		/* 21 bits */
  if (c < 0x4000000) return 5;		/* 26 bits */
  if (c < 0x80000000) return 6;		/* 31 bits */
  return 0;				/* bogon: not representable */
}
/* Put UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: destination string pointer
 *	    character
 * Returns: updated destination pointer
 *
 * Use UTF8_PUT_BMP macro if known to be in the BMP
 *
 * The caller must provide room for utf8_size (c) octets.  If utf8_size()
 * reports 0 (bogon) nothing is written and the pointer is returned as is.
 */

unsigned char *utf8_put (unsigned char *s,unsigned long c)
{
				/* lead-octet marker, indexed by size - 1 */
  static const unsigned char mark[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  unsigned long size = utf8_size (c);
  unsigned long k;
  if (!size) return s;		/* not representable, emit nothing */
				/* continuation octets, last one first: each
				 * takes the low 6 bits still left in c
				 */
  for (k = size - 1; k; --k) {
    s[k] = 0x80 | (unsigned char) (c & 0x3f);
    c >>= 6;
  }
				/* what remains goes under the lead marker */
  *s = mark[size-1] | (unsigned char) (c & 0x7f);
  return s + size;
}
2294 /* Return title case of a fixed-width UCS-4 character
2295 * Accepts: character
2296 * Returns: title case of character
2299 unsigned long ucs4_titlecase (unsigned long c
)
2301 if (c
<= UCS4_TMAPMAX
) return ucs4_tmaptab
[c
];
2302 if (c
< UCS4_TMAPHIMIN
) return c
;
2303 if (c
<= UCS4_TMAPHIMAX
) return c
- UCS4_TMAPHIMAP
;
2304 if (c
< UCS4_TMAPDESERETMIN
) return c
;
2305 if (c
<= UCS4_TMAPDESERETMAX
) return c
- UCS4_TMAPDESERETMAP
;
2310 /* Return width of a fixed-width UCS-4 character in planes 0-2
2311 * Accepts: character
2312 * Returns: width (0, 1, 2) or negative error condition if not valid
2315 long ucs4_width (unsigned long c
)
2318 /* out of range, not-a-char, or surrogates */
2319 if ((c
> UCS4_MAXUNICODE
) || ((c
& 0xfffe) == 0xfffe) ||
2320 ((c
>= UTF16_SURR
) && (c
<= UTF16_MAXSURR
))) ret
= U4W_NOTUNCD
;
2322 else if (c
>= UCS4_PVTBASE
) ret
= U4W_PRIVATE
;
2323 /* SSP are not printing characters */
2324 else if (c
>= UCS4_SSPBASE
) ret
= U4W_SSPCHAR
;
2325 /* unassigned planes */
2326 else if (c
>= UCS4_UNABASE
) ret
= U4W_UNASSGN
;
2327 /* SIP and reserved plane 3 are wide */
2328 else if (c
>= UCS4_SIPBASE
) ret
= 2;
2329 #if (UCS4_WIDLEN != UCS4_SIPBASE)
2330 #error "UCS4_WIDLEN != UCS4_SIPBASE"
2332 /* C0/C1 controls */
2333 else if ((c
<= UCS2_C0CONTROLEND
) ||
2334 ((c
>= UCS2_C1CONTROL
) && (c
<= UCS2_C1CONTROLEND
)))
2336 /* BMP and SMP get value from table */
2337 else switch (ret
= (ucs4_widthtab
[(c
>> 2)] >> ((3 - (c
& 0x3)) << 1)) &0x3){
2338 case 0: /* zero-width */
2339 if (c
== 0x00ad) ret
= 1; /* force U+00ad (SOFT HYPHEN) to width 1 */
2340 case 1: /* single-width */
2341 case 2: /* double-width */
2343 case 3: /* ambiguous width */
2344 ret
= (c
>= 0x2100) ? 2 : 1;/* need to do something better than this */
2350 /* Return screen width of UTF-8 string
2352 * Returns: width or negative if not valid UTF-8
2355 long utf8_strwidth (unsigned char *s
)
2357 unsigned long c
,i
,ret
;
2358 /* go through string */
2359 for (ret
= 0; *s
; ret
+= ucs4_width (c
)) {
2360 /* It's alright to give a fake value for the byte count to utf8_get()
2361 * since the null of a null-terminated string will stop processing anyway.
2363 i
= 6; /* fake value */
2364 if ((c
= utf8_get (&s
,&i
)) & U8G_ERROR
) return -1;
2370 /* Return screen width of UTF-8 text
2371 * Accepts: SIZEDTEXT to string
2372 * Returns: width or negative if not valid UTF-8
2375 long utf8_textwidth (SIZEDTEXT
*utf8
)
2378 unsigned char *s
= utf8
->data
;
2379 unsigned long i
= utf8
->size
;
2380 unsigned long ret
= 0;
2381 while (i
) { /* while there's a string to process */
2382 if ((c
= utf8_get (&s
,&i
)) & U8G_ERROR
) return -1;
2383 ret
+= ucs4_width (c
);
/* Decomposition (phew!) */

#define MORESINGLE 1		/* single UCS-4 tail value */
#define MOREMULTIPLE 2		/* multiple UCS-2 tail values */

/* Pending remainder of one decomposition; type selects the data member */

struct decomposemore {
  short type;			/* type of more */
  union {
    unsigned long single;	/* single decomposed value */
    struct {			/* multiple BMP values */
      unsigned short *next;	/* next value to hand out */
      unsigned long count;	/* number of values still to hand out */
    } multiple;
  } data;
};

/* Linked list of pending decompositions used by recursive decomposition */

#define RECURSIVEMORE struct recursivemore

RECURSIVEMORE {
  struct decomposemore *more;	/* pending remainder at this level */
  RECURSIVEMORE *next;		/* next pending level */
};
2412 /* Return decomposition of a UCS-4 character
2413 * Accepts: character or U8G_ERROR to return next from "more"
2414 * pointer to returned more
2415 * Returns: [next] decomposed value, more set if still more decomposition
2418 unsigned long ucs4_decompose (unsigned long c
,void **more
)
2420 unsigned long i
,ix
,ret
;
2421 struct decomposemore
*m
;
2422 if (c
& U8G_ERROR
) { /* want to chase more? */
2423 /* do sanity check */
2424 if (m
= (struct decomposemore
*) *more
) switch (m
->type
) {
2425 case MORESINGLE
: /* single value */
2426 ret
= m
->data
.single
;
2427 fs_give (more
); /* no more decomposition */
2429 case MOREMULTIPLE
: /* multiple value */
2430 ret
= *m
->data
.multiple
.next
++;
2431 if (!--m
->data
.multiple
.count
) fs_give (more
);
2433 default: /* uh-oh */
2434 fatal ("invalid more block argument to ucs4_decompose!");
2436 else fatal ("no more block provided to ucs4_decompose!");
2439 else { /* start decomposition */
2440 *more
= NIL
; /* initially set no more */
2441 /* BMP low decompositions */
2442 if (c
< UCS4_BMPLOMIN
) ret
= c
;
2443 /* fix this someday */
2444 else if (c
== UCS4_BMPLOMIN
) ret
= ucs4_dbmplotab
[0];
2445 else if (c
<= UCS4_BMPLOMAX
) {
2446 /* within range - have a decomposition? */
2447 if (i
= ucs4_dbmploixtab
[c
- UCS4_BMPLOMIN
]) {
2448 /* get first value of decomposition */
2449 ret
= ucs4_dbmplotab
[ix
= i
& UCS4_BMPLOIXMASK
];
2450 /* has continuation? */
2451 if (i
& UCS4_BMPLOSIZEMASK
) {
2452 m
= (struct decomposemore
*)
2453 (*more
= memset (fs_get (sizeof (struct decomposemore
)),0,
2454 sizeof (struct decomposemore
)));
2455 m
->type
= MOREMULTIPLE
;
2456 m
->data
.multiple
.next
= &ucs4_dbmplotab
[++ix
];
2457 m
->data
.multiple
.count
= i
>> UCS4_BMPLOSIZESHIFT
;
2460 else ret
= c
; /* in range but doesn't decompose */
2462 /* BMP CJK compatibility */
2463 else if (c
< UCS4_BMPCJKMIN
) ret
= c
;
2464 else if (c
<= UCS4_BMPCJKMAX
) {
2465 if (!(ret
= ucs4_bmpcjk1decomptab
[c
- UCS4_BMPCJKMIN
])) ret
= c
;
2467 /* BMP CJK compatibility - some not in BMP */
2468 #if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
2469 else if (c
< UCS4_BMPCJK2MIN
) ret
= c
;
2471 else if (c
<= UCS4_BMPCJK2MAX
)
2472 ret
= ucs4_bmpcjk2decomptab
[c
- UCS4_BMPCJK2MIN
];
2473 /* BMP high decompositions */
2474 else if (c
< UCS4_BMPHIMIN
) ret
= c
;
2475 else if (c
<= UCS4_BMPHIMAX
) {
2476 /* within range - have a decomposition? */
2477 if (i
= ucs4_dbmphiixtab
[c
- UCS4_BMPHIMIN
]) {
2478 /* get first value of decomposition */
2479 ret
= ucs4_dbmphitab
[ix
= i
& UCS4_BMPHIIXMASK
];
2480 /* has continuation? */
2481 if (i
& UCS4_BMPHISIZEMASK
) {
2482 m
= (struct decomposemore
*)
2483 (*more
= memset (fs_get (sizeof (struct decomposemore
)),0,
2484 sizeof (struct decomposemore
)));
2485 m
->type
= MOREMULTIPLE
;
2486 m
->data
.multiple
.next
= &ucs4_dbmphitab
[++ix
];
2487 m
->data
.multiple
.count
= i
>> UCS4_BMPHISIZESHIFT
;
2490 else ret
= c
; /* in range but doesn't decompose */
2493 /* BMP half and full width forms */
2494 else if (c
< UCS4_BMPHALFFULLMIN
) ret
= c
;
2495 else if (c
<= UCS4_BMPHALFFULLMAX
) {
2496 if (!(ret
= ucs4_bmphalffulldecomptab
[c
- UCS4_BMPHALFFULLMIN
])) ret
= c
;
2499 else if (c
< UCS4_SMPMUSIC1MIN
) ret
= c
;
2500 else if (c
<= UCS4_SMPMUSIC1MAX
) {
2501 ret
= ucs4_smpmusic1decomptab
[c
-= UCS4_SMPMUSIC1MIN
][0];
2502 m
= (struct decomposemore
*)
2503 (*more
= memset (fs_get (sizeof (struct decomposemore
)),0,
2504 sizeof (struct decomposemore
)));
2505 m
->type
= MORESINGLE
;
2506 m
->data
.single
= ucs4_smpmusic1decomptab
[c
][1];
2508 else if (c
< UCS4_SMPMUSIC2MIN
) ret
= c
;
2509 else if (c
<= UCS4_SMPMUSIC2MAX
) {
2510 ret
= ucs4_smpmusic2decomptab
[c
-= UCS4_SMPMUSIC2MIN
][0];
2511 m
= (struct decomposemore
*)
2512 (*more
= memset (fs_get (sizeof (struct decomposemore
)),0,
2513 sizeof (struct decomposemore
)));
2514 m
->type
= MORESINGLE
;
2515 m
->data
.single
= ucs4_smpmusic2decomptab
[c
][1];
2517 /* SMP mathematical forms */
2518 else if (c
< UCS4_SMPMATHMIN
) ret
= c
;
2519 else if (c
<= UCS4_SMPMATHMAX
) {
2520 if (!(ret
= ucs4_smpmathdecomptab
[c
- UCS4_SMPMATHMIN
])) ret
= c
;
2522 /* CJK compatibility ideographs in SIP */
2523 else if (!(ret
= ((c
>= UCS4_SIPMIN
) && (c
<= UCS4_SIPMAX
)) ?
2524 ucs4_sipdecomptab
[c
- UCS4_SIPMIN
] : c
)) ret
= c
;
2529 /* Return recursive decomposition of a UCS-4 character
2530 * Accepts: character or U8G_ERROR to return next from "more"
2531 * pointer to returned more
2532 * Returns: [next] decomposed value, more set if still more decomposition
2535 unsigned long ucs4_decompose_recursive (unsigned long c
,void **more
)
2540 if (c
& U8G_ERROR
) { /* want to chase more? */
2542 if (mr
= (RECURSIVEMORE
*) *more
) switch (mr
->more
->type
) {
2543 case MORESINGLE
: /* decompose single value */
2544 c
= ucs4_decompose_recursive (mr
->more
->data
.single
,&mn
);
2545 *more
= mr
->next
; /* done with this more, remove it */
2546 fs_give ((void **) &mr
->more
);
2547 fs_give ((void **) &mr
);
2549 case MOREMULTIPLE
: /* decompose current value in multiple */
2550 c
= ucs4_decompose_recursive (*mr
->more
->data
.multiple
.next
++,&mn
);
2551 /* if done with this multiple decomposition */
2552 if (!--mr
->more
->data
.multiple
.count
) {
2553 *more
= mr
->next
; /* done with this more, remove it */
2554 fs_give ((void **) &mr
->more
);
2555 fs_give ((void **) &mr
);
2558 default: /* uh-oh */
2559 fatal ("invalid more block argument to ucs4_decompose_recursive!");
2561 else fatal ("no more block provided to ucs4_decompose_recursive!");
2562 if (mr
= mn
) { /* did this value recurse on us? */
2563 mr
->next
= *more
; /* yes, insert new more at head */
2567 else { /* start decomposition */
2568 *more
= NIL
; /* initially set no more */
2570 do { /* repeatedly decompose this codepoint */
2571 c
= ucs4_decompose (c1
= c
,&m
);
2572 if (m
) { /* multi-byte decomposition */
2573 if (c1
== c
) fatal ("endless multiple decomposition!");
2574 /* create a block to stash this more */
2575 mr
= memset (fs_get (sizeof (RECURSIVEMORE
)),0,sizeof (RECURSIVEMORE
));
2576 mr
->more
= m
; /* note the expansion */
2577 mr
->next
= *more
; /* old list is the tail */
2578 *more
= mr
; /* and this is the new head */
2580 } while (c1
!= c
); /* until nothing more to decompose */