* new version 2.19.9999
[alpine.git] / imap / src / c-client / utf8.c
blob3f75ed98de3613fa14de6285a36a75b07abc8f32
1 /* ========================================================================
2 * Copyright 2008-2010 Mark Crispin
3 * ========================================================================
4 */
6 /*
7 * Program: UTF-8 routines
9 * Author: Mark Crispin
11 * Date: 11 June 1997
12 * Last Edited: 28 May 2010
14 * Previous versions of this file were
16 * Copyright 1988-2008 University of Washington
18 * Licensed under the Apache License, Version 2.0 (the "License");
19 * you may not use this file except in compliance with the License.
20 * You may obtain a copy of the License at
22 * http://www.apache.org/licenses/LICENSE-2.0
27 #include <stdio.h>
28 #include <ctype.h>
29 #include "c-client.h"
31 /* *** IMPORTANT ***
33 * There is a very important difference between "character set" and "charset",
34 * and the comments in this file reflect these differences. A "character set"
35 * (also known as "coded character set") is a mapping between codepoints and
36 * characters. A "charset" is as defined in MIME, and incorporates one or more
37 * coded character sets in a character encoding scheme. See RFC 2130 for more
38 * details.
42 /* Character set conversion tables */
44 #include "iso_8859.c" /* 8-bit single-byte coded graphic */
45 #include "koi8_r.c" /* Cyrillic - Russia */
46 #include "koi8_u.c" /* Cyrillic - Ukraine */
47 #include "tis_620.c" /* Thai */
48 #include "viscii.c" /* Vietnamese */
49 #include "windows.c" /* Windows */
50 #include "ibm.c" /* IBM */
51 #include "gb_2312.c" /* Chinese (PRC) - simplified */
52 #include "gb_12345.c" /* Chinese (PRC) - traditional */
53 #include "jis_0208.c" /* Japanese - basic */
54 #include "jis_0212.c" /* Japanese - supplementary */
55 #include "ksc_5601.c" /* Korean */
56 #include "big5.c" /* Taiwanese (ROC) - industrial standard */
57 #include "cns11643.c" /* Taiwanese (ROC) - national standard */
60 #include "widths.c" /* Unicode character widths */
61 #include "tmap.c" /* Unicode titlecase mapping */
62 #include "decomtab.c" /* Unicode decomposions */
/* EUC parameters
 *
 * Each initializer supplies five values: base ku, base ten, max ku, max ten
 * and a pointer to the conversion table.  NOTE(review): the field order is
 * taken from how utf8_rmap_gen() reads base_ku/base_ten/max_ku/max_ten/tab;
 * confirm against the struct utf8_eucparam declaration.
 */

#ifdef GBTOUNICODE              /* PRC simplified Chinese */
static const struct utf8_eucparam gb_param = {
  BASE_GB2312_KU,BASE_GB2312_TEN,MAX_GB2312_KU,MAX_GB2312_TEN,
  (void *) gb2312tab};
#endif


#ifdef GB12345TOUNICODE         /* PRC traditional Chinese */
static const struct utf8_eucparam gbt_param = {
  BASE_GB12345_KU,BASE_GB12345_TEN,MAX_GB12345_KU,MAX_GB12345_TEN,
  (void *) gb12345tab};
#endif


#ifdef BIG5TOUNICODE            /* ROC traditional Chinese */
                                /* two entries: plane 1 and plane 2 ten ranges */
static const struct utf8_eucparam big5_param[] = {
  {BASE_BIG5_KU,BASE_BIG5_TEN_0,MAX_BIG5_KU,MAX_BIG5_TEN_0,(void *) big5tab},
  {BASE_BIG5_KU,BASE_BIG5_TEN_1,MAX_BIG5_KU,MAX_BIG5_TEN_1,NIL}
};
#endif


#ifdef JISTOUNICODE             /* Japanese */
static const struct utf8_eucparam jis_param[] = {
  {BASE_JIS0208_KU,BASE_JIS0208_TEN,MAX_JIS0208_KU,MAX_JIS0208_TEN,
   (void *) jis0208tab},
  {MIN_KANA_8,0,MAX_KANA_8,0,(void *) KANA_8},
#ifdef JIS0212TOUNICODE         /* Japanese extended */
  {BASE_JIS0212_KU,BASE_JIS0212_TEN,MAX_JIS0212_KU,MAX_JIS0212_TEN,
   (void *) jis0212tab}
#else
  {0,0,0,0,NIL}
#endif
};
#endif


#ifdef KSCTOUNICODE             /* Korean */
static const struct utf8_eucparam ksc_param = {
  BASE_KSC5601_KU,BASE_KSC5601_TEN,MAX_KSC5601_KU,MAX_KSC5601_TEN,
  (void *) ksc5601tab};
#endif
109 /* List of supported charsets */
111 static const CHARSET utf8_csvalid[] = {
112 {"US-ASCII",CT_ASCII,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
113 NIL,NIL,NIL},
114 {"UTF-8",CT_UTF8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
115 NIL,SC_UNICODE,NIL},
116 {"UTF-7",CT_UTF7,CF_PRIMARY | CF_POSTING | CF_UNSUPRT,
117 NIL,SC_UNICODE,"UTF-8"},
118 {"ISO-8859-1",CT_1BYTE0,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
119 NIL,SC_LATIN_1,NIL},
120 {"ISO-8859-2",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
121 (void *) iso8859_2tab,SC_LATIN_2,NIL},
122 {"ISO-8859-3",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
123 (void *) iso8859_3tab,SC_LATIN_3,NIL},
124 {"ISO-8859-4",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
125 (void *) iso8859_4tab,SC_LATIN_4,NIL},
126 {"ISO-8859-5",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
127 (void *) iso8859_5tab,SC_CYRILLIC,"KOI8-R"},
128 {"ISO-8859-6",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
129 (void *) iso8859_6tab,SC_ARABIC,NIL},
130 {"ISO-8859-7",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
131 (void *) iso8859_7tab,SC_GREEK,NIL},
132 {"ISO-8859-8",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
133 (void *) iso8859_8tab,SC_HEBREW,NIL},
134 {"ISO-8859-9",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
135 (void *) iso8859_9tab,SC_LATIN_5,NIL},
136 {"ISO-8859-10",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
137 (void *) iso8859_10tab,SC_LATIN_6,NIL},
138 {"ISO-8859-11",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
139 (void *) iso8859_11tab,SC_THAI,NIL},
140 #if 0 /* ISO 8859-12 reserved for ISCII(?) */
141 {"ISO-8859-12",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
142 (void *) iso8859_12tab,NIL,NIL},
143 #endif
144 {"ISO-8859-13",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
145 (void *) iso8859_13tab,SC_LATIN_7,NIL},
146 {"ISO-8859-14",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
147 (void *) iso8859_14tab,SC_LATIN_8,NIL},
148 {"ISO-8859-15",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
149 (void *) iso8859_15tab,SC_LATIN_9,NIL},
150 {"ISO-8859-16",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
151 (void *) iso8859_16tab,SC_LATIN_10,NIL},
152 {"KOI8-R",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
153 (void *) koi8rtab,SC_CYRILLIC,NIL},
154 {"KOI8-U",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
155 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,NIL},
156 {"KOI8-RU",CT_1BYTE,CF_DISPLAY,
157 (void *) koi8utab,SC_CYRILLIC | SC_UKRANIAN,"KOI8-U"},
158 {"TIS-620",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
159 (void *) tis620tab,SC_THAI,"ISO-8859-11"},
160 {"VISCII",CT_1BYTE8,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
161 (void *) visciitab,SC_VIETNAMESE,NIL},
163 #ifdef GBTOUNICODE
164 {"GBK",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
165 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,NIL},
166 {"GB2312",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
167 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
168 {"CN-GB",CT_DBYTE,CF_DISPLAY,
169 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
170 #ifdef CNS1TOUNICODE
171 {"ISO-2022-CN",CT_2022,CF_PRIMARY | CF_UNSUPRT,
172 NIL,SC_CHINESE_SIMPLIFIED | SC_CHINESE_TRADITIONAL,
173 NIL},
174 #endif
175 #endif
176 #ifdef GB12345TOUNICODE
177 {"CN-GB-12345",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
178 (void *) &gbt_param,SC_CHINESE_TRADITIONAL,"BIG5"},
179 #endif
180 #ifdef BIG5TOUNICODE
181 {"BIG5",CT_DBYTE2,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
182 (void *) big5_param,SC_CHINESE_TRADITIONAL,NIL},
183 {"CN-BIG5",CT_DBYTE2,CF_DISPLAY,
184 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
185 {"BIG-5",CT_DBYTE2,CF_DISPLAY,
186 (void *) big5_param,SC_CHINESE_TRADITIONAL,"BIG5"},
187 #endif
188 #ifdef JISTOUNICODE
189 {"ISO-2022-JP",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
190 NIL,SC_JAPANESE,NIL},
191 {"EUC-JP",CT_EUC,CF_PRIMARY | CF_DISPLAY,
192 (void *) jis_param,SC_JAPANESE,"ISO-2022-JP"},
193 {"SHIFT_JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
194 NIL,SC_JAPANESE,"ISO-2022-JP"},
195 {"SHIFT-JIS",CT_SJIS,CF_PRIMARY | CF_DISPLAY,
196 NIL,SC_JAPANESE,"ISO-2022-JP"},
197 #ifdef JIS0212TOUNICODE
198 {"ISO-2022-JP-1",CT_2022,CF_UNSUPRT,
199 NIL,SC_JAPANESE,"ISO-2022-JP"},
200 #ifdef GBTOUNICODE
201 #ifdef KSCTOUNICODE
202 {"ISO-2022-JP-2",CT_2022,CF_UNSUPRT,
203 NIL,
204 SC_LATIN_1 | SC_LATIN_2 | SC_LATIN_3 | SC_LATIN_4 | SC_LATIN_5 |
205 SC_LATIN_6 | SC_LATIN_7 | SC_LATIN_8 | SC_LATIN_9 | SC_LATIN_10 |
206 SC_ARABIC | SC_CYRILLIC | SC_GREEK | SC_HEBREW | SC_THAI |
207 SC_VIETNAMESE | SC_CHINESE_SIMPLIFIED | SC_JAPANESE | SC_KOREAN
208 #ifdef CNS1TOUNICODE
209 | SC_CHINESE_TRADITIONAL
210 #endif
211 ,"UTF-8"},
212 #endif
213 #endif
214 #endif
215 #endif
217 #ifdef KSCTOUNICODE
218 {"ISO-2022-KR",CT_2022,CF_PRIMARY | CF_DISPLAY | CF_UNSUPRT,
219 NIL,SC_KOREAN,"EUC-KR"},
220 {"EUC-KR",CT_DBYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
221 (void *) &ksc_param,SC_KOREAN,NIL},
222 {"KSC5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
223 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
224 {"KSC_5601",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
225 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
226 {"KS_C_5601-1987",CT_DBYTE,CF_DISPLAY,
227 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
228 {"KS_C_5601-1989",CT_DBYTE,CF_DISPLAY,
229 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
230 {"KS_C_5601-1992",CT_DBYTE,CF_DISPLAY,
231 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
232 {"KS_C_5601-1997",CT_DBYTE,CF_DISPLAY,
233 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
234 #endif
236 /* deep sigh */
237 {"WINDOWS-874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
238 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
239 {"CP874",CT_1BYTE,CF_DISPLAY,
240 (void *) windows_874tab,SC_THAI,"ISO-8859-11"},
241 #ifdef GBTOUNICODE
242 {"WINDOWS-936",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
243 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
244 {"CP936",CT_DBYTE,CF_DISPLAY,
245 (void *) &gb_param,SC_CHINESE_SIMPLIFIED,"GBK"},
246 #endif
247 #ifdef KSCTOUNICODE
248 {"WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
249 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
250 {"CP949",CT_DBYTE,CF_DISPLAY,
251 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
252 {"X-WINDOWS-949",CT_DBYTE,CF_PRIMARY | CF_DISPLAY,
253 (void *) &ksc_param,SC_KOREAN,"EUC-KR"},
254 #endif
255 {"WINDOWS-1250",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
256 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
257 {"CP1250",CT_1BYTE,CF_DISPLAY,
258 (void *) windows_1250tab,SC_LATIN_2,"ISO-8859-2"},
259 {"WINDOWS-1251",CT_1BYTE,CF_PRIMARY | CF_DISPLAY | CF_POSTING,
260 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
261 {"CP1251",CT_1BYTE,CF_DISPLAY,
262 (void *) windows_1251tab,SC_CYRILLIC,"KOI8-R"},
263 {"WINDOWS-1252",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
264 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
265 {"CP1252",CT_1BYTE,CF_DISPLAY,
266 (void *) windows_1252tab,SC_LATIN_1,"ISO-8859-1"},
267 {"WINDOWS-1253",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
268 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
269 {"CP1253",CT_1BYTE,CF_DISPLAY,
270 (void *) windows_1253tab,SC_GREEK,"ISO-8859-7"},
271 {"WINDOWS-1254",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
272 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
273 {"CP1254",CT_1BYTE,CF_DISPLAY,
274 (void *) windows_1254tab,SC_LATIN_5,"ISO-8859-9"},
275 {"WINDOWS-1255",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
276 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
277 {"CP1255",CT_1BYTE,CF_DISPLAY,
278 (void *) windows_1255tab,SC_HEBREW,"ISO-8859-8"},
279 {"WINDOWS-1256",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
280 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
281 {"CP1256",CT_1BYTE,CF_DISPLAY,
282 (void *) windows_1256tab,SC_ARABIC,"ISO-8859-6"},
283 {"WINDOWS-1257",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
284 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
285 {"CP1257",CT_1BYTE,CF_DISPLAY,
286 (void *) windows_1257tab,SC_LATIN_7,"ISO-8859-13"},
287 {"WINDOWS-1258",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
288 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
289 {"CP1258",CT_1BYTE,CF_DISPLAY,
290 (void *) windows_1258tab,SC_VIETNAMESE,"VISCII"},
292 /* deeper sigh */
293 {"IBM367",CT_ASCII,CF_PRIMARY | CF_DISPLAY,
294 NIL,NIL,"US-ASCII"},
295 {"IBM437",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
296 (void *) ibm_437tab,SC_LATIN_1,"ISO-8859-1"},
297 {"IBM737",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
298 (void *) ibm_737tab,SC_GREEK,"ISO-8859-7"},
299 {"IBM775",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
300 (void *) ibm_775tab,SC_LATIN_7,"ISO-8859-13"},
301 {"IBM850",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
302 (void *) ibm_850tab,SC_LATIN_1,"ISO-8859-1"},
303 {"IBM852",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
304 (void *) ibm_852tab,SC_LATIN_2,"ISO-8859-2"},
305 {"IBM855",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
306 (void *) ibm_855tab,SC_CYRILLIC,"ISO-8859-5"},
307 {"IBM857",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
308 (void *) ibm_857tab,SC_LATIN_5,"ISO-8859-9"},
309 {"IBM860",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
310 (void *) ibm_860tab,SC_LATIN_1,"ISO-8859-1"},
311 {"IBM861",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
312 (void *) ibm_861tab,SC_LATIN_6,"ISO-8859-10"},
313 {"IBM862",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
314 (void *) ibm_862tab,SC_HEBREW,"ISO-8859-8"},
315 {"IBM863",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
316 (void *) ibm_863tab,SC_LATIN_1,"ISO-8859-1"},
317 {"IBM864",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
318 (void *) ibm_864tab,SC_ARABIC,"ISO-8859-6"},
319 {"IBM865",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
320 (void *) ibm_865tab,SC_LATIN_6,"ISO-8859-10"},
321 {"IBM866",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
322 (void *) ibm_866tab,SC_CYRILLIC,"KOI8-R"},
323 {"IBM869",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
324 (void *) ibm_869tab,SC_GREEK,"ISO-8859-7"},
325 {"IBM874",CT_1BYTE,CF_PRIMARY | CF_DISPLAY,
326 (void *) ibm_874tab,SC_THAI,"ISO-8859-11"},
327 /* deepest sigh */
328 {"ANSI_X3.4-1968",CT_ASCII,CF_DISPLAY,
329 NIL,NIL,"US-ASCII"},
330 {"UNICODE-1-1-UTF-7",CT_UTF7,CF_UNSUPRT,
331 NIL,SC_UNICODE,"UTF-8"},
332 /* these should never appear in email */
333 {"UCS-2",CT_UCS2,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
334 NIL,SC_UNICODE,"UTF-8"},
335 {"UCS-4",CT_UCS4,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
336 NIL,SC_UNICODE,"UTF-8"},
337 {"UTF-16",CT_UTF16,CF_PRIMARY | CF_DISPLAY | CF_NOEMAIL,
338 NIL,SC_UNICODE,"UTF-8"},
342 /* Non-Unicode Script table */
344 static const SCRIPT utf8_scvalid[] = {
345 {"Arabic",NIL,SC_ARABIC},
346 {"Chinese Simplified","China, Singapore",SC_CHINESE_SIMPLIFIED},
347 {"Chinese Traditional","Taiwan, Hong Kong, Macao",SC_CHINESE_TRADITIONAL},
348 {"Cyrillic",NIL,SC_CYRILLIC},
349 {"Cyrillic Ukranian",NIL,SC_UKRANIAN},
350 {"Greek",NIL,SC_GREEK},
351 {"Hebrew",NIL,SC_HEBREW},
352 {"Japanese",NIL,SC_JAPANESE},
353 {"Korean",NIL,SC_KOREAN},
354 {"Latin-1","Western Europe",SC_LATIN_1},
355 {"Latin-2","Eastern Europe",SC_LATIN_2},
356 {"Latin-3","Southern Europe",SC_LATIN_3},
357 {"Latin-4","Northern Europe",SC_LATIN_4},
358 {"Latin-5","Turkish",SC_LATIN_5},
359 {"Latin-6","Nordic",SC_LATIN_6},
360 {"Latin-7","Baltic",SC_LATIN_7},
361 {"Latin-8","Celtic",SC_LATIN_8},
362 {"Latin-9","Euro",SC_LATIN_9},
363 {"Latin-10","Balkan",SC_LATIN_10},
364 {"Thai",NIL,SC_THAI},
365 {"Vietnamese",NIL,SC_VIETNAMESE},
369 /* Look up script name or return entire table
370 * Accepts: script name or NIL
371 * Returns: pointer to script table entry or NIL if unknown
374 SCRIPT *utf8_script (char *script)
376 unsigned long i;
377 if (!script) return (SCRIPT *) &utf8_scvalid[0];
378 else if (*script && (strlen (script) < 128))
379 for (i = 0; utf8_scvalid[i].name; i++)
380 if (!compare_cstring (script,utf8_scvalid[i].name))
381 return (SCRIPT *) &utf8_scvalid[i];
382 return NIL; /* failed */
386 /* Look up charset name or return entire table
387 * Accepts: charset name or NIL
388 * Returns: charset table entry or NIL if unknown
391 const CHARSET *utf8_charset (char *charset)
393 unsigned long i;
394 if (!charset) return (CHARSET *) &utf8_csvalid[0];
395 else if (*charset && (strlen (charset) < 128))
396 for (i = 0; utf8_csvalid[i].name; i++)
397 if (!compare_cstring (charset,utf8_csvalid[i].name))
398 return (CHARSET *) &utf8_csvalid[i];
399 return NIL; /* failed */
402 /* Validate charset and generate error message if invalid
403 * Accepts: bad character set
404 * Returns: NIL if good charset, else error message string
407 #define BADCSS "[BADCHARSET ("
408 #define BADCSE ")] Unknown charset: "
410 char *utf8_badcharset (char *charset)
412 char *msg = NIL;
413 if (!utf8_charset (charset)) {
414 char *s,*t;
415 unsigned long i,j;
416 /* calculate size of header, trailer, and bad
417 * charset plus charset names */
418 for (i = 0, j = sizeof (BADCSS) + sizeof (BADCSE) + strlen (charset) - 2;
419 utf8_csvalid[i].name; i++)
420 j += strlen (utf8_csvalid[i].name) + 1;
421 /* not built right */
422 if (!i) fatal ("No valid charsets!");
423 /* header */
424 for (s = msg = (char *) fs_get (j), t = BADCSS; *t; *s++ = *t++);
425 /* each charset */
426 for (i = 0; utf8_csvalid[i].name; *s++ = ' ', i++)
427 for (t = utf8_csvalid[i].name; *t; *s++ = *t++);
428 /* back over last space, trailer */
429 for (t = BADCSE, --s; *t; *s++ = *t++);
430 /* finally bogus charset */
431 for (t = charset; *t; *s++ = *t++);
432 *s++ = '\0'; /* finally tie off string */
433 if (s != (msg + j)) fatal ("charset msg botch");
435 return msg;
438 /* Convert charset labelled sized text to UTF-8
439 * Accepts: source sized text
440 * charset
441 * pointer to returned sized text if non-NIL
442 * flags
443 * Returns: T if successful, NIL if failure
446 long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags)
448 ucs4cn_t cv = (flags & U8T_CASECANON) ? ucs4_titlecase : NIL;
449 ucs4de_t de = (flags & U8T_DECOMPOSE) ? ucs4_decompose_recursive : NIL;
450 const CHARSET *cs = (charset && *charset) ?
451 utf8_charset (charset) : utf8_infercharset (text);
452 if (cs) return (text && ret) ? utf8_text_cs (text,cs,ret,cv,de) : LONGT;
453 if (ret) { /* no conversion possible */
454 ret->data = text->data; /* so return source */
455 ret->size = text->size;
457 return NIL; /* failure */
/* Operations used in converting data
 *
 * Each macro takes a UCS-4 value c, applies the canonicalization function cv
 * (if non-NIL) and then the decomposition function de (if non-NIL).  The
 * decomposition function receives a "more" pointer; while it leaves that
 * pointer non-NIL the macro calls de again with U8G_ERROR to fetch the next
 * character, stopping when de returns zero.
 *
 * The COUNT forms add the UTF-8 size of each resulting character to count;
 * the WRITE forms store each resulting character through b.  The _BMP forms
 * use the UTF8_SIZE_BMP/UTF8_PUT_BMP macros, the others call utf8_size() and
 * utf8_put().
 *
 * Note: c is modified, and the macros expand to a brace block rather than a
 * single statement.
 */

#define UTF8_COUNT_BMP(count,c,cv,de) {                 \
  void *more = NIL;                                     \
  if (cv) c = (*cv) (c);                                \
  if (de) c = (*de) (c,&more);                          \
  do count += UTF8_SIZE_BMP(c);                         \
  while (more && (c = (*de) (U8G_ERROR,&more)));        \
}

#define UTF8_WRITE_BMP(b,c,cv,de) {                     \
  void *more = NIL;                                     \
  if (cv) c = (*cv) (c);                                \
  if (de) c = (*de) (c,&more);                          \
  do UTF8_PUT_BMP (b,c)                                 \
  while (more && (c = (*de) (U8G_ERROR,&more)));        \
}

#define UTF8_COUNT(count,c,cv,de) {                     \
  void *more = NIL;                                     \
  if (cv) c = (*cv) (c);                                \
  if (de) c = (*de) (c,&more);                          \
  do count += utf8_size (c);                            \
  while (more && (c = (*de) (U8G_ERROR,&more)));        \
}

#define UTF8_WRITE(b,c,cv,de) {                         \
  void *more = NIL;                                     \
  if (cv) c = (*cv) (c);                                \
  if (de) c = (*de) (c,&more);                          \
  do b = utf8_put (b,c);                                \
  while (more && (c = (*de) (U8G_ERROR,&more)));        \
}
495 /* Convert sized text to UTF-8 given CHARSET block
496 * Accepts: source sized text
497 * CHARSET block
498 * pointer to returned sized text
499 * canonicalization function
500 * decomposition function
501 * Returns: T if successful, NIL if failure
504 long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret,
505 ucs4cn_t cv,ucs4de_t de)
507 ret->data = text->data; /* default to source */
508 ret->size = text->size;
509 switch (cs->type) { /* convert if type known */
510 case CT_ASCII: /* 7-bit ASCII no table */
511 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
512 if (cv || de) utf8_text_utf8 (text,ret,cv,de);
513 break;
514 case CT_1BYTE0: /* 1 byte no table */
515 utf8_text_1byte0 (text,ret,cv,de);
516 break;
517 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
518 utf8_text_1byte (text,ret,cs->tab,cv,de);
519 break;
520 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
521 utf8_text_1byte8 (text,ret,cs->tab,cv,de);
522 break;
523 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
524 utf8_text_euc (text,ret,cs->tab,cv,de);
525 break;
526 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
527 utf8_text_dbyte (text,ret,cs->tab,cv,de);
528 break;
529 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
530 utf8_text_dbyte2 (text,ret,cs->tab,cv,de);
531 break;
532 case CT_UTF7: /* variable UTF-7 encoded Unicode no table */
533 utf8_text_utf7 (text,ret,cv,de);
534 break;
535 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
536 utf8_text_ucs2 (text,ret,cv,de);
537 break;
538 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
539 utf8_text_ucs4 (text,ret,cv,de);
540 break;
541 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
542 utf8_text_utf16 (text,ret,cv,de);
543 break;
544 case CT_2022: /* variable ISO-2022 encoded no table*/
545 utf8_text_2022 (text,ret,cv,de);
546 break;
547 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
548 utf8_text_sjis (text,ret,cv,de);
549 break;
550 default: /* unknown character set type */
551 return NIL;
553 return LONGT; /* return success */
/* Reverse mapping routines
 *
 * These routines only support character sets, not all possible charsets.  In
 * particular, they do not support any Unicode encodings or ISO 2022.
 *
 * As a special dispensation, utf8_cstext() and utf8_cstocstext() support
 * ISO-2022-JP if EUC-JP can be reverse mapped; and utf8_rmaptext() will
 * generate ISO-2022-JP using an EUC-JP rmap if flagged to do so.
 *
 * No attempt is made to map "equivalent" Unicode characters or Unicode
 * characters that have the same glyph; nor is there any attempt to handle
 * combining characters or otherwise do any stringprep.  Maybe later.
 */
571 /* Convert UTF-8 sized text to charset
572 * Accepts: source sized text
573 * destination charset
574 * pointer to returned sized text
575 * substitute character if not in cs, else NIL to return failure
576 * Returns: T if successful, NIL if failure
580 long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,
581 unsigned long errch)
583 short iso2022jp = !compare_cstring (charset,"ISO-2022-JP");
584 unsigned short *rmap = utf8_rmap (iso2022jp ? "EUC-JP" : charset);
585 return rmap ? utf8_rmaptext (text,rmap,ret,errch,iso2022jp) : NIL;
588 /* Convert charset labelled sized text to another charset
589 * Accepts: source sized text
590 * source charset
591 * pointer to returned sized text
592 * destination charset
593 * substitute character if not in dest cs, else NIL to return failure
594 * Returns: T if successful, NIL if failure
596 * This routine has the same restricts as utf8_cstext().
599 long utf8_cstocstext (SIZEDTEXT *src,char *sc,SIZEDTEXT *dst,char *dc,
600 unsigned long errch)
602 SIZEDTEXT utf8;
603 const CHARSET *scs,*dcs;
604 unsigned short *rmap;
605 long ret = NIL;
606 long iso2022jp;
607 /* lookup charsets and reverse map */
608 if ((dc && (dcs = utf8_charset (dc))) &&
609 (rmap = (iso2022jp = ((dcs->type == CT_2022) &&
610 !compare_cstring (dcs->name,"ISO-2022-JP"))) ?
611 utf8_rmap ("EUC-JP") : utf8_rmap_cs (dcs)) &&
612 (scs = (sc && *sc) ? utf8_charset (sc) : utf8_infercharset (src))) {
613 /* init temporary buffer */
614 memset (&utf8,NIL,sizeof (SIZEDTEXT));
615 /* source cs equivalent to dest cs? */
616 if ((scs->type == dcs->type) && (scs->tab == dcs->tab)) {
617 dst->data = src->data; /* yes, just copy pointers */
618 dst->size = src->size;
619 ret = LONGT;
621 /* otherwise do the conversion */
622 else ret = (utf8_text_cs (src,scs,&utf8,NIL,NIL) &&
623 utf8_rmaptext (&utf8,rmap,dst,errch,iso2022jp));
624 /* flush temporary buffer */
625 if (utf8.data && (utf8.data != src->data) && (utf8.data != dst->data))
626 fs_give ((void **) &utf8.data);
628 return ret;
631 /* Cached rmap */
633 static const CHARSET *currmapcs = NIL;
634 static unsigned short *currmap = NIL;
637 /* Cache and return map for UTF-8 -> character set
638 * Accepts: character set name
639 * Returns: cached map if character set found, else NIL
642 unsigned short *utf8_rmap (char *charset)
644 return (currmapcs && !compare_cstring (charset,currmapcs->name)) ? currmap :
645 utf8_rmap_cs (utf8_charset (charset));
649 /* Cache and return map for UTF-8 -> character set given CHARSET block
650 * Accepts: CHARSET block
651 * Returns: cached map if character set found, else NIL
654 unsigned short *utf8_rmap_cs (const CHARSET *cs)
656 unsigned short *ret = NIL;
657 if (!cs); /* have charset? */
658 else if (cs == currmapcs) ret = currmap;
659 else if (ret = utf8_rmap_gen (cs,currmap)) {
660 currmapcs = cs;
661 currmap = ret;
663 return ret;
666 /* Return map for UTF-8 -> character set given CHARSET block
667 * Accepts: CHARSET block
668 * old map to recycle
669 * Returns: map if character set found, else NIL
672 unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap)
674 unsigned short u,*tab,*rmap;
675 unsigned int i,m,ku,ten;
676 struct utf8_eucparam *param,*p2;
677 switch (cs->type) { /* is a character set? */
678 case CT_ASCII: /* 7-bit ASCII no table */
679 case CT_1BYTE0: /* 1 byte no table */
680 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
681 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
682 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
683 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
684 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
685 case CT_SJIS: /* 2 byte Shift-JIS */
686 rmap = oldmap ? oldmap : /* recycle old map if supplied else make new */
687 (unsigned short *) fs_get (65536 * sizeof (unsigned short));
688 /* initialize table for ASCII */
689 for (i = 0; i < 128; i++) rmap[i] = (unsigned short) i;
690 /* populate remainder of table with NOCHAR */
691 #define NOCHARBYTE (NOCHAR & 0xff)
692 #if NOCHAR - ((NOCHARBYTE << 8) | NOCHARBYTE)
693 while (i < 65536) rmap[i++] = NOCHAR;
694 #else
695 memset (rmap + 128,NOCHARBYTE,(65536 - 128) * sizeof (unsigned short));
696 #endif
697 break;
698 default: /* unsupported charset type */
699 rmap = NIL; /* no map possible */
701 if (rmap) { /* have a map? */
702 switch (cs->type) { /* additional reverse map actions */
703 case CT_1BYTE0: /* 1 byte no table */
704 for (i = 128; i < 256; i++) rmap[i] = (unsigned short) i;
705 break;
706 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
707 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
708 if (tab[i & BITS7] != UBOGON) rmap[tab[i & BITS7]] = (unsigned short)i;
709 break;
710 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
711 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
712 if (tab[i] != UBOGON) rmap[tab[i]] = (unsigned short) i;
713 break;
714 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
715 for (param = (struct utf8_eucparam *) cs->tab,
716 tab = (unsigned short *) param->tab, ku = 0;
717 ku < param->max_ku; ku++)
718 for (ten = 0; ten < param->max_ten; ten++)
719 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
720 rmap[u] = ((ku + param->base_ku) << 8) +
721 (ten + param->base_ten) + 0x8080;
722 break;
724 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
725 for (param = (struct utf8_eucparam *) cs->tab,
726 tab = (unsigned short *) param->tab, ku = 0;
727 ku < param->max_ku; ku++)
728 for (ten = 0; ten < param->max_ten; ten++)
729 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
730 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
731 break;
732 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
733 param = (struct utf8_eucparam *) cs->tab;
734 p2 = param + 1; /* plane 2 parameters */
735 /* only ten parameters should differ */
736 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
737 fatal ("ku definition error for CT_DBYTE2 charset");
738 /* total codepoints in each ku */
739 m = param->max_ten + p2->max_ten;
740 tab = (unsigned short *) param->tab;
741 for (ku = 0; ku < param->max_ku; ku++) {
742 for (ten = 0; ten < param->max_ten; ten++)
743 if ((u = tab[(ku * m) + ten]) != UBOGON)
744 rmap[u] = ((ku + param->base_ku) << 8) + (ten + param->base_ten);
745 for (ten = 0; ten < p2->max_ten; ten++)
746 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
747 rmap[u] = ((ku + param->base_ku) << 8) + (ten + p2->base_ten);
749 break;
750 case CT_SJIS: /* 2 byte Shift-JIS */
751 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
752 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
753 if ((u = jis0208tab[ku][ten]) != UBOGON) {
754 int sku = ku + BASE_JIS0208_KU;
755 int sten = ten + BASE_JIS0208_TEN;
756 rmap[u] = ((((sku + 1) >> 1) + ((sku < 95) ? 112 : 176)) << 8) +
757 sten + ((sku % 2) ? ((sten > 95) ? 32 : 31) : 126);
759 /* JIS Roman */
760 rmap[UCS2_YEN] = JISROMAN_YEN;
761 rmap[UCS2_OVERLINE] = JISROMAN_OVERLINE;
762 /* JIS hankaku katakana */
763 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
764 rmap[UCS2_KATAKANA + u] = MIN_KANA_8 + u;
765 break;
767 /* hack: map NBSP to SP if otherwise no map */
768 if (rmap[0x00a0] == NOCHAR) rmap[0x00a0] = rmap[0x0020];
770 return rmap; /* return map */
773 /* Convert UTF-8 sized text to charset using rmap
774 * Accepts: source sized text
775 * conversion rmap
776 * pointer to returned sized text
777 * substitute character if not in rmap, else NIL to return failure
778 * ISO-2022-JP conversion flag
779 * Returns T if successful, NIL if failure
781 * This routine doesn't try to convert to all possible charsets; in particular
782 * it doesn't support other Unicode encodings or any ISO 2022 other than
783 * ISO-2022-JP.
786 long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret,
787 unsigned long errch,long iso2022jp)
789 unsigned long i,u,c;
790 /* get size of buffer */
791 if (i = utf8_rmapsize (text,rmap,errch,iso2022jp)) {
792 unsigned char *s = text->data;
793 unsigned char *t = ret->data = (unsigned char *) fs_get (i);
794 ret->size = i - 1; /* number of octets in destination buffer */
795 /* start non-zero ISO-2022-JP state at 1 */
796 if (iso2022jp) iso2022jp = 1;
797 /* convert string, ignore BOM */
798 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
799 /* substitute error character for NOCHAR */
800 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
801 switch (iso2022jp) { /* depends upon ISO 2022 mode */
802 case 0: /* ISO 2022 not in effect */
803 /* two-byte character */
804 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
805 /* single-byte or low-byte of two-byte */
806 *t++ = (unsigned char) (c & 0xff);
807 break;
808 case 1: /* ISO 2022 Roman */
809 /* <ch> */
810 if (c < 0x80) *t++ = (unsigned char) c;
811 else { /* JIS character */
812 *t++ = I2C_ESC; /* ESC $ B <hi> <lo> */
813 *t++ = I2C_MULTI;
814 *t++ = I2CS_94x94_JIS_NEW;
815 *t++ = (unsigned char) (c >> 8) & 0x7f;
816 *t++ = (unsigned char) c & 0x7f;
817 iso2022jp = 2; /* shift to ISO 2022 JIS */
819 break;
820 case 2: /* ISO 2022 JIS */
821 if (c > 0x7f) { /* <hi> <lo> */
822 *t++ = (unsigned char) (c >> 8) & 0x7f;
823 *t++ = (unsigned char) c & 0x7f;
825 else { /* ASCII character */
826 *t++ = I2C_ESC; /* ESC ( J <ch> */
827 *t++ = I2C_G0_94;
828 *t++ = I2CS_94_JIS_ROMAN;
829 *t++ = (unsigned char) c;
830 iso2022jp = 1; /* shift to ISO 2022 Roman */
832 break;
835 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
836 *t++ = I2C_ESC; /* ESC ( J */
837 *t++ = I2C_G0_94;
838 *t++ = I2CS_94_JIS_ROMAN;
840 *t++ = NIL; /* tie off returned data */
841 return LONGT; /* return success */
843 ret->data = NIL;
844 ret->size = 0;
845 return NIL; /* failure */
848 /* Calculate size of convertsion of UTF-8 sized text to charset using rmap
849 * Accepts: source sized text
850 * conversion rmap
851 * pointer to returned sized text
852 * substitute character if not in rmap, else NIL to return failure
853 * ISO-2022-JP conversion flag
854 * Returns size+1 if successful, NIL if failure
856 * This routine doesn't try to handle to all possible charsets; in particular
857 * it doesn't support other Unicode encodings or any ISO 2022 other than
858 * ISO-2022-JP.
861 unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap,
862 unsigned long errch,long iso2022jp)
864 unsigned long i,u,c;
865 unsigned long ret = 1; /* terminating NUL */
866 unsigned char *s = text->data;
867 if (iso2022jp) iso2022jp = 1; /* start non-zero ISO-2022-JP state at 1 */
868 for (i = text->size; i;) if ((u = utf8_get (&s,&i)) != UCS2_BOM) {
869 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
870 return NIL; /* not in BMP, or NOCHAR and no err char */
871 switch (iso2022jp) { /* depends upon ISO 2022 mode */
872 case 0: /* ISO 2022 not in effect */
873 ret += (c > 0xff) ? 2 : 1;
874 break;
875 case 1: /* ISO 2022 Roman */
876 if (c < 0x80) ret += 1; /* <ch> */
877 else { /* JIS character */
878 ret += 5; /* ESC $ B <hi> <lo> */
879 iso2022jp = 2; /* shift to ISO 2022 JIS */
881 break;
882 case 2: /* ISO 2022 JIS */
883 if (c > 0x7f) ret += 2; /* <hi> <lo> */
884 else { /* ASCII character */
885 ret += 4; /* ESC ( J <ch> */
886 iso2022jp = 1; /* shift to ISO 2022 Roman */
888 break;
891 if (iso2022jp == 2) { /* ISO-2022-JP string must end in Roman */
892 ret += 3; /* ESC ( J */
893 iso2022jp = 1; /* reset state to Roman */
895 return ret;
898 /* Convert UCS-4 to charset using rmap
899 * Accepts: source UCS-4 character(s)
900 * numver of UCS-4 characters
901 * conversion rmap
902 * pointer to returned sized text
903 * substitute character if not in rmap, else NIL to return failure
904 * Returns T if successful, NIL if failure
906 * Currently only supports BMP characters, and does not support ISO-2022
909 long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
910 SIZEDTEXT *ret,unsigned long errch)
912 long size = ucs4_rmaplen (ucs4,len,rmap,errch);
913 return (size >= 0) ? /* build in newly-created buffer */
914 ucs4_rmapbuf (ret->data = (unsigned char *) fs_get ((ret->size = size) +1),
915 ucs4,len,rmap,errch) : NIL;
918 /* Return size of UCS-4 string converted to other CS via rmap
919 * Accepts: source UCS-4 character(s)
920 * numver of UCS-4 characters
921 * conversion rmap
922 * substitute character if not in rmap, else NIL to return failure
923 * Returns: length if success, negative if failure (no-convert)
926 long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap,
927 unsigned long errch)
929 long ret;
930 unsigned long i,u,c;
931 /* count non-BOM characters */
932 for (ret = 0,i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
933 if ((u & U8GM_NONBMP) || (((c = rmap[u]) == NOCHAR) && !(c = errch)))
934 return -1; /* not in BMP, or NOCHAR and no err char? */
935 ret += (c > 0xff) ? 2 : 1;
937 return ret;
941 /* Stuff buffer with UCS-4 string converted to other CS via rmap
942 * Accepts: destination buffer
943 * source UCS-4 character(s)
944 * number of UCS-4 characters
945 * conversion rmap
946 * substitute character if not in rmap, else NIL to return failure
947 * Returns: T, always
950 long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len,
951 unsigned short *rmap,unsigned long errch)
953 unsigned long i,u,c;
954 /* convert non-BOM characters */
955 for (i = 0; i < len; ++i) if ((u = ucs4[i]) != UCS2_BOM) {
956 /* substitute error character for NOCHAR */
957 if ((u & U8GM_NONBMP) || ((c = rmap[u]) == NOCHAR)) c = errch;
958 /* two-byte character? */
959 if (c > 0xff) *t++ = (unsigned char) (c >> 8);
960 /* single-byte or low-byte of two-byte */
961 *t++ = (unsigned char) (c & 0xff);
963 *t++ = NIL; /* tie off returned data */
964 return LONGT;
967 /* Return UCS-4 Unicode character from UTF-8 string
968 * Accepts: pointer to string
969 * remaining octets in string
970 * Returns: UCS-4 character with pointer and count updated
971 * or error code with pointer and count unchanged
974 unsigned long utf8_get (unsigned char **s,unsigned long *i)
976 unsigned char *t = *s;
977 unsigned long j = *i;
978 /* decode raw UTF-8 string */
979 unsigned long ret = utf8_get_raw (&t,&j);
980 if (ret & U8G_ERROR); /* invalid raw UTF-8 decoding? */
981 /* no, is it surrogate? */
982 else if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) ret = U8G_SURROGA;
983 /* or in non-Unicode ISO 10646 space? */
984 else if (ret > UCS4_MAXUNICODE) ret = U8G_NOTUNIC;
985 else {
986 *s = t; /* all is well, update pointer */
987 *i = j; /* and counter */
989 return ret; /* return value */
992 /* Return raw (including non-Unicode) UCS-4 character from UTF-8 string
993 * Accepts: pointer to string
994 * remaining octets in string
995 * Returns: UCS-4 character with pointer and count updated
996 * or error code with pointer and count unchanged
999 unsigned long utf8_get_raw (unsigned char **s,unsigned long *i)
1001 unsigned char c,c1;
1002 unsigned char *t = *s;
1003 unsigned long j = *i;
1004 unsigned long ret = U8G_NOTUTF8;
1005 int more = 0;
1006 do { /* make sure have source octets available */
1007 if (!j--) return more ? U8G_ENDSTRI : U8G_ENDSTRG;
1008 /* UTF-8 continuation? */
1009 else if (((c = *t++) > 0x7f) && (c < 0xc0)) {
1010 /* continuation when not in progress */
1011 if (!more) return U8G_BADCONT;
1012 --more; /* found a continuation octet */
1013 ret <<= 6; /* shift current value by 6 bits */
1014 ret |= c & 0x3f; /* merge continuation octet */
1016 /* incomplete UTF-8 character */
1017 else if (more) return U8G_INCMPLT;
1018 else { /* start of sequence */
1019 c1 = j ? *t : 0xbf; /* assume valid continuation if incomplete */
1020 if (c < 0x80) ret = c; /* U+0000 - U+007f */
1021 else if (c < 0xc2); /* c0 and c1 never valid */
1022 else if (c < 0xe0) { /* U+0080 - U+07ff */
1023 c &= 0x1f;
1024 if (c1 >= 0x80) more = 1;
1026 else if (c == 0xe0) { /* U+0800 - U+0fff */
1027 c &= 0x0f;
1028 if (c1 >= 0xa0) more = 2;
1030 else if (c < 0xed) { /* U+1000 - U+cfff */
1031 c &= 0x0f;
1032 if (c1 >= 0x80) more = 2;
1034 else if (c == 0xed) { /* U+d000 - U+d7ff */
1035 c &= 0x0f;
1036 if (j == 0 || ((c1 >= 0x80) && (c1 <= 0x9f))) more = 2;
1038 else if (c < 0xf0) { /* U+e000 - U+ffff */
1039 c &= 0x0f;
1040 if (c1 >= 0x80) more = 2;
1042 else if (c == 0xf0) { /* U+10000 - U+3ffff */
1043 c &= 0x07;
1044 if (c1 >= 0x90) more = 3;
1046 else if (c < 0xf3) { /* U+40000 - U+fffff */
1047 c &= 0x07;
1048 if (c1 >= 0x80) more = 3;
1050 #if 0
1051 else if (c == 0xf4) { /* U+100000 - U+10ffff */
1052 c &= 0x07;
1053 if (((c1 >= 0x80) && (c1 <= 0x8f))) more = 3;
1055 #else
1056 else if (c < 0xf8) { /* U+100000 - U+10ffff (and 110000 - 1fffff) */
1057 c &= 0x07;
1058 if ((c1 >= 0x80)) more = 3;
1060 else if (c < 0xfc) { /* ISO 10646 200000 - 3ffffff */
1061 c &= 0x03;
1062 if ((c1 >= 0x80)) more = 4;
1064 else if (c < 0xfe) { /* ISO 10646 4000000 - 7fffffff */
1065 c &= 0x01;
1066 if ((c1 >= 0x80)) more = 5;
1068 #endif
1069 /* fe and ff never valid */
1070 if (more) { /* multi-octet, make sure more to come */
1071 if (!j) return U8G_ENDSTRI;
1072 ret = c; /* continuation needed, save start bits */
1075 } while (more);
1076 if (!(ret & U8G_ERROR)) { /* success return? */
1077 *s = t; /* yes, update pointer */
1078 *i = j; /* and counter */
1080 return ret; /* return value */
1083 /* Return UCS-4 character from named charset string
1084 * Accepts: charset
1085 * pointer to string
1086 * remaining octets in string
1087 * Returns: UCS-4 character with pointer and count updated, negative if error
1089 * Error codes are the same as utf8_get().
1092 unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i)
1094 unsigned char c,c1,ku,ten;
1095 unsigned long ret,d;
1096 unsigned char *t = *s;
1097 unsigned long j = *i;
1098 struct utf8_eucparam *p1,*p2,*p3;
1099 if (j--) c = *t++; /* get first octet */
1100 else return U8G_ENDSTRG; /* empty string */
1101 switch (cs->type) { /* convert if type known */
1102 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
1103 return utf8_get (s,i);
1104 case CT_ASCII: /* 7-bit ASCII no table */
1105 if (c >= 0x80) return U8G_NOTUTF8;
1106 case CT_1BYTE0: /* 1 byte no table */
1107 ret = c; /* identity */
1108 break;
1109 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1110 ret = (c > 0x80) ? ((unsigned short *) cs->tab)[c & BITS7] : c;
1111 break;
1112 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1113 ret = ((unsigned short *) cs->tab)[c];
1114 break;
1116 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1117 if (c & BIT8) {
1118 p1 = (struct utf8_eucparam *) cs->tab;
1119 p2 = p1 + 1;
1120 p3 = p1 + 2;
1121 if (j--) c1 = *t++; /* get second octet */
1122 else return U8G_ENDSTRI;
1123 if (!(c1 & BIT8)) return U8G_NOTUTF8;
1124 switch (c) { /* check 8bit code set */
1125 case EUC_CS2: /* CS2 */
1126 if (p2->base_ku) { /* CS2 set up? */
1127 if (p2->base_ten) { /* yes, multibyte? */
1128 if (j--) c = *t++; /* get second octet */
1129 else return U8G_ENDSTRI;
1130 if ((c & BIT8) &&
1131 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
1132 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) {
1133 ret = ((unsigned short *) p2->tab)[(ku*p2->max_ten) + ten];
1134 break;
1137 else if ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) {
1138 ret = c1 + ((unsigned long) p2->tab);
1139 break;
1142 return U8G_NOTUTF8; /* CS2 not set up or bogus */
1143 case EUC_CS3: /* CS3 */
1144 if (p3->base_ku) { /* CS3 set up? */
1145 if (p3->base_ten) { /* yes, multibyte? */
1146 if (j--) c = *t++; /* get second octet */
1147 else return U8G_ENDSTRI;
1148 if ((c & BIT8) &&
1149 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
1150 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) {
1151 ret = ((unsigned short *) p3->tab)[(ku*p3->max_ten) + ten];
1152 break;
1155 else if ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) {
1156 ret = c1 + ((unsigned long) p3->tab);
1157 break;
1160 return U8G_NOTUTF8; /* CS3 not set up or bogus */
1161 default:
1162 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
1163 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten))
1164 return U8G_NOTUTF8;
1165 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1166 /* special hack for JIS X 0212: merge rows less than 10 */
1167 if ((ret == UBOGON) && ku && (ku < 10) && p3->tab && p3->base_ten)
1168 ret = ((unsigned short *) p3->tab)
1169 [((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
1170 break;
1173 else ret = c; /* ASCII character */
1174 break;
1176 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1177 if (c & BIT8) { /* double-byte character? */
1178 p1 = (struct utf8_eucparam *) cs->tab;
1179 if (j--) c1 = *t++; /* get second octet */
1180 else return U8G_ENDSTRI;
1181 if (((ku = c - p1->base_ku) < p1->max_ku) &&
1182 ((ten = c1 - p1->base_ten) < p1->max_ten))
1183 ret = ((unsigned short *) p1->tab)[(ku*p1->max_ten) + ten];
1184 else return U8G_NOTUTF8;
1186 else ret = c; /* ASCII character */
1187 break;
1188 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1189 if (c & BIT8) { /* double-byte character? */
1190 p1 = (struct utf8_eucparam *) cs->tab;
1191 p2 = p1 + 1;
1192 if (j--) c1 = *t++; /* get second octet */
1193 else return U8G_ENDSTRI;
1194 if (c1 & BIT8) { /* high vs. low plane */
1195 if ((ku = c - p2->base_ku) < p2->max_ku &&
1196 ((ten = c1 - p2->base_ten) < p2->max_ten))
1197 ret = ((unsigned short *) p1->tab)
1198 [(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten];
1199 else return U8G_NOTUTF8;
1201 else if ((ku = c - p1->base_ku) < p1->max_ku &&
1202 ((ten = c1 - p1->base_ten) < p1->max_ten))
1203 ret = ((unsigned short *) p1->tab)
1204 [(ku*(p1->max_ten + p2->max_ten)) + ten];
1205 else return U8G_NOTUTF8;
1207 else ret = c; /* ASCII character */
1208 break;
1209 case CT_SJIS: /* 2 byte Shift-JIS encoded JIS no table */
1210 /* compromise - do yen sign but not overline */
1211 if (!(c & BIT8)) ret = (c == JISROMAN_YEN) ? UCS2_YEN : c;
1212 /* half-width katakana? */
1213 else if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) ret = c + KANA_8;
1214 else { /* Shift-JIS */
1215 if (j--) c1 = *t++; /* get second octet */
1216 else return U8G_ENDSTRI;
1217 SJISTOJIS (c,c1);
1218 ret = JISTOUNICODE (c,c1,ku,ten);
1220 break;
1222 case CT_UCS2: /* 2 byte 16-bit Unicode no table */
1223 ret = c << 8;
1224 if (j--) c = *t++; /* get second octet */
1225 else return U8G_ENDSTRI; /* empty string */
1226 ret |= c;
1227 break;
1228 case CT_UCS4: /* 4 byte 32-bit Unicode no table */
1229 if (c & 0x80) return U8G_NOTUTF8;
1230 if (j < 3) return U8G_ENDSTRI;
1231 j -= 3; /* count three octets */
1232 ret = c << 24;
1233 ret |= (*t++) << 16;
1234 ret |= (*t++) << 8;
1235 ret |= (*t++);
1236 break;
1237 case CT_UTF16: /* variable UTF-16 encoded Unicode no table */
1238 ret = c << 8;
1239 if (j--) c = *t++; /* get second octet */
1240 else return U8G_ENDSTRI; /* empty string */
1241 ret |= c;
1242 /* surrogate? */
1243 if ((ret >= UTF16_SURR) && (ret <= UTF16_MAXSURR)) {
1244 /* invalid first surrogate */
1245 if ((ret > UTF16_SURRHEND) || (j < 2)) return U8G_NOTUTF8;
1246 j -= 2; /* count two octets */
1247 d = (*t++) << 8; /* first octet of second surrogate */
1248 d |= *t++; /* second octet of second surrogate */
1249 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) return U8G_NOTUTF8;
1250 ret = UTF16_BASE + ((ret & UTF16_MASK) << UTF16_SHIFT) +
1251 (d & UTF16_MASK);
1253 break;
1254 default: /* unknown/unsupported character set type */
1255 return U8G_NOTUTF8;
1257 *s = t; /* update pointer and counter */
1258 *i = j;
1259 return ret;
1262 /* Produce charset validity map for BMP
1263 * Accepts: list of charsets to map
1264 * Returns: validity map, indexed by BMP codepoint
1266 * Bit 0x1 is the "not-CJK" character bit
1269 unsigned long *utf8_csvalidmap (char *charsets[])
1271 unsigned short u,*tab;
1272 unsigned int m,ku,ten;
1273 unsigned long i,csi,csb;
1274 struct utf8_eucparam *param,*p2;
1275 char *s;
1276 const CHARSET *cs;
1277 unsigned long *ret = (unsigned long *)
1278 fs_get (i = 0x10000 * sizeof (unsigned long));
1279 memset (ret,0,i); /* zero the entire vector */
1280 /* mark all the non-CJK codepoints */
1281 /* U+0000 - U+2E7F non-CJK */
1282 for (i = 0; i < 0x2E7F; ++i) ret[i] = 0x1;
1283 /* U+2E80 - U+2EFF CJK Radicals Supplement
1284 * U+2F00 - U+2FDF Kangxi Radicals
1285 * U+2FE0 - U+2FEF unassigned
1286 * U+2FF0 - U+2FFF Ideographic Description Characters
1287 * U+3000 - U+303F CJK Symbols and Punctuation
1288 * U+3040 - U+309F Hiragana
1289 * U+30A0 - U+30FF Katakana
1290 * U+3100 - U+312F BoPoMoFo
1291 * U+3130 - U+318F Hangul Compatibility Jamo
1292 * U+3190 - U+319F Kanbun
1293 * U+31A0 - U+31BF BoPoMoFo Extended
1294 * U+31C0 - U+31EF CJK Strokes
1295 * U+31F0 - U+31FF Katakana Phonetic Extensions
1296 * U+3200 - U+32FF Enclosed CJK Letters and Months
1297 * U+3300 - U+33FF CJK Compatibility
1298 * U+3400 - U+4DBF CJK Unified Ideographs Extension A
1299 * U+4DC0 - U+4DFF Yijing Hexagram Symbols
1300 * U+4E00 - U+9FFF CJK Unified Ideographs
1301 * U+A000 - U+A48F Yi Syllables
1302 * U+A490 - U+A4CF Yi Radicals
1303 * U+A700 - U+A71F Modifier Tone Letters
1305 for (i = 0xa720; i < 0xabff; ++i) ret[i] = 0x1;
1306 /* U+AC00 - U+D7FF Hangul Syllables */
1307 for (i = 0xd800; i < 0xf8ff; ++i) ret[i] = 0x1;
1308 /* U+F900 - U+FAFF CJK Compatibility Ideographs */
1309 for (i = 0xfb00; i < 0xfe2f; ++i) ret[i] = 0x1;
1310 /* U+FE30 - U+FE4F CJK Compatibility Forms
1311 * U+FE50 - U+FE6F Small Form Variants (for CNS 11643)
1313 for (i = 0xfe70; i < 0xfeff; ++i) ret[i] = 0x1;
1314 /* U+FF00 - U+FFEF CJK Compatibility Ideographs */
1315 for (i = 0xfff0; i < 0x10000; ++i) ret[i] = 0x1;
1317 /* for each supplied charset */
1318 for (csi = 1; ret && charsets && (s = charsets[csi - 1]); ++csi) {
1319 /* substitute EUC-JP for ISO-2022-JP */
1320 if (!compare_cstring (s,"ISO-2022-JP")) s = "EUC-JP";
1321 /* look up charset */
1322 if (cs = utf8_charset (s)) {
1323 csb = 1 << csi; /* charset bit */
1324 switch (cs->type) {
1325 case CT_ASCII: /* 7-bit ASCII no table */
1326 case CT_1BYTE0: /* 1 byte no table */
1327 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1328 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1329 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1330 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1331 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1332 case CT_SJIS: /* 2 byte Shift-JIS */
1333 /* supported charset type, all ASCII is OK */
1334 for (i = 0; i < 128; ++i) ret[i] |= csb;
1335 break;
1336 default: /* unsupported charset type */
1337 fs_give ((void **) &ret);
1338 break;
1340 /* now do additional operations */
1341 if (ret) switch (cs->type) {
1342 case CT_1BYTE0: /* 1 byte no table */
1343 for (i = 128; i < 256; i++) ret[i] |= csb;
1344 break;
1345 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
1346 for (tab = (unsigned short *) cs->tab,i = 128; i < 256; i++)
1347 if (tab[i & BITS7] != UBOGON) ret[tab[i & BITS7]] |= csb;
1348 break;
1349 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
1350 for (tab = (unsigned short *) cs->tab,i = 0; i < 256; i++)
1351 if (tab[i] != UBOGON) ret[tab[i]] |= csb;
1352 break;
1353 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
1354 for (param = (struct utf8_eucparam *) cs->tab,
1355 tab = (unsigned short *) param->tab, ku = 0;
1356 ku < param->max_ku; ku++)
1357 for (ten = 0; ten < param->max_ten; ten++)
1358 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1359 ret[u] |= csb;
1360 break;
1362 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
1363 for (param = (struct utf8_eucparam *) cs->tab,
1364 tab = (unsigned short *) param->tab, ku = 0;
1365 ku < param->max_ku; ku++)
1366 for (ten = 0; ten < param->max_ten; ten++)
1367 if ((u = tab[(ku * param->max_ten) + ten]) != UBOGON)
1368 ret[u] |= csb;
1369 break;
1370 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
1371 param = (struct utf8_eucparam *) cs->tab;
1372 p2 = param + 1; /* plane 2 parameters */
1373 /* only ten parameters should differ */
1374 if ((param->base_ku != p2->base_ku) || (param->max_ku != p2->max_ku))
1375 fatal ("ku definition error for CT_DBYTE2 charset");
1376 /* total codepoints in each ku */
1377 m = param->max_ten + p2->max_ten;
1378 tab = (unsigned short *) param->tab;
1379 for (ku = 0; ku < param->max_ku; ku++) {
1380 for (ten = 0; ten < param->max_ten; ten++)
1381 if ((u = tab[(ku * m) + ten]) != UBOGON)
1382 ret[u] |= csb;
1383 for (ten = 0; ten < p2->max_ten; ten++)
1384 if ((u = tab[(ku * m) + param->max_ten + ten]) != UBOGON)
1385 ret[u] |= csb;
1387 break;
1388 case CT_SJIS: /* 2 byte Shift-JIS */
1389 for (ku = 0; ku < MAX_JIS0208_KU; ku++)
1390 for (ten = 0; ten < MAX_JIS0208_TEN; ten++)
1391 if ((u = jis0208tab[ku][ten]) != UBOGON) ret[u] |= csb;
1392 /* JIS hankaku katakana */
1393 for (u = 0; u < (MAX_KANA_8 - MIN_KANA_8); u++)
1394 ret[UCS2_KATAKANA + u] |= csb;
1395 break;
1398 /* invalid charset, punt */
1399 else fs_give ((void **) &ret);
1401 return ret;
1404 /* Infer charset from unlabelled sized text
1405 * Accepts: sized text
1406 * Returns: charset if one inferred, or NIL if unknown
1409 const CHARSET *utf8_infercharset (SIZEDTEXT *src)
1411 long iso2022jp = NIL;
1412 long eightbit = NIL;
1413 unsigned long i;
1414 /* look for ISO 2022 */
1415 if (src) for (i = 0; i < src->size; i++) {
1416 /* ESC sequence? */
1417 if ((src->data[i] == I2C_ESC) && (++i < src->size)) switch (src->data[i]) {
1418 case I2C_MULTI: /* yes, multibyte? */
1419 if (++i < src->size) switch (src->data[i]) {
1420 case I2CS_94x94_JIS_OLD: /* JIS X 0208-1978 */
1421 case I2CS_94x94_JIS_NEW: /* JIS X 0208-1983 */
1422 case I2CS_94x94_JIS_EXT: /* JIS X 0212-1990 (kludge...) */
1423 iso2022jp = T; /* found an ISO-2022-JP sequence */
1424 break;
1425 default: /* other multibyte */
1426 return NIL; /* definitely invalid */
1428 break;
1429 case I2C_G0_94: /* single byte */
1430 if (++i < src->size) switch (src->data[i]) {
1431 case I2CS_94_JIS_BUGROM: /* in case old buggy software */
1432 case I2CS_94_JIS_ROMAN: /* JIS X 0201-1976 left half */
1433 case I2CS_94_ASCII: /* ASCII */
1434 case I2CS_94_BRITISH: /* good enough for gov't work */
1435 break;
1436 default: /* other 94 single byte */
1437 return NIL; /* definitely invalid */
1440 /* if possible UTF-8 and not ISO-2022-JP */
1441 else if (!iso2022jp && (eightbit >= 0) && (src->data[i] & BIT8) &&
1442 (eightbit = utf8_validate (src->data + i,src->size - i)) > 0)
1443 i += eightbit - 1; /* skip past all but last of UTF-8 char */
1445 /* ISO-2022-JP overrides other guesses */
1446 if (iso2022jp) return utf8_charset ("ISO-2022-JP");
1447 if (eightbit > 0) return utf8_charset ("UTF-8");
1448 return eightbit ? NIL : utf8_charset ("US-ASCII");
1452 /* Validate that character at this position is UTF-8
1453 * Accepts: string pointer
1454 * size of remaining string
1455 * Returns: size of UTF-8 character in octets or -1 if not UTF-8
1458 long utf8_validate (unsigned char *s,unsigned long i)
1460 unsigned long j = i;
1461 return (utf8_get (&s,&i) & U8G_ERROR) ? -1 : j - i;
1464 /* Convert ISO 8859-1 to UTF-8
1465 * Accepts: source sized text
1466 * pointer to return sized text
1467 * canonicalization function
1470 void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1472 unsigned long i;
1473 unsigned char *s;
1474 unsigned int c;
1475 for (ret->size = i = 0; i < text->size;) {
1476 c = text->data[i++];
1477 UTF8_COUNT_BMP (ret->size,c,cv,de)
1479 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1480 for (i = 0; i < text->size;) {
1481 c = text->data[i++];
1482 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1487 /* Convert single byte ASCII+8bit character set sized text to UTF-8
1488 * Accepts: source sized text
1489 * pointer to return sized text
1490 * conversion table
1491 * canonicalization function
1494 void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1495 ucs4de_t de)
1497 unsigned long i;
1498 unsigned char *s;
1499 unsigned int c;
1500 unsigned short *tbl = (unsigned short *) tab;
1501 for (ret->size = i = 0; i < text->size;) {
1502 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1503 UTF8_COUNT_BMP (ret->size,c,cv,de)
1505 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1506 for (i = 0; i < text->size;) {
1507 if ((c = text->data[i++]) & BIT8) c = tbl[c & BITS7];
1508 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1512 /* Convert single byte 8bit character set sized text to UTF-8
1513 * Accepts: source sized text
1514 * pointer to return sized text
1515 * conversion table
1516 * canonicalization function
1519 void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1520 ucs4de_t de)
1522 unsigned long i;
1523 unsigned char *s;
1524 unsigned int c;
1525 unsigned short *tbl = (unsigned short *) tab;
1526 for (ret->size = i = 0; i < text->size;) {
1527 c = tbl[text->data[i++]];
1528 UTF8_COUNT_BMP (ret->size,c,cv,de)
1530 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
1531 for (i = 0; i < text->size;) {
1532 c = tbl[text->data[i++]];
1533 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1537 /* Convert EUC sized text to UTF-8
1538 * Accepts: source sized text
1539 * pointer to return sized text
1540 * EUC parameter table
1541 * canonicalization function
1544 void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1545 ucs4de_t de)
1547 unsigned long i;
1548 unsigned char *s;
1549 unsigned int pass,c,c1,ku,ten;
1550 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1551 struct utf8_eucparam *p2 = p1 + 1;
1552 struct utf8_eucparam *p3 = p1 + 2;
1553 unsigned short *t1 = (unsigned short *) p1->tab;
1554 unsigned short *t2 = (unsigned short *) p2->tab;
1555 unsigned short *t3 = (unsigned short *) p3->tab;
1556 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
1557 for (i = 0; i < text->size;) {
1558 /* not CS0? */
1559 if ((c = text->data[i++]) & BIT8) {
1560 /* yes, must have another high byte */
1561 if ((i >= text->size) || !((c1 = text->data[i++]) & BIT8))
1562 c = UBOGON; /* out of space or bogon */
1563 else switch (c) { /* check 8bit code set */
1564 case EUC_CS2: /* CS2 */
1565 if (p2->base_ku) { /* CS2 set up? */
1566 if (p2->base_ten) /* yes, multibyte? */
1567 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
1568 ((ku = (c1 & BITS7) - p2->base_ku) < p2->max_ku) &&
1569 ((ten = (c & BITS7) - p2->base_ten) < p2->max_ten)) ?
1570 t2[(ku*p2->max_ten) + ten] : UBOGON;
1571 else c = ((c1 >= p2->base_ku) && (c1 < p2->max_ku)) ?
1572 c1 + ((unsigned long) p2->tab) : UBOGON;
1574 else { /* CS2 not set up */
1575 c = UBOGON; /* swallow byte, say bogon */
1576 if (i < text->size) i++;
1578 break;
1579 case EUC_CS3: /* CS3 */
1580 if (p3->base_ku) { /* CS3 set up? */
1581 if (p3->base_ten) /* yes, multibyte? */
1582 c = ((i < text->size) && ((c = text->data[i++]) & BIT8) &&
1583 ((ku = (c1 & BITS7) - p3->base_ku) < p3->max_ku) &&
1584 ((ten = (c & BITS7) - p3->base_ten) < p3->max_ten)) ?
1585 t3[(ku*p3->max_ten) + ten] : UBOGON;
1586 else c = ((c1 >= p3->base_ku) && (c1 < p3->max_ku)) ?
1587 c1 + ((unsigned long) p3->tab) : UBOGON;
1589 else { /* CS3 not set up */
1590 c = UBOGON; /* swallow byte, say bogon */
1591 if (i < text->size) i++;
1593 break;
1595 default:
1596 if (((ku = (c & BITS7) - p1->base_ku) >= p1->max_ku) ||
1597 ((ten = (c1 & BITS7) - p1->base_ten) >= p1->max_ten)) c = UBOGON;
1598 else if (((c = t1[(ku*p1->max_ten) + ten]) == UBOGON) &&
1599 /* special hack for JIS X 0212: merge rows less than 10 */
1600 ku && (ku < 10) && t3 && p3->base_ten)
1601 c = t3[((ku - (p3->base_ku - p1->base_ku))*p3->max_ten) + ten];
1604 /* convert if second pass */
1605 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
1606 else UTF8_COUNT_BMP (ret->size,c,cv,de);
1608 if (!pass) (s = ret->data = (unsigned char *)
1609 fs_get (ret->size + 1))[ret->size] =NIL;
1614 /* Convert ASCII + double-byte sized text to UTF-8
1615 * Accepts: source sized text
1616 * pointer to return sized text
1617 * conversion table
1618 * canonicalization function
1621 void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1622 ucs4de_t de)
1624 unsigned long i;
1625 unsigned char *s;
1626 unsigned int c,c1,ku,ten;
1627 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1628 unsigned short *t1 = (unsigned short *) p1->tab;
1629 for (ret->size = i = 0; i < text->size;) {
1630 if ((c = text->data[i++]) & BIT8) {
1631 /* special hack for GBK: 0x80 is Euro */
1632 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
1633 else c = ((i < text->size) && (c1 = text->data[i++]) &&
1634 ((ku = c - p1->base_ku) < p1->max_ku) &&
1635 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1636 t1[(ku*p1->max_ten) + ten] : UBOGON;
1638 UTF8_COUNT_BMP (ret->size,c,cv,de)
1640 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
1641 for (i = 0; i < text->size;) {
1642 if ((c = text->data[i++]) & BIT8) {
1643 /* special hack for GBK: 0x80 is Euro */
1644 if ((c == 0x80) && (t1 == (unsigned short *) gb2312tab)) c = UCS2_EURO;
1645 else c = ((i < text->size) && (c1 = text->data[i++]) &&
1646 ((ku = c - p1->base_ku) < p1->max_ku) &&
1647 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1648 t1[(ku*p1->max_ten) + ten] : UBOGON;
1650 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
1654 /* Convert ASCII + double byte 2 plane sized text to UTF-8
1655 * Accepts: source sized text
1656 * pointer to return sized text
1657 * conversion table
1658 * canonicalization function
1661 void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv,
1662 ucs4de_t de)
1664 unsigned long i;
1665 unsigned char *s;
1666 unsigned int c,c1,ku,ten;
1667 struct utf8_eucparam *p1 = (struct utf8_eucparam *) tab;
1668 struct utf8_eucparam *p2 = p1 + 1;
1669 unsigned short *t = (unsigned short *) p1->tab;
1670 for (ret->size = i = 0; i < text->size;) {
1671 if ((c = text->data[i++]) & BIT8) {
1672 if ((i >= text->size) || !(c1 = text->data[i++]))
1673 c = UBOGON; /* out of space or bogon */
1674 else if (c1 & BIT8) /* high vs. low plane */
1675 c = ((ku = c - p2->base_ku) < p2->max_ku &&
1676 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
1677 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
1678 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
1679 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1680 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
1682 UTF8_COUNT_BMP (ret->size,c,cv,de)
1684 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
1685 for (i = 0; i < text->size;) {
1686 if ((c = text->data[i++]) & BIT8) {
1687 if ((i >= text->size) || !(c1 = text->data[i++]))
1688 c = UBOGON; /* out of space or bogon */
1689 else if (c1 & BIT8) /* high vs. low plane */
1690 c = ((ku = c - p2->base_ku) < p2->max_ku &&
1691 ((ten = c1 - p2->base_ten) < p2->max_ten)) ?
1692 t[(ku*(p1->max_ten + p2->max_ten)) + p1->max_ten + ten] :UBOGON;
1693 else c = ((ku = c - p1->base_ku) < p1->max_ku &&
1694 ((ten = c1 - p1->base_ten) < p1->max_ten)) ?
1695 t[(ku*(p1->max_ten + p2->max_ten)) + ten] : UBOGON;
1697 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
#ifdef JISTOUNICODE		/* Japanese */
/* Convert Shift JIS sized text to UTF-8
 * Accepts: source sized text
 *	    pointer to return sized text
 *	    canonicalization function
 *	    decomposition function
 *
 * A lead octet with no trail octet at the end of the text becomes UBOGON.
 * Sizing and writing share one decode path, so the allocated size always
 * matches what is written.
 */

void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,
		     ucs4de_t de)
{
  unsigned long i;
  unsigned char *s = NIL;
  unsigned int pass,c,c1,ku,ten;
  ret->size = 0;
  /* pass 0 sizes the output, pass 1 writes it */
  for (pass = 0; pass <= 1; pass++) {
    for (i = 0; i < text->size;) {
      if ((c = text->data[i++]) & BIT8) {
				/* half-width katakana */
	if ((c >= MIN_KANA_8) && (c < MAX_KANA_8)) c += KANA_8;
	/* truncated character: never read past the end of the text */
	else if (i >= text->size) c = UBOGON;
	else {			/* Shift-JIS */
	  c1 = text->data[i++];
	  SJISTOJIS (c,c1);
	  c = JISTOUNICODE (c,c1,ku,ten);
	}
      }
				/* compromise - do yen sign but not overline */
      else if (c == JISROMAN_YEN) c = UCS2_YEN;
      if (pass) {		/* convert UCS-2 to UTF-8 */
	UTF8_WRITE_BMP (s,c,cv,de)
      }
      else {
	UTF8_COUNT_BMP (ret->size,c,cv,de)
      }
    }
    if (!pass) {		/* size known, get the NUL-terminated buffer */
      s = ret->data = (unsigned char *) fs_get (ret->size + 1);
      s[ret->size] = NIL;
    }
  }
}
#endif
1747 /* Convert ISO-2022 sized text to UTF-8
1748 * Accepts: source sized text
1749 * pointer to returned sized text
1750 * canonicalization function
1753 void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
1755 unsigned long i;
1756 unsigned char *s;
1757 unsigned int pass,state,c,co,gi,gl,gr,g[4],ku,ten;
1758 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
1759 gi = 0; /* quell compiler warnings */
1760 state = I2S_CHAR; /* initialize engine */
1761 g[0]= g[2] = I2CS_ASCII; /* G0 and G2 are ASCII */
1762 g[1]= g[3] = I2CS_ISO8859_1;/* G1 and G3 are ISO-8850-1 */
1763 gl = I2C_G0; gr = I2C_G1; /* left is G0, right is G1 */
1764 for (i = 0; i < text->size;) {
1765 c = text->data[i++];
1766 switch (state) { /* dispatch based upon engine state */
1767 case I2S_ESC: /* ESC seen */
1768 switch (c) { /* process intermediate character */
1769 case I2C_MULTI: /* multibyte character? */
1770 state = I2S_MUL; /* mark multibyte flag seen */
1771 break;
1772 case I2C_SS2: /* single shift GL to G2 */
1773 case I2C_SS2_ALT: /* Taiwan SeedNet */
1774 gl |= I2C_SG2;
1775 break;
1776 case I2C_SS3: /* single shift GL to G3 */
1777 case I2C_SS3_ALT: /* Taiwan SeedNet */
1778 gl |= I2C_SG3;
1779 break;
1780 case I2C_LS2: /* shift GL to G2 */
1781 gl = I2C_G2;
1782 break;
1783 case I2C_LS3: /* shift GL to G3 */
1784 gl = I2C_G3;
1785 break;
1786 case I2C_LS1R: /* shift GR to G1 */
1787 gr = I2C_G1;
1788 break;
1789 case I2C_LS2R: /* shift GR to G2 */
1790 gr = I2C_G2;
1791 break;
1792 case I2C_LS3R: /* shift GR to G3 */
1793 gr = I2C_G3;
1794 break;
1795 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
1796 g[gi = c - I2C_G0_94] = (state == I2S_MUL) ? I2CS_94x94 : I2CS_94;
1797 state = I2S_INT; /* ready for character set */
1798 break;
1799 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
1800 g[gi = c - I2C_G0_96] = (state == I2S_MUL) ? I2CS_96x96 : I2CS_96;
1801 state = I2S_INT; /* ready for character set */
1802 break;
1803 default: /* bogon */
1804 if (pass) *s++ = I2C_ESC,*s++ = c;
1805 else ret->size += 2;
1806 state = I2S_CHAR; /* return to previous state */
1808 break;
1810 case I2S_MUL: /* ESC $ */
1811 switch (c) { /* process multibyte intermediate character */
1812 case I2C_G0_94: case I2C_G1_94: case I2C_G2_94: case I2C_G3_94:
1813 g[gi = c - I2C_G0_94] = I2CS_94x94;
1814 state = I2S_INT; /* ready for character set */
1815 break;
1816 case I2C_G0_96: case I2C_G1_96: case I2C_G2_96: case I2C_G3_96:
1817 g[gi = c - I2C_G0_96] = I2CS_96x96;
1818 state = I2S_INT; /* ready for character set */
1819 break;
1820 default: /* probably omitted I2CS_94x94 */
1821 g[gi = I2C_G0] = I2CS_94x94 | c;
1822 state = I2S_CHAR; /* return to character state */
1824 break;
1825 case I2S_INT:
1826 state = I2S_CHAR; /* return to character state */
1827 g[gi] |= c; /* set character set */
1828 break;
1830 case I2S_CHAR: /* character data */
1831 switch (c) {
1832 case I2C_ESC: /* ESC character */
1833 state = I2S_ESC; /* see if ISO-2022 prefix */
1834 break;
1835 case I2C_SI: /* shift GL to G0 */
1836 gl = I2C_G0;
1837 break;
1838 case I2C_SO: /* shift GL to G1 */
1839 gl = I2C_G1;
1840 break;
1841 case I2C_SS2_ALT: /* single shift GL to G2 */
1842 case I2C_SS2_ALT_7:
1843 gl |= I2C_SG2;
1844 break;
1845 case I2C_SS3_ALT: /* single shift GL to G3 */
1846 case I2C_SS3_ALT_7:
1847 gl |= I2C_SG3;
1848 break;
1850 default: /* ordinary character */
1851 co = c; /* note original character */
1852 if (gl & (3 << 2)) { /* single shifted? */
1853 gi = g[gl >> 2]; /* get shifted character set */
1854 gl &= 0x3; /* cancel shift */
1856 /* select left or right half */
1857 else gi = (c & BIT8) ? g[gr] : g[gl];
1858 c &= BITS7; /* make 7-bit */
1859 switch (gi) { /* interpret in character set */
1860 case I2CS_ASCII: /* ASCII */
1861 break; /* easy! */
1862 case I2CS_BRITISH: /* British ASCII */
1863 /* Pound sterling sign */
1864 if (c == BRITISH_POUNDSTERLING) c = UCS2_POUNDSTERLING;
1865 break;
1866 case I2CS_JIS_ROMAN: /* JIS Roman */
1867 case I2CS_JIS_BUGROM: /* old bugs */
1868 switch (c) { /* two exceptions to ASCII */
1869 case JISROMAN_YEN: /* Yen sign */
1870 c = UCS2_YEN;
1871 break;
1872 /* overline */
1873 case JISROMAN_OVERLINE:
1874 c = UCS2_OVERLINE;
1875 break;
1877 break;
1878 case I2CS_JIS_KANA: /* JIS hankaku katakana */
1879 if ((c >= MIN_KANA_7) && (c < MAX_KANA_7)) c += KANA_7;
1880 break;
1882 case I2CS_ISO8859_1: /* Latin-1 (West European) */
1883 c |= BIT8; /* just turn on high bit */
1884 break;
1885 case I2CS_ISO8859_2: /* Latin-2 (Czech, Slovak) */
1886 c = iso8859_2tab[c];
1887 break;
1888 case I2CS_ISO8859_3: /* Latin-3 (Dutch, Turkish) */
1889 c = iso8859_3tab[c];
1890 break;
1891 case I2CS_ISO8859_4: /* Latin-4 (Scandinavian) */
1892 c = iso8859_4tab[c];
1893 break;
1894 case I2CS_ISO8859_5: /* Cyrillic */
1895 c = iso8859_5tab[c];
1896 break;
1897 case I2CS_ISO8859_6: /* Arabic */
1898 c = iso8859_6tab[c];
1899 break;
1900 case I2CS_ISO8859_7: /* Greek */
1901 c = iso8859_7tab[c];
1902 break;
1903 case I2CS_ISO8859_8: /* Hebrew */
1904 c = iso8859_8tab[c];
1905 break;
1906 case I2CS_ISO8859_9: /* Latin-5 (Finnish, Portuguese) */
1907 c = iso8859_9tab[c];
1908 break;
1909 case I2CS_TIS620: /* Thai */
1910 c = tis620tab[c];
1911 break;
1912 case I2CS_ISO8859_10: /* Latin-6 (Northern Europe) */
1913 c = iso8859_10tab[c];
1914 break;
1915 case I2CS_ISO8859_13: /* Latin-7 (Baltic) */
1916 c = iso8859_13tab[c];
1917 break;
1918 case I2CS_VSCII: /* Vietnamese */
1919 c = visciitab[c];
1920 break;
1921 case I2CS_ISO8859_14: /* Latin-8 (Celtic) */
1922 c = iso8859_14tab[c];
1923 break;
1924 case I2CS_ISO8859_15: /* Latin-9 (Euro) */
1925 c = iso8859_15tab[c];
1926 break;
1927 case I2CS_ISO8859_16: /* Latin-10 (Baltic) */
1928 c = iso8859_16tab[c];
1929 break;
1931 default: /* all other character sets */
1932 /* multibyte character set */
1933 if ((gi & I2CS_MUL) && !(c & BIT8) && isgraph (c)) {
1934 c = (i < text->size) ? text->data[i++] : 0;
1935 switch (gi) {
1936 #ifdef GBTOUNICODE
1937 case I2CS_GB: /* GB 2312 */
1938 co |= BIT8; /* make into EUC */
1939 c |= BIT8;
1940 c = GBTOUNICODE (co,c,ku,ten);
1941 break;
1942 #endif
1943 #ifdef JISTOUNICODE
1944 case I2CS_JIS_OLD:/* JIS X 0208-1978 */
1945 case I2CS_JIS_NEW:/* JIS X 0208-1983 */
1946 c = JISTOUNICODE (co,c,ku,ten);
1947 break;
1948 #endif
1949 #ifdef JIS0212TOUNICODE
1950 case I2CS_JIS_EXT:/* JIS X 0212-1990 */
1951 c = JIS0212TOUNICODE (co,c,ku,ten);
1952 break;
1953 #endif
1954 #ifdef KSCTOUNICODE
1955 case I2CS_KSC: /* KSC 5601 */
1956 co |= BIT8; /* make into EUC */
1957 c |= BIT8;
1958 c = KSCTOUNICODE (co,c,ku,ten);
1959 break;
1960 #endif
1961 #ifdef CNS1TOUNICODE
1962 case I2CS_CNS1: /* CNS 11643 plane 1 */
1963 c = CNS1TOUNICODE (co,c,ku,ten);
1964 break;
1965 #endif
1966 #ifdef CNS2TOUNICODE
1967 case I2CS_CNS2: /* CNS 11643 plane 2 */
1968 c = CNS2TOUNICODE (co,c,ku,ten);
1969 break;
1970 #endif
1971 #ifdef CNS3TOUNICODE
1972 case I2CS_CNS3: /* CNS 11643 plane 3 */
1973 c = CNS3TOUNICODE (co,c,ku,ten);
1974 break;
1975 #endif
1976 #ifdef CNS4TOUNICODE
1977 case I2CS_CNS4: /* CNS 11643 plane 4 */
1978 c = CNS4TOUNICODE (co,c,ku,ten);
1979 break;
1980 #endif
1981 #ifdef CNS5TOUNICODE
1982 case I2CS_CNS5: /* CNS 11643 plane 5 */
1983 c = CNS5TOUNICODE (co,c,ku,ten);
1984 break;
1985 #endif
1986 #ifdef CNS6TOUNICODE
1987 case I2CS_CNS6: /* CNS 11643 plane 6 */
1988 c = CNS6TOUNICODE (co,c,ku,ten);
1989 break;
1990 #endif
1991 #ifdef CNS7TOUNICODE
1992 case I2CS_CNS7: /* CNS 11643 plane 7 */
1993 c = CNS7TOUNICODE (co,c,ku,ten);
1994 break;
1995 #endif
1996 default: /* unknown multibyte, treat as UCS-2 */
1997 c |= (co << 8); /* wrong, but nothing else to do */
1998 break;
2001 else c = co; /* unknown single byte, treat as 8859-1 */
2003 /* convert if second pass */
2004 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
2005 else UTF8_COUNT_BMP (ret->size,c,cv,de);
2009 if (!pass) (s = ret->data = (unsigned char *)
2010 fs_get (ret->size + 1))[ret->size] = NIL;
2011 else if (((unsigned long) (s - ret->data)) != ret->size)
2012 fatal ("ISO-2022 to UTF-8 botch");
2016 /* Convert UTF-7 sized text to UTF-8
2017 * Accepts: source sized text
2018 * pointer to returned sized text
2019 * canonicalization function
2022 void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2024 unsigned long i;
2025 unsigned char *s;
2026 unsigned int c,c1,d,uc,pass,e,e1,state,surrh;
2027 for (pass = 0,s = NIL,ret->size = 0; pass <= 1; pass++) {
2028 c1 = d = uc = e = e1 = 0;
2029 for (i = 0,state = NIL; i < text->size;) {
2030 c = text->data[i++]; /* get next byte */
2031 switch (state) {
2032 case U7_PLUS: /* previous character was + */
2033 if (c == '-') { /* +- means textual + */
2034 c = '+';
2035 state = U7_ASCII; /* revert to ASCII */
2036 break;
2038 state = U7_UNICODE; /* enter Unicode state */
2039 e = e1 = 0; /* initialize Unicode quantum position */
2040 case U7_UNICODE: /* Unicode state */
2041 if (c == '-') state = U7_MINUS;
2042 else { /* decode Unicode */
2043 /* don't use isupper/islower since this is ASCII only */
2044 if ((c >= 'A') && (c <= 'Z')) c -= 'A';
2045 else if ((c >= 'a') && (c <= 'z')) c -= 'a' - 26;
2046 else if (isdigit (c)) c -= '0' - 52;
2047 else if (c == '+') c = 62;
2048 else if (c == '/') c = 63;
2049 else state = U7_ASCII;/* end of modified BASE64 */
2051 break;
2052 case U7_MINUS: /* previous character was absorbed - */
2053 state = U7_ASCII; /* revert to ASCII */
2054 case U7_ASCII: /* ASCII state */
2055 if (c == '+') state = U7_PLUS;
2056 break;
2059 switch (state) { /* store character if in character mode */
2060 case U7_UNICODE: /* Unicode */
2061 switch (e++) { /* install based on BASE64 state */
2062 case 0:
2063 c1 = c << 2; /* byte 1: high 6 bits */
2064 break;
2065 case 1:
2066 d = c1 | (c >> 4); /* byte 1: low 2 bits */
2067 c1 = c << 4; /* byte 2: high 4 bits */
2068 break;
2069 case 2:
2070 d = c1 | (c >> 2); /* byte 2: low 4 bits */
2071 c1 = c << 6; /* byte 3: high 2 bits */
2072 break;
2073 case 3:
2074 d = c | c1; /* byte 3: low 6 bits */
2075 e = 0; /* reinitialize mechanism */
2076 break;
2078 if (e == 1) break; /* done if first BASE64 state */
2079 if (!e1) { /* first byte of UCS-2 character */
2080 uc = (d & 0xff) << 8; /* note first byte */
2081 e1 = T; /* enter second UCS-2 state */
2082 break; /* done */
2084 c = uc | (d & 0xff); /* build UCS-2 character */
2085 e1 = NIL; /* back to first UCS-2 state, drop in */
2086 /* surrogate pair? */
2087 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2088 /* save high surrogate for later */
2089 if (c < UTF16_SURRL) surrh = c;
2090 else c = UTF16_BASE + ((surrh & UTF16_MASK) << UTF16_SHIFT) +
2091 (c & UTF16_MASK);
2092 break; /* either way with surrogates, we're done */
2094 case U7_ASCII: /* just install if ASCII */
2095 /* convert if second pass */
2096 if (pass) UTF8_WRITE_BMP (s,c,cv,de)
2097 else UTF8_COUNT_BMP (ret->size,c,cv,de);
2100 if (!pass) (s = ret->data = (unsigned char *)
2101 fs_get (ret->size + 1))[ret->size] = NIL;
2102 else if (((unsigned long) (s - ret->data)) != ret->size)
2103 fatal ("UTF-7 to UTF-8 botch");
2108 /* Convert UTF-8 sized text to UTF-8
2109 * Accepts: source sized text
2110 * pointer to returned sized text
2111 * canonicalization function
2114 void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2116 unsigned long i,c;
2117 unsigned char *s,*t;
2118 for (ret->size = 0, t = text->data, i = text->size; i;) {
2119 if ((c = utf8_get (&t,&i)) & U8G_ERROR) {
2120 ret->data = text->data; /* conversion failed */
2121 ret->size = text->size;
2122 return;
2124 UTF8_COUNT (ret->size,c,cv,de)
2126 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] =NIL;
2127 for (t = text->data, i = text->size; i;) {
2128 c = utf8_get (&t,&i);
2129 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2131 if (((unsigned long) (s - ret->data)) != ret->size)
2132 fatal ("UTF-8 to UTF-8 botch");
2135 /* Convert UCS-2 sized text to UTF-8
2136 * Accepts: source sized text
2137 * pointer to returned sized text
2138 * canonicalization function
2141 void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2143 unsigned long i;
2144 unsigned char *s,*t;
2145 unsigned int c;
2146 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
2147 c = *t++ << 8;
2148 c |= *t++;
2149 UTF8_COUNT_BMP (ret->size,c,cv,de);
2151 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2152 for (t = text->data, i = text->size / 2; i; --i) {
2153 c = *t++ << 8;
2154 c |= *t++;
2155 UTF8_WRITE_BMP (s,c,cv,de) /* convert UCS-2 to UTF-8 */
2157 if (((unsigned long) (s - ret->data)) != ret->size)
2158 fatal ("UCS-2 to UTF-8 botch");
2162 /* Convert UCS-4 sized text to UTF-8
2163 * Accepts: source sized text
2164 * pointer to returned sized text
2165 * canonicalization function
2168 void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2170 unsigned long i;
2171 unsigned char *s,*t;
2172 unsigned long c;
2173 for (ret->size = 0, t = text->data, i = text->size / 4; i; --i) {
2174 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
2175 UTF8_COUNT (ret->size,c,cv,de);
2177 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2178 for (t = text->data, i = text->size / 2; i; --i) {
2179 c = *t++ << 24; c |= *t++ << 16; c |= *t++ << 8; c |= *t++;
2180 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2182 if (((unsigned long) (s - ret->data)) != ret->size)
2183 fatal ("UCS-4 to UTF-8 botch");
2186 /* Convert UTF-16 sized text to UTF-8
2187 * Accepts: source sized text
2188 * pointer to returned sized text
2189 * canonicalization function
2192 void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de)
2194 unsigned long i;
2195 unsigned char *s,*t;
2196 unsigned long c,d;
2197 for (ret->size = 0, t = text->data, i = text->size / 2; i; --i) {
2198 c = *t++ << 8;
2199 c |= *t++;
2200 /* possible surrogate? */
2201 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2202 /* invalid first surrogate */
2203 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
2204 else { /* get second surrogate */
2205 d = *t++ << 8;
2206 d |= *t++;
2207 --i; /* swallowed another 16-bits */
2208 /* invalid second surrogate */
2209 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
2210 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
2211 (d & UTF16_MASK);
2214 UTF8_COUNT (ret->size,c,cv,de);
2216 (s = ret->data = (unsigned char *) fs_get (ret->size + 1))[ret->size] = NIL;
2217 for (t = text->data, i = text->size / 2; i; --i) {
2218 c = *t++ << 8;
2219 c |= *t++;
2220 /* possible surrogate? */
2221 if ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR)) {
2222 /* invalid first surrogate */
2223 if ((c > UTF16_SURRHEND) || !i) c = UBOGON;
2224 else { /* get second surrogate */
2225 d = *t++ << 8;
2226 d |= *t++;
2227 --i; /* swallowed another 16-bits */
2228 /* invalid second surrogate */
2229 if ((d < UTF16_SURRL) || (d > UTF16_SURRLEND)) c = UBOGON;
2230 else c = UTF16_BASE + ((c & UTF16_MASK) << UTF16_SHIFT) +
2231 (d & UTF16_MASK);
2234 UTF8_WRITE (s,c,cv,de) /* convert UCS-4 to UTF-8 */
2236 if (((unsigned long) (s - ret->data)) != ret->size)
2237 fatal ("UTF-16 to UTF-8 botch");
/* Size of UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: character
 * Returns: size (0 means bogon)
 *
 * Use UTF8_SIZE macro if known to be in the BMP
 */

unsigned long utf8_size (unsigned long c)
{
                                /* limit[n] is the first value that no longer
                                 * fits in n + 1 octets
                                 */
  static const unsigned long limit[6] = {
    0x80UL,0x800UL,0x10000UL,0x200000UL,0x4000000UL,0x80000000UL
  };
  unsigned long n;
  for (n = 0; n < 6; n++) if (c < limit[n]) return n + 1;
  return 0;                     /* too large to encode */
}
/* Put UCS-4 character, possibly not in BMP, as UTF-8 octets
 * Accepts: destination string pointer
 *          character
 * Returns: updated destination pointer
 *
 * Use UTF8_PUT_BMP macro if known to be in the BMP
 *
 * Nothing is stored, and the pointer is returned unchanged, when
 * utf8_size() reports the character as a bogon (size 0).
 */

unsigned char *utf8_put (unsigned char *s,unsigned long c)
{
                                /* lead octet marker, indexed by size - 1 */
  static const unsigned char lead[6] = {0x00,0xc0,0xe0,0xf0,0xf8,0xfc};
  unsigned long size = utf8_size (c);
  unsigned long k;
  if (!size) return s;          /* bogon, store nothing */
                                /* trailing octets, last one first, each
                                 * taking the next low 6 bits
                                 */
  for (k = size - 1; k; --k) {
    s[k] = 0x80 | (unsigned char) (c & 0x3f);
    c >>= 6;
  }
                                /* what remains goes in the lead octet */
  s[0] = lead[size - 1] | (unsigned char) (c & 0x7f);
  return s + size;
}
2294 /* Return title case of a fixed-width UCS-4 character
2295 * Accepts: character
2296 * Returns: title case of character
2299 unsigned long ucs4_titlecase (unsigned long c)
2301 if (c <= UCS4_TMAPMAX) return ucs4_tmaptab[c];
2302 if (c < UCS4_TMAPHIMIN) return c;
2303 if (c <= UCS4_TMAPHIMAX) return c - UCS4_TMAPHIMAP;
2304 if (c < UCS4_TMAPDESERETMIN) return c;
2305 if (c <= UCS4_TMAPDESERETMAX) return c - UCS4_TMAPDESERETMAP;
2306 return c;
2310 /* Return width of a fixed-width UCS-4 character in planes 0-2
2311 * Accepts: character
2312 * Returns: width (0, 1, 2) or negative error condition if not valid
2315 long ucs4_width (unsigned long c)
2317 long ret;
2318 /* out of range, not-a-char, or surrogates */
2319 if ((c > UCS4_MAXUNICODE) || ((c & 0xfffe) == 0xfffe) ||
2320 ((c >= UTF16_SURR) && (c <= UTF16_MAXSURR))) ret = U4W_NOTUNCD;
2321 /* private-use */
2322 else if (c >= UCS4_PVTBASE) ret = U4W_PRIVATE;
2323 /* SSP are not printing characters */
2324 else if (c >= UCS4_SSPBASE) ret = U4W_SSPCHAR;
2325 /* unassigned planes */
2326 else if (c >= UCS4_UNABASE) ret = U4W_UNASSGN;
2327 /* SIP and reserved plane 3 are wide */
2328 else if (c >= UCS4_SIPBASE) ret = 2;
2329 #if (UCS4_WIDLEN != UCS4_SIPBASE)
2330 #error "UCS4_WIDLEN != UCS4_SIPBASE"
2331 #endif
2332 /* C0/C1 controls */
2333 else if ((c <= UCS2_C0CONTROLEND) ||
2334 ((c >= UCS2_C1CONTROL) && (c <= UCS2_C1CONTROLEND)))
2335 ret = U4W_CONTROL;
2336 /* BMP and SMP get value from table */
2337 else switch (ret = (ucs4_widthtab[(c >> 2)] >> ((3 - (c & 0x3)) << 1)) &0x3){
2338 case 0: /* zero-width */
2339 if (c == 0x00ad) ret = 1; /* force U+00ad (SOFT HYPHEN) to width 1 */
2340 case 1: /* single-width */
2341 case 2: /* double-width */
2342 break;
2343 case 3: /* ambiguous width */
2344 ret = (c >= 0x2100) ? 2 : 1;/* need to do something better than this */
2345 break;
2347 return ret;
2350 /* Return screen width of UTF-8 string
2351 * Accepts: string
2352 * Returns: width or negative if not valid UTF-8
2355 long utf8_strwidth (unsigned char *s)
2357 unsigned long c,i,ret;
2358 /* go through string */
2359 for (ret = 0; *s; ret += ucs4_width (c)) {
2360 /* It's alright to give a fake value for the byte count to utf8_get()
2361 * since the null of a null-terminated string will stop processing anyway.
2363 i = 6; /* fake value */
2364 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
2366 return ret;
2370 /* Return screen width of UTF-8 text
2371 * Accepts: SIZEDTEXT to string
2372 * Returns: width or negative if not valid UTF-8
2375 long utf8_textwidth (SIZEDTEXT *utf8)
2377 unsigned long c;
2378 unsigned char *s = utf8->data;
2379 unsigned long i = utf8->size;
2380 unsigned long ret = 0;
2381 while (i) { /* while there's a string to process */
2382 if ((c = utf8_get (&s,&i)) & U8G_ERROR) return -1;
2383 ret += ucs4_width (c);
2385 return ret;
/* Decomposition (phew!) */

#define MORESINGLE 1            /* single UCS-4 tail value */
#define MOREMULTIPLE 2          /* multiple UCS-2 tail values */

/* Pending remainder of one decomposition */

struct decomposemore {
  short type;                   /* type of more: MORESINGLE or MOREMULTIPLE */
  union {
    unsigned long single;       /* single decomposed value */
    struct {                    /* multiple BMP values */
      unsigned short *next;     /* next value to hand out */
      unsigned long count;      /* number of values still to hand out */
    } multiple;
  } data;
};

#define RECURSIVEMORE struct recursivemore

/* Linked list of pending remainders used by recursive decomposition */

RECURSIVEMORE {
  struct decomposemore *more;   /* remainder of this decomposition */
  RECURSIVEMORE *next;          /* next list entry */
};
2412 /* Return decomposition of a UCS-4 character
2413 * Accepts: character or U8G_ERROR to return next from "more"
2414 * pointer to returned more
2415 * Returns: [next] decomposed value, more set if still more decomposition
2418 unsigned long ucs4_decompose (unsigned long c,void **more)
2420 unsigned long i,ix,ret;
2421 struct decomposemore *m;
2422 if (c & U8G_ERROR) { /* want to chase more? */
2423 /* do sanity check */
2424 if (m = (struct decomposemore *) *more) switch (m->type) {
2425 case MORESINGLE: /* single value */
2426 ret = m->data.single;
2427 fs_give (more); /* no more decomposition */
2428 break;
2429 case MOREMULTIPLE: /* multiple value */
2430 ret = *m->data.multiple.next++;
2431 if (!--m->data.multiple.count) fs_give (more);
2432 break;
2433 default: /* uh-oh */
2434 fatal ("invalid more block argument to ucs4_decompose!");
2436 else fatal ("no more block provided to ucs4_decompose!");
2439 else { /* start decomposition */
2440 *more = NIL; /* initially set no more */
2441 /* BMP low decompositions */
2442 if (c < UCS4_BMPLOMIN) ret = c;
2443 /* fix this someday */
2444 else if (c == UCS4_BMPLOMIN) ret = ucs4_dbmplotab[0];
2445 else if (c <= UCS4_BMPLOMAX) {
2446 /* within range - have a decomposition? */
2447 if (i = ucs4_dbmploixtab[c - UCS4_BMPLOMIN]) {
2448 /* get first value of decomposition */
2449 ret = ucs4_dbmplotab[ix = i & UCS4_BMPLOIXMASK];
2450 /* has continuation? */
2451 if (i & UCS4_BMPLOSIZEMASK) {
2452 m = (struct decomposemore *)
2453 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2454 sizeof (struct decomposemore)));
2455 m->type = MOREMULTIPLE;
2456 m->data.multiple.next = &ucs4_dbmplotab[++ix];
2457 m->data.multiple.count = i >> UCS4_BMPLOSIZESHIFT;
2460 else ret = c; /* in range but doesn't decompose */
2462 /* BMP CJK compatibility */
2463 else if (c < UCS4_BMPCJKMIN) ret = c;
2464 else if (c <= UCS4_BMPCJKMAX) {
2465 if (!(ret = ucs4_bmpcjk1decomptab[c - UCS4_BMPCJKMIN])) ret = c;
2467 /* BMP CJK compatibility - some not in BMP */
2468 #if UCS4_BMPCJK2MIN - (UCS4_BMPCJKMAX + 1)
2469 else if (c < UCS4_BMPCJK2MIN) ret = c;
2470 #endif
2471 else if (c <= UCS4_BMPCJK2MAX)
2472 ret = ucs4_bmpcjk2decomptab[c - UCS4_BMPCJK2MIN];
2473 /* BMP high decompositions */
2474 else if (c < UCS4_BMPHIMIN) ret = c;
2475 else if (c <= UCS4_BMPHIMAX) {
2476 /* within range - have a decomposition? */
2477 if (i = ucs4_dbmphiixtab[c - UCS4_BMPHIMIN]) {
2478 /* get first value of decomposition */
2479 ret = ucs4_dbmphitab[ix = i & UCS4_BMPHIIXMASK];
2480 /* has continuation? */
2481 if (i & UCS4_BMPHISIZEMASK) {
2482 m = (struct decomposemore *)
2483 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2484 sizeof (struct decomposemore)));
2485 m->type = MOREMULTIPLE;
2486 m->data.multiple.next = &ucs4_dbmphitab[++ix];
2487 m->data.multiple.count = i >> UCS4_BMPHISIZESHIFT;
2490 else ret = c; /* in range but doesn't decompose */
2493 /* BMP half and full width forms */
2494 else if (c < UCS4_BMPHALFFULLMIN) ret = c;
2495 else if (c <= UCS4_BMPHALFFULLMAX) {
2496 if (!(ret = ucs4_bmphalffulldecomptab[c - UCS4_BMPHALFFULLMIN])) ret = c;
2498 /* SMP music */
2499 else if (c < UCS4_SMPMUSIC1MIN) ret = c;
2500 else if (c <= UCS4_SMPMUSIC1MAX) {
2501 ret = ucs4_smpmusic1decomptab[c -= UCS4_SMPMUSIC1MIN][0];
2502 m = (struct decomposemore *)
2503 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2504 sizeof (struct decomposemore)));
2505 m->type = MORESINGLE;
2506 m->data.single = ucs4_smpmusic1decomptab[c][1];
2508 else if (c < UCS4_SMPMUSIC2MIN) ret = c;
2509 else if (c <= UCS4_SMPMUSIC2MAX) {
2510 ret = ucs4_smpmusic2decomptab[c -= UCS4_SMPMUSIC2MIN][0];
2511 m = (struct decomposemore *)
2512 (*more = memset (fs_get (sizeof (struct decomposemore)),0,
2513 sizeof (struct decomposemore)));
2514 m->type = MORESINGLE;
2515 m->data.single = ucs4_smpmusic2decomptab[c][1];
2517 /* SMP mathematical forms */
2518 else if (c < UCS4_SMPMATHMIN) ret = c;
2519 else if (c <= UCS4_SMPMATHMAX) {
2520 if (!(ret = ucs4_smpmathdecomptab[c - UCS4_SMPMATHMIN])) ret = c;
2522 /* CJK compatibility ideographs in SIP */
2523 else if (!(ret = ((c >= UCS4_SIPMIN) && (c <= UCS4_SIPMAX)) ?
2524 ucs4_sipdecomptab[c - UCS4_SIPMIN] : c)) ret = c;
2526 return ret;
2529 /* Return recursive decomposition of a UCS-4 character
2530 * Accepts: character or U8G_ERROR to return next from "more"
2531 * pointer to returned more
2532 * Returns: [next] decomposed value, more set if still more decomposition
2535 unsigned long ucs4_decompose_recursive (unsigned long c,void **more)
2537 unsigned long c1;
2538 void *m,*mn;
2539 RECURSIVEMORE *mr;
2540 if (c & U8G_ERROR) { /* want to chase more? */
2541 mn = NIL;
2542 if (mr = (RECURSIVEMORE *) *more) switch (mr->more->type) {
2543 case MORESINGLE: /* decompose single value */
2544 c = ucs4_decompose_recursive (mr->more->data.single,&mn);
2545 *more = mr->next; /* done with this more, remove it */
2546 fs_give ((void **) &mr->more);
2547 fs_give ((void **) &mr);
2548 break;
2549 case MOREMULTIPLE: /* decompose current value in multiple */
2550 c = ucs4_decompose_recursive (*mr->more->data.multiple.next++,&mn);
2551 /* if done with this multiple decomposition */
2552 if (!--mr->more->data.multiple.count) {
2553 *more = mr->next; /* done with this more, remove it */
2554 fs_give ((void **) &mr->more);
2555 fs_give ((void **) &mr);
2557 break;
2558 default: /* uh-oh */
2559 fatal ("invalid more block argument to ucs4_decompose_recursive!");
2561 else fatal ("no more block provided to ucs4_decompose_recursive!");
2562 if (mr = mn) { /* did this value recurse on us? */
2563 mr->next = *more; /* yes, insert new more at head */
2564 *more = mr;
2567 else { /* start decomposition */
2568 *more = NIL; /* initially set no more */
2569 mr = NIL;
2570 do { /* repeatedly decompose this codepoint */
2571 c = ucs4_decompose (c1 = c,&m);
2572 if (m) { /* multi-byte decomposition */
2573 if (c1 == c) fatal ("endless multiple decomposition!");
2574 /* create a block to stash this more */
2575 mr = memset (fs_get (sizeof (RECURSIVEMORE)),0,sizeof (RECURSIVEMORE));
2576 mr->more = m; /* note the expansion */
2577 mr->next = *more; /* old list is the tail */
2578 *more = mr; /* and this is the new head */
2580 } while (c1 != c); /* until nothing more to decompose */
2582 return c;