1034: Initialize l in deflate_read to shut up GCC
[elinks.git] / src / intl / charsets.c
blob637faeba68ecb9a11676b0c31746a94952241a40
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #include "elinks.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
/* One (codepage byte, Unicode value) pair, as stored in the per-codepage
 * @table lists of struct codepage_desc. */
struct table_entry {
	/* The byte in the codepage. */
	unsigned char c;

	/* The Unicode value paired with @c.
	 *
	 * In principle this should be unicode_val_T, but every value
	 * currently in codepage.inc fits in 16 bits, so uint16_t is used
	 * to halve sizeof(struct table_entry) from 8 bytes to 4.  Should
	 * other characters ever be needed, "unicode_val_T u : 24" might
	 * be a possibility, although it seems a little unportable because
	 * bitfields are in principle restricted to int, which may be
	 * 16-bit. */
	uint16_t u;
};
/* Description of one codepage known to this file. */
struct codepage_desc {
	/* Name of the codepage. */
	unsigned char *name;

	/* Alias list of the codepage.  The address of this list is also
	 * what is_cp_ptr_utf8() compares in order to recognize UTF-8. */
	unsigned char *const *aliases;

	/* The Unicode mappings of codepage bytes 0x80...0xFF.
	 * (0x00...0x7F are assumed to be ASCII in all codepages.)
	 * Because all current values fit in 16 bits, they are stored as
	 * uint16_t rather than unicode_val_T.  If the codepage does
	 * not use some byte, then @highhalf maps that byte to 0xFFFF,
	 * which C code converts to UCS_REPLACEMENT_CHARACTER where
	 * appropriate.  (U+FFFF is reserved and will never be
	 * assigned as a character.) */
	const uint16_t *highhalf;

	/* If some byte in the codepage corresponds to multiple Unicode
	 * characters, then the preferred character is in @highhalf
	 * above, and the rest are listed here in @table.  This table
	 * is not used for translating from the codepage to Unicode. */
	const struct table_entry *table;
};
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
/* strings[b] is a NUL-terminated one-byte string for byte value @b, so
 * that single bytes can be handed out as constant strings without any
 * allocation.
 *
 * Note that the entries for 0x17 and 0x1F do not hold their own byte:
 * both contain "\033", the same as the entry for 0x1B. */
static const char strings[256][2] = {
	/* 0x00 - 0x1F */
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	/* 0x20 - 0x3F */
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	/* 0x40 - 0x5F */
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	/* 0x60 - 0x7F */
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	/* 0x80 - 0x9F */
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	/* 0xA0 - 0xBF */
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	/* 0xC0 - 0xDF */
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	/* 0xE0 - 0xFF */
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
108 static void
109 free_translation_table(struct conv_table *p)
111 int i;
113 for (i = 0; i < 256; i++)
114 if (p[i].t)
115 free_translation_table(p[i].u.tbl);
117 mem_free(p);
/* Placeholder string stored in conversion tables where no correct
 * conversion exists.  Code recognizes it by comparing addresses, so it
 * has to be a named array rather than a pointer to a string literal:
 * that way it can never share storage with some other literal that
 * happens to contain the same characters. */
static const unsigned char no_str[] = "*";
127 static void
128 new_translation_table(struct conv_table *p)
130 int i;
132 for (i = 0; i < 256; i++)
133 if (p[i].t)
134 free_translation_table(p[i].u.tbl);
135 for (i = 0; i < 128; i++) {
136 p[i].t = 0;
137 p[i].u.str = strings[i];
139 for (; i < 256; i++) {
140 p[i].t = 0;
141 p[i].u.str = no_str;
/* Binary search over @table, an array of @entries structures sorted in
 * ascending order of their member @entry.  Stores in @result the index
 * of the element whose @entry equals @key, or -1 if there is none.
 * The macro expands to a compound statement; its arguments may be
 * evaluated several times. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	(result) = -1;							\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((table)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		}							\
		if ((table)[_mid].entry > (key))			\
			_hi = _mid - 1;					\
		else							\
			_lo = _mid + 1;					\
	}								\
}
161 static const unicode_val_T strange_chars[32] = {
162 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
163 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
164 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
165 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
168 #define SYSTEM_CHARSET_FLAG 128
169 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
171 const unsigned char *
172 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
174 int j;
175 int s;
177 if (u < 128) return strings[u];
179 to &= ~SYSTEM_CHARSET_FLAG;
181 #ifdef CONFIG_UTF8
182 if (is_cp_ptr_utf8(&codepages[to]))
183 return encode_utf8(u);
184 #endif /* CONFIG_UTF8 */
186 /* To mark non breaking spaces in non-UTF-8 strings, we use a
187 * special char NBSP_CHAR. */
188 if (u == UCS_NO_BREAK_SPACE) {
189 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
190 else /* NBSP_MODE_ASCII */ return " ";
192 if (u == UCS_SOFT_HYPHEN) return "";
194 if (u < 0xa0) {
195 unicode_val_T strange = strange_chars[u - 0x80];
197 if (!strange) return NULL;
198 return u2cp_(strange, to, nbsp_mode);
201 if (u < 0xFFFF)
202 for (j = 0; j < 0x80; j++)
203 if (codepages[to].highhalf[j] == u)
204 return strings[0x80 + j];
205 for (j = 0; codepages[to].table[j].c; j++)
206 if (codepages[to].table[j].u == u)
207 return strings[codepages[to].table[j].c];
209 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
210 if (s != -1) return unicode_7b[s].s;
212 return no_str;
215 static unsigned char utf_buffer[7];
217 #ifdef CONFIG_UTF8
218 inline unsigned char *
219 encode_utf8(unicode_val_T u)
220 #else
221 static unsigned char *
222 encode_utf8(unicode_val_T u)
223 #endif /* CONFIG_UTF8 */
225 memset(utf_buffer, 0, 7);
227 if (u < 0x80)
228 utf_buffer[0] = u;
229 else if (u < 0x800)
230 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
231 utf_buffer[1] = 0x80 | (u & 0x3f);
232 else if (u < 0x10000)
233 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
234 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
235 utf_buffer[2] = 0x80 | (u & 0x3f);
236 else if (u < 0x200000)
237 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
238 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
239 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
240 utf_buffer[3] = 0x80 | (u & 0x3f);
241 else if (u < 0x4000000)
242 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
243 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
244 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
245 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
246 utf_buffer[4] = 0x80 | (u & 0x3f);
247 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
248 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
249 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
250 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
251 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
252 utf_buffer[5] = 0x80 | (u & 0x3f);
254 return utf_buffer;
257 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80...0xBF, 0xFE and 0xFF) have
 * length 1 here and are handled separately by the decoder. */
static const char utf8char_len_tab[256] = {
	/* 0x00 - 0x1F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x20 - 0x3F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x40 - 0x5F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x60 - 0x7F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80 - 0x9F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xA0 - 0xBF */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xC0 - 0xDF */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xE0 - 0xFF */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
271 inline int utf8charlen(const unsigned char *p)
273 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters in the NUL-terminated string *@str.
 * Counting stops at the first character that would extend past the
 * terminating NUL.  On return *@str points to where counting stopped
 * and the number of characters counted is returned. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int len = utf8charlen(pos);

		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
292 #define utf8_issingle(p) (((p) & 0x80) == 0)
293 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Move back @pos characters from @current, but never to the left of
 * @start, and return the resulting pointer.  Returns NULL if @current
 * or @start is NULL or if @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (!current || !start || pos < 0)
		return NULL;

	while (pos > 0) {
		unsigned char byte;

		if (current == start)
			break;

		current--;
		byte = *current;
		/* Only a lead (or single) byte completes one character. */
		if (utf8_islead(byte))
			pos--;
	}

	return current;
}
310 /* Count number of standard terminal cells needed for displaying UTF-8
311 * character. */
313 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
315 unicode_val_T u;
317 if (end == NULL)
318 end = strchr(utf8_char, '\0');
320 if(!utf8_char || !end)
321 return -1;
323 u = utf8_to_unicode(&utf8_char, end);
325 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 string @string.
 *
 * @end is the end of the string; if it is NULL, the terminating NUL of
 * @string is used instead.  A trailing character that would extend
 * past @end is not counted.
 *
 * Returns the cell count, or -1 if @string is NULL or a character
 * width could not be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int cells = 0;

	/* This must be checked before @string is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);
		int cell;

		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the number of UTF-8 characters in @string.
 *
 * @end is the end of the string; if it is NULL, the terminating NUL of
 * @string is used instead.  A trailing character that would extend
 * past @end is not counted.
 *
 * Returns the character count, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int chars = 0;

	/* This must be checked before @string is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);

		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that are
 * needed for displaying the specified number of cells.
 *
 * @end is the end of the string; if it is NULL, the terminating NUL of
 * @string is used instead.  The result never extends past @end.
 *
 * Returns the byte count, or -1 if @string is NULL or a character
 * width could not be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* This must be checked before @string is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Clamp a character that straddles @end. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
418 /* Take @max steps forward from @string in the specified @way, but
419 * not going past @end. Return the resulting address. Store the
420 * number of steps taken to *@count, unless @count is NULL.
422 * This assumes the text is valid UTF-8, and @string and @end point to
423 * character boundaries. If not, it doesn't crash but the results may
424 * be inconsistent.
426 * This function can do some of the same jobs as utf8charlen(),
427 * utf8_cells2bytes(), and strlen_utf8(). */
428 unsigned char *
429 utf8_step_forward(unsigned char *string, unsigned char *end,
430 int max, enum utf8_step way, int *count)
432 int steps = 0;
433 unsigned char *current = string;
435 assert(string);
436 assert(max >= 0);
437 if_assert_failed goto invalid_arg;
438 if (end == NULL)
439 end = strchr(string, '\0');
441 switch (way) {
442 case UTF8_STEP_CHARACTERS:
443 while (steps < max && current < end) {
444 ++current;
445 if (utf8_islead(*current))
446 ++steps;
448 break;
450 case UTF8_STEP_CELLS_FEWER:
451 case UTF8_STEP_CELLS_MORE:
452 while (steps < max) {
453 unicode_val_T u;
454 unsigned char *prev = current;
455 int width;
457 u = utf8_to_unicode(&current, end);
458 if (u == UCS_NO_CHAR) {
459 /* Assume the incomplete sequence
460 * costs one cell. */
461 current = end;
462 ++steps;
463 break;
466 width = unicode_to_cell(u);
467 if (way == UTF8_STEP_CELLS_FEWER
468 && steps + width > max) {
469 /* Back off. */
470 current = prev;
471 break;
473 steps += width;
475 break;
477 default:
478 INTERNAL("impossible enum utf8_step");
481 invalid_arg:
482 if (count)
483 *count = steps;
484 return current;
487 /* Take @max steps backward from @string in the specified @way, but
488 * not going past @start. Return the resulting address. Store the
489 * number of steps taken to *@count, unless @count is NULL.
491 * This assumes the text is valid UTF-8, and @string and @start point
492 * to character boundaries. If not, it doesn't crash but the results
493 * may be inconsistent.
495 * This function can do some of the same jobs as utf8_prevchar(). */
496 unsigned char *
497 utf8_step_backward(unsigned char *string, unsigned char *start,
498 int max, enum utf8_step way, int *count)
500 int steps = 0;
501 unsigned char *current = string;
503 assert(string);
504 assert(start);
505 assert(max >= 0);
506 if_assert_failed goto invalid_arg;
508 switch (way) {
509 case UTF8_STEP_CHARACTERS:
510 while (steps < max && current > start) {
511 --current;
512 if (utf8_islead(*current))
513 ++steps;
515 break;
517 case UTF8_STEP_CELLS_FEWER:
518 case UTF8_STEP_CELLS_MORE:
519 while (steps < max) {
520 unsigned char *prev = current;
521 unsigned char *look;
522 unicode_val_T u;
523 int width;
525 if (current <= start)
526 break;
527 do {
528 --current;
529 } while (current > start && !utf8_islead(*current));
531 look = current;
532 u = utf8_to_unicode(&look, prev);
533 if (u == UCS_NO_CHAR) {
534 /* Assume the incomplete sequence
535 * costs one cell. */
536 width = 1;
537 } else
538 width = unicode_to_cell(u);
540 if (way == UTF8_STEP_CELLS_FEWER
541 && steps + width > max) {
542 /* Back off. */
543 current = prev;
544 break;
546 steps += width;
548 break;
550 default:
551 INTERNAL("impossible enum utf8_step");
554 invalid_arg:
555 if (count)
556 *count = steps;
557 return current;
561 * Find out number of standard terminal collumns needed for displaying symbol
562 * (glyph) which represents Unicode character c.
564 * TODO: Use wcwidth when it is available. This seems to require:
565 * - Make the configure script check whether <wchar.h> and wcwidth exist.
566 * - Define _XOPEN_SOURCE and include <wchar.h>.
567 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
568 * matches ISO 10646 in all locales.)
569 * However, these do not suffice, because wcwidth depends on LC_CTYPE
570 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
571 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
572 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
573 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
574 * character is apparently not supported in all locales. Why is that?
575 * - Perhaps there is standardese that requires supported characters
576 * to be convertable to multibyte form. Then ELinks could just pick
577 * some UTF-8 locale for its wcwidth purposes.
578 * - Perhaps wcwidth can even return different nonnegative values for
579 * the same ISO 10646 character in different locales. Then ELinks
580 * would have to set LC_CTYPE to match at least the terminal's
581 * charset (which may differ from the LC_CTYPE environment variable,
582 * especially when the master process is serving a slave terminal).
583 * But there is no guarantee that the libc supports all the same
584 * charsets as ELinks does.
585 * For now, it seems safest to avoid the potentially locale-dependent
586 * libc version of wcwidth, and instead use a hardcoded mapping.
588 * @return 2 for double-width glyph, 1 for others.
589 * TODO: May be extended to return 0 for zero-width glyphs
590 * (like composing, maybe unprintable too).
592 inline int
593 unicode_to_cell(unicode_val_T c)
595 if (c >= 0x1100
596 && (c <= 0x115f /* Hangul Jamo */
597 || c == 0x2329
598 || c == 0x232a
599 || (c >= 0x2e80 && c <= 0xa4cf
600 && c != 0x303f) /* CJK ... Yi */
601 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
602 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
603 Ideographs */
604 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
605 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
606 || (c >= 0xffe0 && c <= 0xffe6)
607 || (c >= 0x20000 && c <= 0x2fffd)
608 || (c >= 0x30000 && c <= 0x3fffd)))
609 return 2;
611 return 1;
614 /* Fold the case of a Unicode character, so that hotkeys in labels can
615 * be compared case-insensitively. It is unspecified whether the
616 * result will be in upper or lower case. */
617 unicode_val_T
618 unicode_fold_label_case(unicode_val_T c)
620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
621 return towlower(c);
622 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
623 /* For now, this supports only ASCII. It would be possible to
624 * use code generated from CaseFolding.txt of Unicode if the
625 * acknowledgements required by http://www.unicode.org/copyright.html
626 * were added to associated documentation of ELinks. */
627 if (c >= 0x41 && c <= 0x5A)
628 return c + 0x20;
629 else
630 return c;
631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
634 inline unicode_val_T
635 utf8_to_unicode(unsigned char **string, const unsigned char *end)
637 unsigned char *str = *string;
638 unicode_val_T u;
639 int length;
641 length = utf8char_len_tab[str[0]];
643 if (str + length > end) {
644 return UCS_NO_CHAR;
647 switch (length) {
648 case 1: /* U+0000 to U+007F */
649 if (str[0] >= 0x80) {
650 invalid_utf8:
651 ++*string;
652 return UCS_REPLACEMENT_CHARACTER;
654 u = str[0];
655 break;
656 case 2: /* U+0080 to U+07FF */
657 if ((str[1] & 0xc0) != 0x80)
658 goto invalid_utf8;
659 u = (str[0] & 0x1f) << 6;
660 u += (str[1] & 0x3f);
661 if (u < 0x80)
662 goto invalid_utf8;
663 break;
664 case 3: /* U+0800 to U+FFFF, except surrogates */
665 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
666 goto invalid_utf8;
667 u = (str[0] & 0x0f) << 12;
668 u += ((str[1] & 0x3f) << 6);
669 u += (str[2] & 0x3f);
670 if (u < 0x800 || is_utf16_surrogate(u))
671 goto invalid_utf8;
672 break;
673 case 4: /* U+10000 to U+1FFFFF */
674 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
675 || (str[3] & 0xc0) != 0x80)
676 goto invalid_utf8;
677 u = (str[0] & 0x0f) << 18;
678 u += ((str[1] & 0x3f) << 12);
679 u += ((str[2] & 0x3f) << 6);
680 u += (str[3] & 0x3f);
681 if (u < 0x10000)
682 goto invalid_utf8;
683 break;
684 case 5: /* U+200000 to U+3FFFFFF */
685 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
686 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
687 goto invalid_utf8;
688 u = (str[0] & 0x0f) << 24;
689 u += ((str[1] & 0x3f) << 18);
690 u += ((str[2] & 0x3f) << 12);
691 u += ((str[3] & 0x3f) << 6);
692 u += (str[4] & 0x3f);
693 if (u < 0x200000)
694 goto invalid_utf8;
695 break;
696 case 6: /* U+4000000 to U+7FFFFFFF */
697 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
698 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
699 || (str[5] & 0xc0) != 0x80)
700 goto invalid_utf8;
701 u = (str[0] & 0x01) << 30;
702 u += ((str[1] & 0x3f) << 24);
703 u += ((str[2] & 0x3f) << 18);
704 u += ((str[3] & 0x3f) << 12);
705 u += ((str[4] & 0x3f) << 6);
706 u += (str[5] & 0x3f);
707 if (u < 0x4000000)
708 goto invalid_utf8;
709 break;
710 default:
711 INTERNAL("utf8char_len_tab out of range");
712 goto invalid_utf8;
714 *string = str + length;
715 return u;
717 #endif /* CONFIG_UTF8 */
719 /* The common part of cp2u and cp2utf_8. */
720 static unicode_val_T
721 cp2u_shared(const struct codepage_desc *from, unsigned char c)
723 unicode_val_T u = from->highhalf[c - 0x80];
725 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
726 return u;
729 /* Used for converting input from the terminal. */
730 unicode_val_T
731 cp2u(int from, unsigned char c)
733 from &= ~SYSTEM_CHARSET_FLAG;
735 /* UTF-8 is a multibyte codepage and cannot be handled with
736 * this function. */
737 assert(!is_cp_ptr_utf8(&codepages[from]));
738 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
740 if (c < 0x80) return c;
741 else return cp2u_shared(&codepages[from], c);
744 /* This slow and ugly code is used by the terminal utf_8_io */
745 const unsigned char *
746 cp2utf8(int from, int c)
748 from &= ~SYSTEM_CHARSET_FLAG;
750 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
751 return strings[c];
753 return encode_utf8(cp2u_shared(&codepages[from], c));
756 #ifdef CONFIG_UTF8
757 unicode_val_T
758 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
760 unicode_val_T ret;
762 if (is_cp_utf8(codepage))
763 return utf8_to_unicode(string, end);
765 if (*string >= end)
766 return UCS_NO_CHAR;
768 ret = cp2u(codepage, **string);
769 ++*string;
770 return ret;
772 #endif /* CONFIG_UTF8 */
775 static void
776 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
778 unsigned char *p = encode_utf8(u);
780 while (p[1]) {
781 if (ct[*p].t) ct = ct[*p].u.tbl;
782 else {
783 struct conv_table *nct;
785 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
786 if_assert_failed return;
788 nct = mem_calloc(256, sizeof(*nct));
789 if (!nct) return;
790 new_translation_table(nct);
791 ct[*p].t = 1;
792 ct[*p].u.tbl = nct;
793 ct = nct;
795 p++;
798 assertm(!ct[*p].t, "bad utf encoding #2");
799 if_assert_failed return;
801 if (ct[*p].u.str == no_str)
802 ct[*p].u.str = str;
805 /* A conversion table from some charset to UTF-8.
806 * If it is from UTF-8 to UTF-8, it converts each byte separately.
807 * Unlike in other translation tables, the strings in elements 0x80 to
808 * 0xFF are allocated dynamically. */
809 struct conv_table utf_table[256];
810 int utf_table_init = 1;
812 static void
813 free_utf_table(void)
815 int i;
817 /* Cast away const. */
818 for (i = 128; i < 256; i++)
819 mem_free((unsigned char *) utf_table[i].u.str);
822 static struct conv_table *
823 get_translation_table_to_utf8(int from)
825 int i;
826 static int lfr = -1;
828 if (from == -1) return NULL;
829 from &= ~SYSTEM_CHARSET_FLAG;
830 if (from == lfr) return utf_table;
831 lfr = from;
832 if (utf_table_init) {
833 memset(utf_table, 0, sizeof(utf_table));
834 utf_table_init = 0;
835 } else
836 free_utf_table();
838 for (i = 0; i < 128; i++)
839 utf_table[i].u.str = strings[i];
841 if (is_cp_ptr_utf8(&codepages[from])) {
842 for (i = 128; i < 256; i++)
843 utf_table[i].u.str = stracpy(strings[i]);
844 return utf_table;
847 for (i = 128; i < 256; i++) {
848 unicode_val_T u = codepages[from].highhalf[i - 0x80];
850 if (u == 0xFFFF)
851 utf_table[i].u.str = NULL;
852 else
853 utf_table[i].u.str = stracpy(encode_utf8(u));
856 for (i = 0; codepages[from].table[i].c; i++) {
857 unicode_val_T u = codepages[from].table[i].u;
859 if (!utf_table[codepages[from].table[i].c].u.str)
860 utf_table[codepages[from].table[i].c].u.str =
861 stracpy(encode_utf8(u));
864 for (i = 128; i < 256; i++)
865 if (!utf_table[i].u.str)
866 utf_table[i].u.str = stracpy(no_str);
868 return utf_table;
871 /* A conversion table between two charsets, where the target is not UTF-8. */
872 static struct conv_table table[256];
873 static int first = 1;
875 void
876 free_conv_table(void)
878 if (!utf_table_init) free_utf_table();
879 if (first) {
880 memset(table, 0, sizeof(table));
881 first = 0;
883 new_translation_table(table);
887 struct conv_table *
888 get_translation_table(int from, int to)
890 static int lfr = -1;
891 static int lto = -1;
893 from &= ~SYSTEM_CHARSET_FLAG;
894 to &= ~SYSTEM_CHARSET_FLAG;
895 if (first) {
896 memset(table, 0, sizeof(table));
897 first = 0;
899 if (/*from == to ||*/ from == -1 || to == -1)
900 return NULL;
901 if (is_cp_ptr_utf8(&codepages[to]))
902 return get_translation_table_to_utf8(from);
903 if (from == lfr && to == lto)
904 return table;
905 lfr = from;
906 lto = to;
907 new_translation_table(table);
909 if (is_cp_ptr_utf8(&codepages[from])) {
910 int i;
912 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
913 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
914 add_utf8(table, UCS_SOFT_HYPHEN, "");
916 for (i = 0x80; i <= 0xFF; i++)
917 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
918 add_utf8(table,
919 codepages[to].highhalf[i - 0x80],
920 strings[i]);
922 for (i = 0; codepages[to].table[i].c; i++)
923 add_utf8(table, codepages[to].table[i].u,
924 strings[codepages[to].table[i].c]);
926 for (i = 0; unicode_7b[i].x != -1; i++)
927 if (unicode_7b[i].x >= 0x80)
928 add_utf8(table, unicode_7b[i].x,
929 unicode_7b[i].s);
931 } else {
932 int i;
934 for (i = 128; i < 256; i++) {
935 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
936 const unsigned char *u;
938 u = u2cp(codepages[from].highhalf[i - 0x80], to);
939 if (u) table[i].u.str = u;
944 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first differing byte, depending on
 * which side is greater.  If all @l2 bytes are equal, returns 0 when
 * @s2 ends exactly there and -1 when @s2 is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 == *s2)
			continue;

		return (*s1 > *s2) ? 1 : -1;
	}

	if (*s2)
		return -1;

	return 0;
}
961 /* Entity cache debugging purpose. */
962 #if 0
963 #define DEBUG_ENTITY_CACHE
964 #else
965 #undef DEBUG_ENTITY_CACHE
966 #endif
/* One slot of the entity cache kept by get_entity_string(). */
struct entity_cache {
	/* How many times this slot has been used; slots are sorted by
	 * this value when a new entry is added. */
	unsigned int hits;

	/* Length of @str in bytes, without the terminating NUL. */
	int strlen;

	/* Codepage that @result was converted for. */
	int encoding;

	/* The cached conversion result; may be NULL. */
	const unsigned char *result;

	/* The entity text, NUL-terminated. */
	unsigned char str[20]; /* Suffice in any case. */
};
976 /* comparison function for qsort() */
977 static int
978 hits_cmp(const void *v1, const void *v2)
980 const struct entity_cache *a = v1, *b = v2;
982 if (a->hits == b->hits) return 0;
983 if (a->hits > b->hits) return -1;
984 else return 1;
987 static int
988 compare_entities(const void *key_, const void *element_)
990 struct string *key = (struct string *) key_;
991 struct entity *element = (struct entity *) element_;
992 int length = key->length;
993 unsigned char *first = key->source;
994 unsigned char *second = element->s;
996 return xxstrcmp(first, second, length);
999 const unsigned char *
1000 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1002 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1003 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1004 will go in [0] table */
1005 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1006 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1007 static int first_time = 1;
1008 unsigned int slen = 0;
1009 const unsigned char *result = NULL;
1011 if (strlen <= 0) return NULL;
1013 #ifdef CONFIG_UTF8
1014 /* TODO: caching UTF-8 */
1015 encoding &= ~SYSTEM_CHARSET_FLAG;
1016 if (is_cp_ptr_utf8(&codepages[encoding]))
1017 goto skip;
1018 #endif /* CONFIG_UTF8 */
1020 if (first_time) {
1021 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1022 first_time = 0;
1025 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1026 * + google + slashdot + websites that result from a search for test on google,
1027 * + various ones) show quite impressive improvment:
1028 * Top ten is:
1029 * 0: hits=2459 l=4 st='nbsp'
1030 * 1: hits=2152 l=6 st='eacute'
1031 * 2: hits=235 l=6 st='egrave'
1032 * 3: hits=136 l=6 st='agrave'
1033 * 4: hits=100 l=3 st='amp'
1034 * 5: hits=40 l=5 st='laquo'
1035 * 6: hits=8 l=4 st='copy'
1036 * 7: hits=5 l=2 st='gt'
1037 * 8: hits=2 l=2 st='lt'
1038 * 9: hits=1 l=6 st='middot'
1040 * Most of the time cache hit ratio is near 95%.
1042 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1043 * count is kept < 2 (worst case 1.58). Not so bad ;)
1045 * --Zas */
1047 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1048 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1050 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1051 int i;
1053 for (i = 0; i < nb_entity_cache[slen]; i++) {
1054 if (entity_cache[slen][i].encoding == encoding
1055 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1056 #ifdef DEBUG_ENTITY_CACHE
1057 static double total_iter = 0;
1058 static unsigned long hit_count = 0;
1060 total_iter += i + 1;
1061 hit_count++;
1062 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1063 #endif
1064 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1065 entity_cache[slen][i].hits++;
1066 return entity_cache[slen][i].result;
1069 #ifdef DEBUG_ENTITY_CACHE
1070 fprintf(stderr, "miss\n");
1071 #endif
1073 #ifdef CONFIG_UTF8
1074 skip:
1075 #endif /* CONFIG_UTF8 */
1076 if (*str == '#') { /* Numeric entity. */
1077 int l = (int) strlen;
1078 unsigned char *st = (unsigned char *) str;
1079 unicode_val_T n = 0;
1081 if (l == 1) goto end; /* &#; ? */
1082 st++, l--;
1083 if ((*st | 32) == 'x') { /* Hexadecimal */
1085 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1086 st++, l--;
1087 do {
1088 unsigned char c = (*(st++) | 32);
1090 if (isdigit(c))
1091 n = (n << 4) | (c - '0');
1092 else if (isxdigit(c))
1093 n = (n << 4) | (c - 'a' + 10);
1094 else
1095 goto end; /* Bad char. */
1096 } while (--l);
1097 } else { /* Decimal */
1098 if (l > 10) goto end; /* 4294967295 max. */
1099 do {
1100 unsigned char c = *(st++);
1102 if (isdigit(c))
1103 n = n * 10 + c - '0';
1104 else
1105 goto end; /* Bad char. */
1106 /* Limit to 0xFFFFFFFF. */
1107 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1108 goto end;
1109 } while (--l);
1112 result = u2cp(n, encoding);
1114 #ifdef DEBUG_ENTITY_CACHE
1115 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1116 #endif
1117 } else { /* Text entity. */
1118 struct string key = INIT_STRING((unsigned char *) str, strlen);
1119 struct entity *element = bsearch((void *) &key, entities,
1120 N_ENTITIES,
1121 sizeof(*element),
1122 compare_entities);
1124 if (element) result = u2cp(element->c, encoding);
1127 #ifdef CONFIG_UTF8
1128 if (is_cp_ptr_utf8(&codepages[encoding])) {
1129 return result;
1131 #endif /* CONFIG_UTF8 */
1132 end:
1133 /* Take care of potential buffer overflow. */
1134 if (strlen < sizeof(entity_cache[slen][0].str)) {
1135 struct entity_cache *ece;
1137 /* Sort entries by hit order. */
1138 if (nb_entity_cache[slen] > 1)
1139 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1140 sizeof(entity_cache[slen][0]), hits_cmp);
1142 /* Increment number of cache entries if possible.
1143 * Else, just replace the least used entry. */
1144 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1145 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1147 /* Copy new entry to cache. */
1148 ece->hits = 1;
1149 ece->strlen = strlen;
1150 ece->encoding = encoding;
1151 ece->result = result;
1152 memcpy(ece->str, str, strlen);
1153 ece->str[strlen] = '\0';
1156 #ifdef DEBUG_ENTITY_CACHE
1157 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1158 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1161 unsigned int i;
1163 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1164 for (i = 0; i < nb_entity_cache[slen] ; i++)
1165 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1166 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1167 entity_cache[slen][i].str);
1168 fprintf(stderr, "-----------------\n");
1170 #endif /* DEBUG_ENTITY_CACHE */
1172 return result;
1175 unsigned char *
1176 convert_string(struct conv_table *convert_table,
1177 unsigned char *chars, int charslen, int cp,
1178 enum convert_string_mode mode, int *length,
1179 void (*callback)(void *data, unsigned char *buf, int buflen),
1180 void *callback_data)
1182 unsigned char *buffer;
1183 int bufferpos = 0;
1184 int charspos = 0;
1186 if (!convert_table && !memchr(chars, '&', charslen)) {
1187 if (callback) {
1188 if (charslen) callback(callback_data, chars, charslen);
1189 return NULL;
1190 } else {
1191 return memacpy(chars, charslen);
1195 /* Buffer allocation */
1197 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1198 if (!buffer) return NULL;
1200 /* Iterate ;-) */
1202 while (charspos < charslen) {
1203 const unsigned char *translit;
1205 #define PUTC do { \
1206 buffer[bufferpos++] = chars[charspos++]; \
1207 translit = ""; \
1208 goto flush; \
1209 } while (0)
1211 if (chars[charspos] != '&') {
1212 struct conv_table *t;
1213 int i;
1215 if (chars[charspos] < 128 || !convert_table) PUTC;
1217 t = convert_table;
1218 i = charspos;
1220 while (t[chars[i]].t) {
1221 t = t[chars[i++]].u.tbl;
1222 if (i >= charslen) PUTC;
1225 translit = t[chars[i]].u.str;
1226 charspos = i + 1;
1228 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1229 PUTC;
1231 } else {
1232 int start = charspos + 1;
1233 int i = start;
1235 while (i < charslen
1236 && (isasciialpha(chars[i])
1237 || isdigit(chars[i])
1238 || (chars[i] == '#')))
1239 i++;
1241 /* This prevents bug 213: we were expanding "entities"
1242 * in URL query strings. */
1243 /* XXX: But this disables &nbsp&nbsp usage, which
1244 * appears to be relatively common! --pasky */
1245 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1246 && i > start
1247 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1248 translit = get_entity_string(&chars[start], i - start,
1249 cp);
1250 if (chars[i] != ';') {
1251 /* Eat &nbsp &nbsp<foo> happily, but
1252 * pull back from the character after
1253 * entity string if it is not the valid
1254 * terminator. */
1255 i--;
1258 if (!translit) PUTC;
1259 charspos = i + (i < charslen);
1260 } else PUTC;
1263 if (!translit[0]) continue;
1265 if (!translit[1]) {
1266 buffer[bufferpos++] = translit[0];
1267 translit = "";
1268 goto flush;
1271 while (*translit) {
1272 unsigned char *new;
1274 buffer[bufferpos++] = *(translit++);
1275 flush:
1276 if (bufferpos & (ALLOC_GR - 1)) continue;
1278 if (callback) {
1279 buffer[bufferpos] = 0;
1280 callback(callback_data, buffer, bufferpos);
1281 bufferpos = 0;
1282 } else {
1283 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1284 if (!new) {
1285 mem_free(buffer);
1286 return NULL;
1288 buffer = new;
1291 #undef PUTC
1294 /* Say bye */
1296 buffer[bufferpos] = 0;
1297 if (length) *length = bufferpos;
1299 if (callback) {
1300 if (bufferpos) callback(callback_data, buffer, bufferpos);
1301 mem_free(buffer);
1302 return NULL;
1303 } else {
1304 return buffer;
1309 #ifndef USE_FASTFIND
1311 get_cp_index(const unsigned char *name)
1313 int i, a;
1314 int syscp = 0;
1316 if (!strcasecmp(name, "System")) {
1317 #if HAVE_LANGINFO_CODESET
1318 name = nl_langinfo(CODESET);
1319 syscp = SYSTEM_CHARSET_FLAG;
1320 #else
1321 name = "us-ascii";
1322 #endif
1325 for (i = 0; codepages[i].name; i++) {
1326 for (a = 0; codepages[i].aliases[a]; a++) {
1327 /* In the past, we looked for the longest substring
1328 * in all the names; it is way too expensive, though:
1330 * % cumulative self self total
1331 * time seconds seconds calls us/call us/call name
1332 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1334 * Anything called from redraw_screen() is in fact
1335 * relatively expensive, even if it's called just
1336 * once. So we will do a simple strcasecmp() here.
1339 if (!strcasecmp(name, codepages[i].aliases[a]))
1340 return i | syscp;
1344 if (syscp) {
1345 return get_cp_index("us-ascii") | syscp;
1346 } else {
1347 return -1;
1351 #else
/* Cursor of the charsets_list_reset()/charsets_list_next() iteration. */
static unsigned int i_name = 0;		/* current entry in codepages[] */
static unsigned int i_alias = 0;	/* current alias of that entry */
1356 /* Reset internal list pointer */
1357 void
1358 charsets_list_reset(void)
1360 i_name = 0;
1361 i_alias = 0;
1364 /* Returns a pointer to a struct that contains current key and data pointers
1365 * and increment internal pointer. It returns NULL when key is NULL. */
1366 struct fastfind_key_value *
1367 charsets_list_next(void)
1369 static struct fastfind_key_value kv;
1371 if (!codepages[i_name].name) return NULL;
1373 kv.key = codepages[i_name].aliases[i_alias];
1374 kv.data = (void *) &codepages[i_name]; /* cast away const */
1376 if (codepages[i_name].aliases[i_alias + 1])
1377 i_alias++;
1378 else {
1379 i_name++;
1380 i_alias = 0;
1383 return &kv;
1386 static struct fastfind_index ff_charsets_index
1387 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1389 /* It searchs for a charset named @name or one of its aliases and
1390 * returns index for it or -1 if not found. */
1392 get_cp_index(const unsigned char *name)
1394 const struct codepage_desc *codepage;
1395 int syscp = 0;
1397 if (!strcasecmp(name, "System")) {
1398 #if HAVE_LANGINFO_CODESET
1399 name = nl_langinfo(CODESET);
1400 syscp = SYSTEM_CHARSET_FLAG;
1401 #else
1402 name = "us-ascii";
1403 #endif
1406 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1407 if (codepage) {
1408 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1409 return (codepage - codepages) | syscp;
1411 } else if (syscp) {
1412 return get_cp_index("us-ascii") | syscp;
1414 } else {
1415 return -1;
1419 #endif /* USE_FASTFIND */
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is compiled without USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the fastfind index set up by init_charsets_lookup().  Does
 * nothing when ELinks is compiled without USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1437 /* Get the codepage's name for displaying to the user, or NULL if
1438 * @cp_index is one past the end. In the future, we might want to
1439 * localize these with gettext. So it may be best not to use this
1440 * function if the name will have to be converted back to an
1441 * index. */
1442 unsigned char *
1443 get_cp_name(int cp_index)
1445 if (cp_index < 0) return "none";
1446 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1448 return codepages[cp_index].name;
1451 /* Get the codepage's name for saving to a configuration file. These
1452 * names can be converted back to indexes, even in future versions of
1453 * ELinks. */
1454 unsigned char *
1455 get_cp_config_name(int cp_index)
1457 if (cp_index < 0) return "none";
1458 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1459 if (!codepages[cp_index].aliases) return NULL;
1461 return codepages[cp_index].aliases[0];
1464 /* Get the codepage's name for sending to a library or server that
1465 * understands MIME charset names. This function irreversibly maps
1466 * the "System" codepage to the underlying charset. */
1467 unsigned char *
1468 get_cp_mime_name(int cp_index)
1470 if (cp_index < 0) return "none";
1471 cp_index &= ~SYSTEM_CHARSET_FLAG;
1472 if (!codepages[cp_index].aliases) return NULL;
1474 return codepages[cp_index].aliases[0];
1478 is_cp_utf8(int cp_index)
1480 cp_index &= ~SYSTEM_CHARSET_FLAG;
1481 return is_cp_ptr_utf8(&codepages[cp_index]);