big dialogs: set_curosr2 -> set_dlg_cursor.
[elinks.git] / src / intl / charsets.c
blobde853b982bbf69cd787495795936c7842a4ae3c6
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #include "elinks.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/hash.h"
29 #include "util/memory.h"
30 #include "util/string.h"
33 /* Fix namespace clash on MacOS. */
34 #define table table_elinks
/* One (codepage byte, Unicode value) pair. */
struct table_entry {
	/* The byte value in the codepage. */
	unsigned char c;

	/* The Unicode value paired with @c.  Conceptually this is a
	 * unicode_val_T, but every value currently in codepage.inc fits
	 * in 16 bits, so uint16_t is used and sizeof(struct table_entry)
	 * drops from 8 bytes to 4.  Should wider characters ever be
	 * needed, "unicode_val_T u : 24" is a possibility, although a
	 * slightly unportable one: bitfields are in principle restricted
	 * to int, which may be only 16 bits wide. */
	uint16_t u;
};
/* Description of a single codepage. */
struct codepage_desc {
	unsigned char *name;
	unsigned char *const *aliases;

	/* Unicode values of the codepage bytes 0x80...0xFF, indexed by
	 * byte minus 0x80.  (Bytes 0x00...0x7F are assumed to be ASCII
	 * in every codepage.)  All current values fit in 16 bits, so
	 * they are stored as uint16_t rather than unicode_val_T.  A byte
	 * that the codepage does not use maps to 0xFFFF, which the C
	 * code turns into UCS_REPLACEMENT_CHARACTER where appropriate.
	 * (U+FFFF is reserved and will never be assigned as a
	 * character.) */
	const uint16_t *highhalf;

	/* When one codepage byte corresponds to several Unicode
	 * characters, the preferred one lives in @highhalf and the
	 * remaining ones are listed here.  This table is not consulted
	 * when translating from the codepage to Unicode. */
	const struct table_entry *table;
};
69 #include "intl/codepage.inc"
70 #include "intl/uni_7b.inc"
71 #include "intl/entity.inc"
/* One-character, NUL-terminated strings indexed by byte value.
 *
 * NOTE(review): the entries for 0x17 and 0x1F contain "\033" instead
 * of "\027" and "\037".  They are kept exactly as found; confirm
 * whether this is intentional before changing them. */
static const char strings[256][2] = {
	/* 0x00 */ "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	/* 0x08 */ "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	/* 0x10 */ "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	/* 0x18 */ "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	/* 0x20 */ "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	/* 0x28 */ "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	/* 0x30 */ "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	/* 0x38 */ "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	/* 0x40 */ "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	/* 0x48 */ "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	/* 0x50 */ "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	/* 0x58 */ "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	/* 0x60 */ "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	/* 0x68 */ "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	/* 0x70 */ "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	/* 0x78 */ "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	/* 0x80 */ "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	/* 0x88 */ "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	/* 0x90 */ "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	/* 0x98 */ "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	/* 0xa0 */ "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	/* 0xa8 */ "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	/* 0xb0 */ "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	/* 0xb8 */ "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	/* 0xc0 */ "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	/* 0xc8 */ "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	/* 0xd0 */ "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	/* 0xd8 */ "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	/* 0xe0 */ "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	/* 0xe8 */ "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	/* 0xf0 */ "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	/* 0xf8 */ "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
109 static void
110 free_translation_table(struct conv_table *p)
112 int i;
114 for (i = 0; i < 256; i++)
115 if (p[i].t)
116 free_translation_table(p[i].u.tbl);
118 mem_free(p);
/* The placeholder stored in conversion tables where no correct
 * conversion exists.  Code recognizes it by comparing addresses, so it
 * must be a named array and not a pointer to a string literal: a
 * literal could end up sharing storage with some other literal that
 * happens to contain the same characters. */
static const unsigned char no_str[] = { '*', '\0' };
128 static void
129 new_translation_table(struct conv_table *p)
131 int i;
133 for (i = 0; i < 256; i++)
134 if (p[i].t)
135 free_translation_table(p[i].u.tbl);
136 for (i = 0; i < 128; i++) {
137 p[i].t = 0;
138 p[i].u.str = strings[i];
140 for (; i < 256; i++) {
141 p[i].t = 0;
142 p[i].u.str = no_str;
/* Binary search in the array @table, whose @entries elements are
 * sorted in ascending order of their member @entry.  Stores in
 * @result the index of an element whose @entry equals @key, or -1 if
 * there is none. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	(result) = -1;							\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((table)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		} else if ((table)[_mid].entry > (key)) {		\
			_hi = _mid - 1;					\
		} else {						\
			_lo = _mid + 1;					\
		}							\
	}								\
}
162 static const unicode_val_T strange_chars[32] = {
163 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
164 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
165 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
166 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
169 #define SYSTEM_CHARSET_FLAG 128
170 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
172 const unsigned char *
173 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
175 int j;
176 int s;
178 if (u < 128) return strings[u];
180 to &= ~SYSTEM_CHARSET_FLAG;
182 #ifdef CONFIG_UTF8
183 if (is_cp_ptr_utf8(&codepages[to]))
184 return encode_utf8(u);
185 #endif /* CONFIG_UTF8 */
187 /* To mark non breaking spaces in non-UTF-8 strings, we use a
188 * special char NBSP_CHAR. */
189 if (u == UCS_NO_BREAK_SPACE) {
190 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
191 else /* NBSP_MODE_ASCII */ return " ";
193 if (u == UCS_SOFT_HYPHEN) return "";
195 if (u < 0xa0) {
196 unicode_val_T strange = strange_chars[u - 0x80];
198 if (!strange) return NULL;
199 return u2cp_(strange, to, nbsp_mode);
202 if (u < 0xFFFF)
203 for (j = 0; j < 0x80; j++)
204 if (codepages[to].highhalf[j] == u)
205 return strings[0x80 + j];
206 for (j = 0; codepages[to].table[j].c; j++)
207 if (codepages[to].table[j].u == u)
208 return strings[codepages[to].table[j].c];
210 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
211 if (s != -1) return unicode_7b[s].s;
213 return no_str;
216 static unsigned char utf_buffer[7];
218 #ifdef CONFIG_UTF8
219 inline unsigned char *
220 encode_utf8(unicode_val_T u)
221 #else
222 static unsigned char *
223 encode_utf8(unicode_val_T u)
224 #endif /* CONFIG_UTF8 */
226 memset(utf_buffer, 0, 7);
228 if (u < 0x80)
229 utf_buffer[0] = u;
230 else if (u < 0x800)
231 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
232 utf_buffer[1] = 0x80 | (u & 0x3f);
233 else if (u < 0x10000)
234 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
235 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
236 utf_buffer[2] = 0x80 | (u & 0x3f);
237 else if (u < 0x200000)
238 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
239 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
240 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
241 utf_buffer[3] = 0x80 | (u & 0x3f);
242 else if (u < 0x4000000)
243 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
244 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
245 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
246 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
247 utf_buffer[4] = 0x80 | (u & 0x3f);
248 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
249 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
250 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
251 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
252 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
253 utf_buffer[5] = 0x80 | (u & 0x3f);
255 return utf_buffer;
258 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80...0xBF, 0xFE and 0xFF)
 * have length 1 here and are dealt with separately by the decoder. */
static const char utf8char_len_tab[256] = {
	/* 0x00 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x10 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x20 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x30 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x40 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x50 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x60 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x70 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x80 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x90 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xa0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xb0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xc0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xd0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xe0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xf0 */ 4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,1,1,
};
272 inline int utf8charlen(const unsigned char *p)
274 return p ? utf8char_len_tab[*p] : 0;
/* Count the complete UTF-8 characters in the NUL-terminated string
 * *@str.  On return *@str points just past the last character that
 * was counted. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int chars = 0;

	while (1) {
		int len = utf8charlen(pos);

		/* Stop at the terminator, or at a character whose
		 * declared length would run past it. */
		if (pos + len > end) break;

		pos += len;
		chars++;
	}

	*str = pos;
	return chars;
}
293 #define utf8_issingle(p) (((p) & 0x80) == 0)
294 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Move backward from @current over @pos characters, never going past
 * @start, and return the resulting pointer.  Returns NULL if @current
 * or @start is NULL or @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (!current || !start || pos < 0)
		return NULL;

	while (pos > 0) {
		if (current == start) break;

		--current;
		/* Only the first byte of a character counts. */
		if (utf8_islead(*current))
			--pos;
	}

	return current;
}
311 /* Count number of standard terminal cells needed for displaying UTF-8
312 * character. */
314 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
316 unicode_val_T u;
318 if (end == NULL)
319 end = strchr(utf8_char, '\0');
321 if(!utf8_char || !end)
322 return -1;
324 u = utf8_to_unicode(&utf8_char, end);
326 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 text from @string up to @end.
 *
 * If @end is NULL, the text is taken to be NUL-terminated.  A trailing
 * character whose declared length runs past @end is not counted.
 * Returns -1 if @string is NULL or a character width cannot be
 * determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* This has to be checked before strchr() is handed the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	} while (1);

	return cells;
}
/* Count the number of UTF-8 characters from @string up to @end.
 *
 * If @end is NULL, the text is taken to be NUL-terminated.  A trailing
 * character whose declared length runs past @end is not counted.
 * Returns -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* This has to be checked before strchr() is handed the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	} while (1);

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that are
 * needed for displaying at most @max_cells terminal cells.
 *
 * If @end is NULL, the text is taken to be NUL-terminated.  The result
 * never exceeds the distance from @string to @end.  Returns -1 if
 * @string is NULL or a character width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* This has to be checked before strchr() is handed the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		int cell = utf8_char2cells(&string[bytes], end);
		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Clamp when the last character claims more bytes than
		 * the text has left. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	} while(1);

	return bytes;
}
419 /* Take @max steps forward from @string in the specified @way, but
420 * not going past @end. Return the resulting address. Store the
421 * number of steps taken to *@count, unless @count is NULL.
423 * This assumes the text is valid UTF-8, and @string and @end point to
424 * character boundaries. If not, it doesn't crash but the results may
425 * be inconsistent.
427 * This function can do some of the same jobs as utf8charlen(),
428 * utf8_cells2bytes(), and strlen_utf8(). */
429 unsigned char *
430 utf8_step_forward(unsigned char *string, unsigned char *end,
431 int max, enum utf8_step way, int *count)
433 int steps = 0;
434 unsigned char *current = string;
436 assert(string);
437 assert(max >= 0);
438 if_assert_failed goto invalid_arg;
439 if (end == NULL)
440 end = strchr(string, '\0');
442 switch (way) {
443 case UTF8_STEP_CHARACTERS:
444 while (steps < max && current < end) {
445 ++current;
446 if (utf8_islead(*current))
447 ++steps;
449 break;
451 case UTF8_STEP_CELLS_FEWER:
452 case UTF8_STEP_CELLS_MORE:
453 while (steps < max) {
454 unicode_val_T u;
455 unsigned char *prev = current;
456 int width;
458 u = utf8_to_unicode(&current, end);
459 if (u == UCS_NO_CHAR) {
460 /* Assume the incomplete sequence
461 * costs one cell. */
462 current = end;
463 ++steps;
464 break;
467 width = unicode_to_cell(u);
468 if (way == UTF8_STEP_CELLS_FEWER
469 && steps + width > max) {
470 /* Back off. */
471 current = prev;
472 break;
474 steps += width;
476 break;
478 default:
479 INTERNAL("impossible enum utf8_step");
482 invalid_arg:
483 if (count)
484 *count = steps;
485 return current;
488 /* Take @max steps backward from @string in the specified @way, but
489 * not going past @start. Return the resulting address. Store the
490 * number of steps taken to *@count, unless @count is NULL.
492 * This assumes the text is valid UTF-8, and @string and @start point
493 * to character boundaries. If not, it doesn't crash but the results
494 * may be inconsistent.
496 * This function can do some of the same jobs as utf8_prevchar(). */
497 unsigned char *
498 utf8_step_backward(unsigned char *string, unsigned char *start,
499 int max, enum utf8_step way, int *count)
501 int steps = 0;
502 unsigned char *current = string;
504 assert(string);
505 assert(start);
506 assert(max >= 0);
507 if_assert_failed goto invalid_arg;
509 switch (way) {
510 case UTF8_STEP_CHARACTERS:
511 while (steps < max && current > start) {
512 --current;
513 if (utf8_islead(*current))
514 ++steps;
516 break;
518 case UTF8_STEP_CELLS_FEWER:
519 case UTF8_STEP_CELLS_MORE:
520 while (steps < max) {
521 unsigned char *prev = current;
522 unsigned char *look;
523 unicode_val_T u;
524 int width;
526 if (current <= start)
527 break;
528 do {
529 --current;
530 } while (current > start && !utf8_islead(*current));
532 look = current;
533 u = utf8_to_unicode(&look, prev);
534 if (u == UCS_NO_CHAR) {
535 /* Assume the incomplete sequence
536 * costs one cell. */
537 width = 1;
538 } else
539 width = unicode_to_cell(u);
541 if (way == UTF8_STEP_CELLS_FEWER
542 && steps + width > max) {
543 /* Back off. */
544 current = prev;
545 break;
547 steps += width;
549 break;
551 default:
552 INTERNAL("impossible enum utf8_step");
555 invalid_arg:
556 if (count)
557 *count = steps;
558 return current;
562 * Find out number of standard terminal collumns needed for displaying symbol
563 * (glyph) which represents Unicode character c.
565 * TODO: Use wcwidth when it is available. This seems to require:
566 * - Make the configure script check whether <wchar.h> and wcwidth exist.
567 * - Define _XOPEN_SOURCE and include <wchar.h>.
568 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
569 * matches ISO 10646 in all locales.)
570 * However, these do not suffice, because wcwidth depends on LC_CTYPE
571 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
572 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
573 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
574 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
575 * character is apparently not supported in all locales. Why is that?
576 * - Perhaps there is standardese that requires supported characters
577 * to be convertable to multibyte form. Then ELinks could just pick
578 * some UTF-8 locale for its wcwidth purposes.
579 * - Perhaps wcwidth can even return different nonnegative values for
580 * the same ISO 10646 character in different locales. Then ELinks
581 * would have to set LC_CTYPE to match at least the terminal's
582 * charset (which may differ from the LC_CTYPE environment variable,
583 * especially when the master process is serving a slave terminal).
584 * But there is no guarantee that the libc supports all the same
585 * charsets as ELinks does.
586 * For now, it seems safest to avoid the potentially locale-dependent
587 * libc version of wcwidth, and instead use a hardcoded mapping.
589 * @return 2 for double-width glyph, 1 for others.
590 * TODO: May be extended to return 0 for zero-width glyphs
591 * (like composing, maybe unprintable too).
593 inline int
594 unicode_to_cell(unicode_val_T c)
596 if (c >= 0x1100
597 && (c <= 0x115f /* Hangul Jamo */
598 || c == 0x2329
599 || c == 0x232a
600 || (c >= 0x2e80 && c <= 0xa4cf
601 && c != 0x303f) /* CJK ... Yi */
602 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
603 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
604 Ideographs */
605 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
606 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
607 || (c >= 0xffe0 && c <= 0xffe6)
608 || (c >= 0x20000 && c <= 0x2fffd)
609 || (c >= 0x30000 && c <= 0x3fffd)))
610 return 2;
612 return 1;
615 /* Fold the case of a Unicode character, so that hotkeys in labels can
616 * be compared case-insensitively. It is unspecified whether the
617 * result will be in upper or lower case. */
618 unicode_val_T
619 unicode_fold_label_case(unicode_val_T c)
621 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
622 return towlower(c);
623 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
624 /* For now, this supports only ASCII. It would be possible to
625 * use code generated from CaseFolding.txt of Unicode if the
626 * acknowledgements required by http://www.unicode.org/copyright.html
627 * were added to associated documentation of ELinks. */
628 if (c >= 0x41 && c <= 0x5A)
629 return c + 0x20;
630 else
631 return c;
632 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
635 inline unicode_val_T
636 utf8_to_unicode(unsigned char **string, const unsigned char *end)
638 unsigned char *str = *string;
639 unicode_val_T u;
640 int length;
642 length = utf8char_len_tab[str[0]];
644 if (str + length > end) {
645 return UCS_NO_CHAR;
648 switch (length) {
649 case 1: /* U+0000 to U+007F */
650 if (str[0] >= 0x80) {
651 invalid_utf8:
652 ++*string;
653 return UCS_REPLACEMENT_CHARACTER;
655 u = str[0];
656 break;
657 case 2: /* U+0080 to U+07FF */
658 if ((str[1] & 0xc0) != 0x80)
659 goto invalid_utf8;
660 u = (str[0] & 0x1f) << 6;
661 u += (str[1] & 0x3f);
662 if (u < 0x80)
663 goto invalid_utf8;
664 break;
665 case 3: /* U+0800 to U+FFFF, except surrogates */
666 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
667 goto invalid_utf8;
668 u = (str[0] & 0x0f) << 12;
669 u += ((str[1] & 0x3f) << 6);
670 u += (str[2] & 0x3f);
671 if (u < 0x800 || is_utf16_surrogate(u))
672 goto invalid_utf8;
673 break;
674 case 4: /* U+10000 to U+1FFFFF */
675 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
676 || (str[3] & 0xc0) != 0x80)
677 goto invalid_utf8;
678 u = (str[0] & 0x0f) << 18;
679 u += ((str[1] & 0x3f) << 12);
680 u += ((str[2] & 0x3f) << 6);
681 u += (str[3] & 0x3f);
682 if (u < 0x10000)
683 goto invalid_utf8;
684 break;
685 case 5: /* U+200000 to U+3FFFFFF */
686 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
687 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
688 goto invalid_utf8;
689 u = (str[0] & 0x0f) << 24;
690 u += ((str[1] & 0x3f) << 18);
691 u += ((str[2] & 0x3f) << 12);
692 u += ((str[3] & 0x3f) << 6);
693 u += (str[4] & 0x3f);
694 if (u < 0x200000)
695 goto invalid_utf8;
696 break;
697 case 6: /* U+4000000 to U+7FFFFFFF */
698 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
699 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
700 || (str[5] & 0xc0) != 0x80)
701 goto invalid_utf8;
702 u = (str[0] & 0x01) << 30;
703 u += ((str[1] & 0x3f) << 24);
704 u += ((str[2] & 0x3f) << 18);
705 u += ((str[3] & 0x3f) << 12);
706 u += ((str[4] & 0x3f) << 6);
707 u += (str[5] & 0x3f);
708 if (u < 0x4000000)
709 goto invalid_utf8;
710 break;
711 default:
712 INTERNAL("utf8char_len_tab out of range");
713 goto invalid_utf8;
715 *string = str + length;
716 return u;
718 #endif /* CONFIG_UTF8 */
720 /* The common part of cp2u and cp2utf_8. */
721 static unicode_val_T
722 cp2u_shared(const struct codepage_desc *from, unsigned char c)
724 unicode_val_T u = from->highhalf[c - 0x80];
726 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
727 return u;
730 /* Used for converting input from the terminal. */
731 unicode_val_T
732 cp2u(int from, unsigned char c)
734 from &= ~SYSTEM_CHARSET_FLAG;
736 /* UTF-8 is a multibyte codepage and cannot be handled with
737 * this function. */
738 assert(!is_cp_ptr_utf8(&codepages[from]));
739 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
741 if (c < 0x80) return c;
742 else return cp2u_shared(&codepages[from], c);
745 /* This slow and ugly code is used by the terminal utf_8_io */
746 const unsigned char *
747 cp2utf8(int from, int c)
749 from &= ~SYSTEM_CHARSET_FLAG;
751 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
752 return strings[c];
754 return encode_utf8(cp2u_shared(&codepages[from], c));
757 #ifdef CONFIG_UTF8
758 unicode_val_T
759 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
761 unicode_val_T ret;
763 if (is_cp_utf8(codepage))
764 return utf8_to_unicode(string, end);
766 if (*string >= end)
767 return UCS_NO_CHAR;
769 ret = cp2u(codepage, **string);
770 ++*string;
771 return ret;
773 #endif /* CONFIG_UTF8 */
776 #ifdef CONFIG_COMBINE
777 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
778 unicode_val_T **combined;
779 struct hash *combined_hash;
781 unicode_val_T
782 get_combined(unicode_val_T *data, int length)
784 struct hash_item *item;
785 unicode_val_T *key;
786 int i, indeks;
788 assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
789 if_assert_failed return UCS_NO_CHAR;
791 if (!combined_hash) combined_hash = init_hash8();
792 if (!combined_hash) return UCS_NO_CHAR;
793 item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
795 if (item) return (unicode_val_T)(long)item->value;
796 if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
798 key = mem_alloc((length + 1) * sizeof(*key));
799 if (!key) return UCS_NO_CHAR;
800 for (i = 0; i < length; i++)
801 key[i] = data[i];
802 key[i] = UCS_END_COMBINED;
804 last_combined++;
805 indeks = last_combined - UCS_BEGIN_COMBINED;
807 combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
808 if (!combined) {
809 mem_free(key);
810 last_combined--;
811 return UCS_NO_CHAR;
813 combined[indeks] = key;
814 item = add_hash_item(combined_hash, (unsigned char *)key,
815 length * sizeof(*data), (void *)(long)(last_combined));
816 if (!item) {
817 last_combined--;
818 mem_free(key);
819 return UCS_NO_CHAR;
821 return last_combined;
824 void
825 free_combined()
827 int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
829 if (combined_hash)
830 free_hash(&combined_hash);
831 for (i = 0; i < end; i++)
832 mem_free(combined[i]);
833 mem_free_if(combined);
835 #endif /* CONFIG_COMBINE */
838 static void
839 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
841 unsigned char *p = encode_utf8(u);
843 while (p[1]) {
844 if (ct[*p].t) ct = ct[*p].u.tbl;
845 else {
846 struct conv_table *nct;
848 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
849 if_assert_failed return;
851 nct = mem_calloc(256, sizeof(*nct));
852 if (!nct) return;
853 new_translation_table(nct);
854 ct[*p].t = 1;
855 ct[*p].u.tbl = nct;
856 ct = nct;
858 p++;
861 assertm(!ct[*p].t, "bad utf encoding #2");
862 if_assert_failed return;
864 if (ct[*p].u.str == no_str)
865 ct[*p].u.str = str;
868 /* A conversion table from some charset to UTF-8.
869 * If it is from UTF-8 to UTF-8, it converts each byte separately.
870 * Unlike in other translation tables, the strings in elements 0x80 to
871 * 0xFF are allocated dynamically. */
872 struct conv_table utf_table[256];
873 int utf_table_init = 1;
875 static void
876 free_utf_table(void)
878 int i;
880 /* Cast away const. */
881 for (i = 128; i < 256; i++)
882 mem_free((unsigned char *) utf_table[i].u.str);
885 static struct conv_table *
886 get_translation_table_to_utf8(int from)
888 int i;
889 static int lfr = -1;
891 if (from == -1) return NULL;
892 from &= ~SYSTEM_CHARSET_FLAG;
893 if (from == lfr) return utf_table;
894 lfr = from;
895 if (utf_table_init) {
896 memset(utf_table, 0, sizeof(utf_table));
897 utf_table_init = 0;
898 } else
899 free_utf_table();
901 for (i = 0; i < 128; i++)
902 utf_table[i].u.str = strings[i];
904 if (is_cp_ptr_utf8(&codepages[from])) {
905 for (i = 128; i < 256; i++)
906 utf_table[i].u.str = stracpy(strings[i]);
907 return utf_table;
910 for (i = 128; i < 256; i++) {
911 unicode_val_T u = codepages[from].highhalf[i - 0x80];
913 if (u == 0xFFFF)
914 utf_table[i].u.str = NULL;
915 else
916 utf_table[i].u.str = stracpy(encode_utf8(u));
919 for (i = 0; codepages[from].table[i].c; i++) {
920 unicode_val_T u = codepages[from].table[i].u;
922 if (!utf_table[codepages[from].table[i].c].u.str)
923 utf_table[codepages[from].table[i].c].u.str =
924 stracpy(encode_utf8(u));
927 for (i = 128; i < 256; i++)
928 if (!utf_table[i].u.str)
929 utf_table[i].u.str = stracpy(no_str);
931 return utf_table;
934 /* A conversion table between two charsets, where the target is not UTF-8. */
935 static struct conv_table table[256];
936 static int first = 1;
938 void
939 free_conv_table(void)
941 if (!utf_table_init) free_utf_table();
942 if (first) {
943 memset(table, 0, sizeof(table));
944 first = 0;
946 new_translation_table(table);
950 struct conv_table *
951 get_translation_table(int from, int to)
953 static int lfr = -1;
954 static int lto = -1;
956 from &= ~SYSTEM_CHARSET_FLAG;
957 to &= ~SYSTEM_CHARSET_FLAG;
958 if (first) {
959 memset(table, 0, sizeof(table));
960 first = 0;
962 if (/*from == to ||*/ from == -1 || to == -1)
963 return NULL;
964 if (is_cp_ptr_utf8(&codepages[to]))
965 return get_translation_table_to_utf8(from);
966 if (from == lfr && to == lto)
967 return table;
968 lfr = from;
969 lto = to;
970 new_translation_table(table);
972 if (is_cp_ptr_utf8(&codepages[from])) {
973 int i;
975 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
976 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
977 add_utf8(table, UCS_SOFT_HYPHEN, "");
979 for (i = 0x80; i <= 0xFF; i++)
980 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
981 add_utf8(table,
982 codepages[to].highhalf[i - 0x80],
983 strings[i]);
985 for (i = 0; codepages[to].table[i].c; i++)
986 add_utf8(table, codepages[to].table[i].u,
987 strings[codepages[to].table[i].c]);
989 for (i = 0; unicode_7b[i].x != -1; i++)
990 if (unicode_7b[i].x >= 0x80)
991 add_utf8(table, unicode_7b[i].x,
992 unicode_7b[i].s);
994 } else {
995 int i;
997 for (i = 128; i < 256; i++) {
998 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
999 const unsigned char *u;
1001 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1002 if (u) table[i].u.str = u;
1007 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first byte that differs, depending on
 * whether the byte of @s1 is the greater or the smaller one.  If all
 * @l2 bytes match, returns 0 when @s2 ends there too, and -1 when @s2
 * is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2 != 0; s1++, s2++, l2--) {
		if (*s1 == *s2) continue;
		return (*s1 > *s2) ? 1 : -1;
	}

	return (*s2 != '\0') ? -1 : 0;
}
1024 /* Entity cache debugging purpose. */
1025 #if 0
1026 #define DEBUG_ENTITY_CACHE
1027 #else
1028 #undef DEBUG_ENTITY_CACHE
1029 #endif
/* One cached result of get_entity_string(). */
struct entity_cache {
	/* How many times this entry has been used; new entries start
	 * at 1. */
	unsigned int hits;

	/* The length of the entity name in @str. */
	int strlen;

	/* The codepage the result was computed for. */
	int encoding;

	/* The conversion result; NULL if there was none. */
	const unsigned char *result;

	/* The NUL-terminated entity name.  Twenty bytes suffice in any
	 * case. */
	unsigned char str[20];
};
1039 /* comparison function for qsort() */
1040 static int
1041 hits_cmp(const void *v1, const void *v2)
1043 const struct entity_cache *a = v1, *b = v2;
1045 if (a->hits == b->hits) return 0;
1046 if (a->hits > b->hits) return -1;
1047 else return 1;
1050 static int
1051 compare_entities(const void *key_, const void *element_)
1053 struct string *key = (struct string *) key_;
1054 struct entity *element = (struct entity *) element_;
1055 int length = key->length;
1056 unsigned char *first = key->source;
1057 unsigned char *second = element->s;
1059 return xxstrcmp(first, second, length);
1062 const unsigned char *
1063 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1065 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1066 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1067 will go in [0] table */
1068 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1069 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1070 static int first_time = 1;
1071 unsigned int slen = 0;
1072 const unsigned char *result = NULL;
1074 if (strlen <= 0) return NULL;
1076 #ifdef CONFIG_UTF8
1077 /* TODO: caching UTF-8 */
1078 encoding &= ~SYSTEM_CHARSET_FLAG;
1079 if (is_cp_ptr_utf8(&codepages[encoding]))
1080 goto skip;
1081 #endif /* CONFIG_UTF8 */
1083 if (first_time) {
1084 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1085 first_time = 0;
1088 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1089 * + google + slashdot + websites that result from a search for test on google,
1090 * + various ones) show quite impressive improvment:
1091 * Top ten is:
1092 * 0: hits=2459 l=4 st='nbsp'
1093 * 1: hits=2152 l=6 st='eacute'
1094 * 2: hits=235 l=6 st='egrave'
1095 * 3: hits=136 l=6 st='agrave'
1096 * 4: hits=100 l=3 st='amp'
1097 * 5: hits=40 l=5 st='laquo'
1098 * 6: hits=8 l=4 st='copy'
1099 * 7: hits=5 l=2 st='gt'
1100 * 8: hits=2 l=2 st='lt'
1101 * 9: hits=1 l=6 st='middot'
1103 * Most of the time cache hit ratio is near 95%.
1105 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1106 * count is kept < 2 (worst case 1.58). Not so bad ;)
1108 * --Zas */
1110 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1111 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1113 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1114 int i;
1116 for (i = 0; i < nb_entity_cache[slen]; i++) {
1117 if (entity_cache[slen][i].encoding == encoding
1118 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1119 #ifdef DEBUG_ENTITY_CACHE
1120 static double total_iter = 0;
1121 static unsigned long hit_count = 0;
1123 total_iter += i + 1;
1124 hit_count++;
1125 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1126 #endif
1127 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1128 entity_cache[slen][i].hits++;
1129 return entity_cache[slen][i].result;
1132 #ifdef DEBUG_ENTITY_CACHE
1133 fprintf(stderr, "miss\n");
1134 #endif
1136 #ifdef CONFIG_UTF8
1137 skip:
1138 #endif /* CONFIG_UTF8 */
1139 if (*str == '#') { /* Numeric entity. */
1140 int l = (int) strlen;
1141 unsigned char *st = (unsigned char *) str;
1142 unicode_val_T n = 0;
1144 if (l == 1) goto end; /* &#; ? */
1145 st++, l--;
1146 if ((*st | 32) == 'x') { /* Hexadecimal */
1148 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1149 st++, l--;
1150 do {
1151 unsigned char c = (*(st++) | 32);
1153 if (isdigit(c))
1154 n = (n << 4) | (c - '0');
1155 else if (isxdigit(c))
1156 n = (n << 4) | (c - 'a' + 10);
1157 else
1158 goto end; /* Bad char. */
1159 } while (--l);
1160 } else { /* Decimal */
1161 if (l > 10) goto end; /* 4294967295 max. */
1162 do {
1163 unsigned char c = *(st++);
1165 if (isdigit(c))
1166 n = n * 10 + c - '0';
1167 else
1168 goto end; /* Bad char. */
1169 /* Limit to 0xFFFFFFFF. */
1170 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1171 goto end;
1172 } while (--l);
1175 result = u2cp(n, encoding);
1177 #ifdef DEBUG_ENTITY_CACHE
1178 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1179 #endif
1180 } else { /* Text entity. */
1181 struct string key = INIT_STRING((unsigned char *) str, strlen);
1182 struct entity *element = bsearch((void *) &key, entities,
1183 N_ENTITIES,
1184 sizeof(*element),
1185 compare_entities);
1187 if (element) result = u2cp(element->c, encoding);
1190 #ifdef CONFIG_UTF8
1191 if (is_cp_ptr_utf8(&codepages[encoding])) {
1192 return result;
1194 #endif /* CONFIG_UTF8 */
1195 end:
1196 /* Take care of potential buffer overflow. */
1197 if (strlen < sizeof(entity_cache[slen][0].str)) {
1198 struct entity_cache *ece;
1200 /* Sort entries by hit order. */
1201 if (nb_entity_cache[slen] > 1)
1202 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1203 sizeof(entity_cache[slen][0]), hits_cmp);
1205 /* Increment number of cache entries if possible.
1206 * Else, just replace the least used entry. */
1207 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1208 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1210 /* Copy new entry to cache. */
1211 ece->hits = 1;
1212 ece->strlen = strlen;
1213 ece->encoding = encoding;
1214 ece->result = result;
1215 memcpy(ece->str, str, strlen);
1216 ece->str[strlen] = '\0';
1219 #ifdef DEBUG_ENTITY_CACHE
1220 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1221 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1224 unsigned int i;
1226 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1227 for (i = 0; i < nb_entity_cache[slen] ; i++)
1228 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1229 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1230 entity_cache[slen][i].str);
1231 fprintf(stderr, "-----------------\n");
1233 #endif /* DEBUG_ENTITY_CACHE */
1235 return result;
1238 unsigned char *
1239 convert_string(struct conv_table *convert_table,
1240 unsigned char *chars, int charslen, int cp,
1241 enum convert_string_mode mode, int *length,
1242 void (*callback)(void *data, unsigned char *buf, int buflen),
1243 void *callback_data)
1245 unsigned char *buffer;
1246 int bufferpos = 0;
1247 int charspos = 0;
1249 if (!convert_table && !memchr(chars, '&', charslen)) {
1250 if (callback) {
1251 if (charslen) callback(callback_data, chars, charslen);
1252 return NULL;
1253 } else {
1254 return memacpy(chars, charslen);
1258 /* Buffer allocation */
1260 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1261 if (!buffer) return NULL;
1263 /* Iterate ;-) */
1265 while (charspos < charslen) {
1266 const unsigned char *translit;
1268 #define PUTC do { \
1269 buffer[bufferpos++] = chars[charspos++]; \
1270 translit = ""; \
1271 goto flush; \
1272 } while (0)
1274 if (chars[charspos] != '&') {
1275 struct conv_table *t;
1276 int i;
1278 if (chars[charspos] < 128 || !convert_table) PUTC;
1280 t = convert_table;
1281 i = charspos;
1283 while (t[chars[i]].t) {
1284 t = t[chars[i++]].u.tbl;
1285 if (i >= charslen) PUTC;
1288 translit = t[chars[i]].u.str;
1289 charspos = i + 1;
1291 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1292 PUTC;
1294 } else {
1295 int start = charspos + 1;
1296 int i = start;
1298 while (i < charslen
1299 && (isasciialpha(chars[i])
1300 || isdigit(chars[i])
1301 || (chars[i] == '#')))
1302 i++;
1304 /* This prevents bug 213: we were expanding "entities"
1305 * in URL query strings. */
1306 /* XXX: But this disables &nbsp&nbsp usage, which
1307 * appears to be relatively common! --pasky */
1308 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1309 && i > start
1310 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1311 translit = get_entity_string(&chars[start], i - start,
1312 cp);
1313 if (chars[i] != ';') {
1314 /* Eat &nbsp &nbsp<foo> happily, but
1315 * pull back from the character after
1316 * entity string if it is not the valid
1317 * terminator. */
1318 i--;
1321 if (!translit) PUTC;
1322 charspos = i + (i < charslen);
1323 } else PUTC;
1326 if (!translit[0]) continue;
1328 if (!translit[1]) {
1329 buffer[bufferpos++] = translit[0];
1330 translit = "";
1331 goto flush;
1334 while (*translit) {
1335 unsigned char *new;
1337 buffer[bufferpos++] = *(translit++);
1338 flush:
1339 if (bufferpos & (ALLOC_GR - 1)) continue;
1341 if (callback) {
1342 buffer[bufferpos] = 0;
1343 callback(callback_data, buffer, bufferpos);
1344 bufferpos = 0;
1345 } else {
1346 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1347 if (!new) {
1348 mem_free(buffer);
1349 return NULL;
1351 buffer = new;
1354 #undef PUTC
1357 /* Say bye */
1359 buffer[bufferpos] = 0;
1360 if (length) *length = bufferpos;
1362 if (callback) {
1363 if (bufferpos) callback(callback_data, buffer, bufferpos);
1364 mem_free(buffer);
1365 return NULL;
1366 } else {
1367 return buffer;
1372 #ifndef USE_FASTFIND
1374 get_cp_index(const unsigned char *name)
1376 int i, a;
1377 int syscp = 0;
1379 if (!strcasecmp(name, "System")) {
1380 #if HAVE_LANGINFO_CODESET
1381 name = nl_langinfo(CODESET);
1382 syscp = SYSTEM_CHARSET_FLAG;
1383 #else
1384 name = "us-ascii";
1385 #endif
1388 for (i = 0; codepages[i].name; i++) {
1389 for (a = 0; codepages[i].aliases[a]; a++) {
1390 /* In the past, we looked for the longest substring
1391 * in all the names; it is way too expensive, though:
1393 * % cumulative self self total
1394 * time seconds seconds calls us/call us/call name
1395 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1397 * Anything called from redraw_screen() is in fact
1398 * relatively expensive, even if it's called just
1399 * once. So we will do a simple strcasecmp() here.
1402 if (!strcasecmp(name, codepages[i].aliases[a]))
1403 return i | syscp;
1407 if (syscp) {
1408 return get_cp_index("us-ascii") | syscp;
1409 } else {
1410 return -1;
1414 #else
/* Enumeration cursor used by charsets_list_next(): @i_name selects the
 * entry of codepages[], @i_alias the alias within that entry. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration cursor to the first alias of the first
 * codepage. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
1427 /* Returns a pointer to a struct that contains current key and data pointers
1428 * and increment internal pointer. It returns NULL when key is NULL. */
1429 struct fastfind_key_value *
1430 charsets_list_next(void)
1432 static struct fastfind_key_value kv;
1434 if (!codepages[i_name].name) return NULL;
1436 kv.key = codepages[i_name].aliases[i_alias];
1437 kv.data = (void *) &codepages[i_name]; /* cast away const */
1439 if (codepages[i_name].aliases[i_alias + 1])
1440 i_alias++;
1441 else {
1442 i_name++;
1443 i_alias = 0;
1446 return &kv;
1449 static struct fastfind_index ff_charsets_index
1450 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1452 /* It searchs for a charset named @name or one of its aliases and
1453 * returns index for it or -1 if not found. */
1455 get_cp_index(const unsigned char *name)
1457 const struct codepage_desc *codepage;
1458 int syscp = 0;
1460 if (!strcasecmp(name, "System")) {
1461 #if HAVE_LANGINFO_CODESET
1462 name = nl_langinfo(CODESET);
1463 syscp = SYSTEM_CHARSET_FLAG;
1464 #else
1465 name = "us-ascii";
1466 #endif
1469 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1470 if (codepage) {
1471 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1472 return (codepage - codepages) | syscp;
1474 } else if (syscp) {
1475 return get_cp_index("us-ascii") | syscp;
1477 } else {
1478 return -1;
1482 #endif /* USE_FASTFIND */
/* Build the fastfind index of charset aliases (compressed, as requested
 * by FF_COMPRESS).  Does nothing when ELinks is built without
 * USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Release the fastfind index of charset aliases.  Does nothing when
 * ELinks is built without USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif
}
1500 /* Get the codepage's name for displaying to the user, or NULL if
1501 * @cp_index is one past the end. In the future, we might want to
1502 * localize these with gettext. So it may be best not to use this
1503 * function if the name will have to be converted back to an
1504 * index. */
1505 unsigned char *
1506 get_cp_name(int cp_index)
1508 if (cp_index < 0) return "none";
1509 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1511 return codepages[cp_index].name;
1514 /* Get the codepage's name for saving to a configuration file. These
1515 * names can be converted back to indexes, even in future versions of
1516 * ELinks. */
1517 unsigned char *
1518 get_cp_config_name(int cp_index)
1520 if (cp_index < 0) return "none";
1521 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1522 if (!codepages[cp_index].aliases) return NULL;
1524 return codepages[cp_index].aliases[0];
1527 /* Get the codepage's name for sending to a library or server that
1528 * understands MIME charset names. This function irreversibly maps
1529 * the "System" codepage to the underlying charset. */
1530 unsigned char *
1531 get_cp_mime_name(int cp_index)
1533 if (cp_index < 0) return "none";
1534 cp_index &= ~SYSTEM_CHARSET_FLAG;
1535 if (!codepages[cp_index].aliases) return NULL;
1537 return codepages[cp_index].aliases[0];
1541 is_cp_utf8(int cp_index)
1543 cp_index &= ~SYSTEM_CHARSET_FLAG;
1544 return is_cp_ptr_utf8(&codepages[cp_index]);