u2cp_: Make the no_nbsp_hack parameter an enum.
[elinks.git] / src / intl / charsets.c
blob047beb1f5380c009f30cee9fabe7dbe6b85febf5
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #include "elinks.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
/* One (codepage byte, Unicode value) pairing. */
struct table_entry {
	/* The byte within the codepage. */
	unsigned char c;

	/* The Unicode value paired with @c.  In principle this should be
	 * unicode_val_T, but every value currently in codepage.inc fits in
	 * 16 bits, so uint16_t is used and sizeof(struct table_entry) drops
	 * from 8 bytes to 4.  If wider characters are ever needed,
	 * "unicode_val_T u : 24" might be an option, though it looks a
	 * little unportable: bitfields are in principle restricted to int,
	 * which may be only 16 bits wide. */
	uint16_t u;
};
/* Description of a single codepage. */
struct codepage_desc {
	unsigned char *name;
	unsigned char *const *aliases;

	/* Unicode values for codepage bytes 0x80...0xFF; bytes
	 * 0x00...0x7F are assumed to be ASCII in every codepage.
	 * All current values fit in 16 bits, hence uint16_t instead of
	 * unicode_val_T.  A byte the codepage does not use is mapped to
	 * 0xFFFF here, and the C code turns that into
	 * UCS_REPLACEMENT_CHARACTER where appropriate.  (U+FFFF is
	 * reserved and will never be assigned as a character.) */
	const uint16_t *highhalf;

	/* When one codepage byte corresponds to several Unicode
	 * characters, the preferred one is in @highhalf above and the
	 * others are listed here in @table.  This table is not used
	 * when translating from the codepage to Unicode. */
	const struct table_entry *table;
};
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
/* One NUL-terminated single-byte string per byte value; strings[b] is
 * handed out wherever a byte has to be returned as a string.
 * NOTE(review): entries 027 and 037 contain "\033" rather than their own
 * value.  This is reproduced unchanged; confirm whether it is intended. */
static char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
108 static void
109 free_translation_table(struct conv_table *p)
111 int i;
113 for (i = 0; i < 256; i++)
114 if (p[i].t)
115 free_translation_table(p[i].u.tbl);
117 mem_free(p);
120 static unsigned char *no_str = "*";
122 static void
123 new_translation_table(struct conv_table *p)
125 int i;
127 for (i = 0; i < 256; i++)
128 if (p[i].t)
129 free_translation_table(p[i].u.tbl);
130 for (i = 0; i < 128; i++) {
131 p[i].t = 0;
132 p[i].u.str = strings[i];
134 for (; i < 256; i++) {
135 p[i].t = 0;
136 p[i].u.str = no_str;
/* Binary search over @table[0 .. @entries - 1], which must be sorted in
 * ascending order of member @entry.  On success @result receives the
 * index of the element whose @entry equals @key; otherwise @result is
 * set to -1.  Expands to a braced block. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long bs_lo_ = 0, bs_hi_ = (entries) - 1;			\
									\
	for (;;) {							\
		long bs_mid_;						\
									\
		if (bs_lo_ > bs_hi_) {					\
			(result) = -1;					\
			break;						\
		}							\
									\
		bs_mid_ = (bs_lo_ + bs_hi_) / 2;			\
									\
		if ((table)[bs_mid_].entry == (key)) {			\
			(result) = bs_mid_;				\
			break;						\
		}							\
		if ((table)[bs_mid_].entry > (key))			\
			bs_hi_ = bs_mid_ - 1;				\
		else if ((table)[bs_mid_].entry < (key))		\
			bs_lo_ = bs_mid_ + 1;				\
	}								\
}
156 static const unicode_val_T strange_chars[32] = {
157 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
158 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
159 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
160 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Flag bit that may be present in a codepage index; the functions in
 * this file mask it off before indexing codepages[].  get_cp_index()
 * uses it for the "System" charset. */
#define SYSTEM_CHARSET_FLAG 128

/* True if @cp_ptr points to the codepage whose alias list is
 * aliases_utf8, i.e. the UTF-8 codepage (recognised by pointer
 * identity of the alias array). */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
166 unsigned char *
167 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
169 int j;
170 int s;
172 if (u < 128) return strings[u];
174 to &= ~SYSTEM_CHARSET_FLAG;
176 #ifdef CONFIG_UTF8
177 if (is_cp_ptr_utf8(&codepages[to]))
178 return encode_utf8(u);
179 #endif /* CONFIG_UTF8 */
181 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
182 if (u == 0xa0) {
183 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
184 else /* NBSP_MODE_ASCII */ return " ";
186 if (u == 0xad) return "";
188 if (u < 0xa0) {
189 unicode_val_T strange = strange_chars[u - 0x80];
191 if (!strange) return NULL;
192 return u2cp_(strange, to, nbsp_mode);
195 if (u < 0xFFFF)
196 for (j = 0; j < 0x80; j++)
197 if (codepages[to].highhalf[j] == u)
198 return strings[0x80 + j];
199 for (j = 0; codepages[to].table[j].c; j++)
200 if (codepages[to].table[j].u == u)
201 return strings[codepages[to].table[j].c];
203 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
204 if (s != -1) return unicode_7b[s].s;
206 return no_str;
209 static unsigned char utf_buffer[7];
211 #ifdef CONFIG_UTF8
212 inline unsigned char *
213 encode_utf8(unicode_val_T u)
214 #else
215 static unsigned char *
216 encode_utf8(unicode_val_T u)
217 #endif /* CONFIG_UTF8 */
219 memset(utf_buffer, 0, 7);
221 if (u < 0x80)
222 utf_buffer[0] = u;
223 else if (u < 0x800)
224 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
225 utf_buffer[1] = 0x80 | (u & 0x3f);
226 else if (u < 0x10000)
227 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
228 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
229 utf_buffer[2] = 0x80 | (u & 0x3f);
230 else if (u < 0x200000)
231 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
232 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
233 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
234 utf_buffer[3] = 0x80 | (u & 0x3f);
235 else if (u < 0x4000000)
236 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
237 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
238 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
239 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
240 utf_buffer[4] = 0x80 | (u & 0x3f);
241 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
242 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
243 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
244 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
245 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
246 utf_buffer[5] = 0x80 | (u & 0x3f);
248 return utf_buffer;
251 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character get length 1 and have to be
 * handled separately by the callers. */
static char utf8char_len_tab[256] = {
	/* 0x00...0x7F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80...0xBF */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xC0...0xDF */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xE0...0xEF */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
	/* 0xF0...0xFF */
	4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
265 inline int utf8charlen(const unsigned char *p)
267 return p ? utf8char_len_tab[*p] : 0;
/* Count the characters of the NUL-terminated UTF-8 string *@str and
 * advance *@str past the last character that fits completely before
 * the terminating NUL.  Returns the number of characters counted. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *const nul = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int step = utf8charlen(pos);

		/* Stop at the NUL or at a truncated final sequence. */
		if (pos + step > nul) break;

		pos += step;
		count++;
	}

	*str = pos;
	return count;
}
/* True if byte @p is a complete single-byte (ASCII) character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)

/* True if byte @p can start a character, i.e. it is either ASCII or a
 * multibyte lead byte (top bits 11); false for continuation bytes
 * (top bits 10).  Written as a single test so that @p is evaluated
 * exactly once, which makes arguments with side effects safe. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
289 /* Start from @current and move back to @pos char. This pointer return. The
290 * most left pointer is @start. */
291 inline unsigned char *
292 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
294 if (current == NULL || start == NULL || pos < 0)
295 return NULL;
296 while (pos > 0 && current != start) {
297 current--;
298 if (utf8_islead(*current))
299 pos--;
301 return current;
304 /* Count number of standard terminal cells needed for displaying UTF-8
305 * character. */
307 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
309 unicode_val_T u;
311 if (end == NULL)
312 end = strchr(utf8_char, '\0');
314 if(!utf8_char || !end)
315 return -1;
317 u = utf8_to_unicode(&utf8_char, end);
319 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying the
 * UTF-8 text from @string up to @end.  If @end is NULL, the string is
 * taken to be NUL-terminated.  A character that does not fit completely
 * before @end is not counted.
 *
 * Returns the cell count, or -1 if @string is NULL or a character
 * could not be measured. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Validate before strchr() gets to dereference the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	} while (1);

	return cells;
}
/* Count the number of characters in the UTF-8 text from @string up to
 * @end.  If @end is NULL, the string is taken to be NUL-terminated.
 * A character that does not fit completely before @end is not counted.
 *
 * Returns the character count, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Validate before strchr() gets to dereference the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	} while (1);

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string needed for
 * displaying at most @max_cells terminal cells.  @end limits the input;
 * if it is NULL, the string is taken to be NUL-terminated.
 *
 * Returns the byte count (never more than @end - @string), or -1 if
 * @string is NULL or a character could not be measured.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Validate before strchr() gets to dereference the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		if (string + bytes > end) {
			/* The last character was cut short by @end. */
			bytes = end - string;
			break;
		}
	} while(1);

	return bytes;
}
412 /* Take @max steps forward from @string in the specified @way, but
413 * not going past @end. Return the resulting address. Store the
414 * number of steps taken to *@count, unless @count is NULL.
416 * This assumes the text is valid UTF-8, and @string and @end point to
417 * character boundaries. If not, it doesn't crash but the results may
418 * be inconsistent.
420 * This function can do some of the same jobs as utf8charlen(),
421 * utf8_cells2bytes(), and strlen_utf8(). */
422 unsigned char *
423 utf8_step_forward(unsigned char *string, unsigned char *end,
424 int max, enum utf8_step way, int *count)
426 int steps = 0;
427 unsigned char *current = string;
429 assert(string);
430 assert(max >= 0);
431 if_assert_failed goto invalid_arg;
432 if (end == NULL)
433 end = strchr(string, '\0');
435 switch (way) {
436 case utf8_step_characters:
437 while (steps < max && current < end) {
438 ++current;
439 if (utf8_islead(*current))
440 ++steps;
442 break;
444 case utf8_step_cells_fewer:
445 case utf8_step_cells_more:
446 while (steps < max) {
447 unicode_val_T u;
448 unsigned char *prev = current;
449 int width;
451 u = utf8_to_unicode(&current, end);
452 if (u == UCS_NO_CHAR) {
453 /* Assume the incomplete sequence
454 * costs one cell. */
455 current = end;
456 ++steps;
457 break;
460 width = unicode_to_cell(u);
461 if (way == utf8_step_cells_fewer
462 && steps + width > max) {
463 /* Back off. */
464 current = prev;
465 break;
467 steps += width;
469 break;
471 default:
472 INTERNAL("impossible enum utf8_step");
475 invalid_arg:
476 if (count)
477 *count = steps;
478 return current;
481 /* Take @max steps backward from @string in the specified @way, but
482 * not going past @start. Return the resulting address. Store the
483 * number of steps taken to *@count, unless @count is NULL.
485 * This assumes the text is valid UTF-8, and @string and @start point
486 * to character boundaries. If not, it doesn't crash but the results
487 * may be inconsistent.
489 * This function can do some of the same jobs as utf8_prevchar(). */
490 unsigned char *
491 utf8_step_backward(unsigned char *string, unsigned char *start,
492 int max, enum utf8_step way, int *count)
494 int steps = 0;
495 unsigned char *current = string;
497 assert(string);
498 assert(start);
499 assert(max >= 0);
500 if_assert_failed goto invalid_arg;
502 switch (way) {
503 case utf8_step_characters:
504 while (steps < max && current > start) {
505 --current;
506 if (utf8_islead(*current))
507 ++steps;
509 break;
511 case utf8_step_cells_fewer:
512 case utf8_step_cells_more:
513 while (steps < max) {
514 unsigned char *prev = current;
515 unsigned char *look;
516 unicode_val_T u;
517 int width;
519 if (current <= start)
520 break;
521 do {
522 --current;
523 } while (current > start && !utf8_islead(*current));
525 look = current;
526 u = utf8_to_unicode(&look, prev);
527 if (u == UCS_NO_CHAR) {
528 /* Assume the incomplete sequence
529 * costs one cell. */
530 width = 1;
531 } else
532 width = unicode_to_cell(u);
534 if (way == utf8_step_cells_fewer
535 && steps + width > max) {
536 /* Back off. */
537 current = prev;
538 break;
540 steps += width;
542 break;
544 default:
545 INTERNAL("impossible enum utf8_step");
548 invalid_arg:
549 if (count)
550 *count = steps;
551 return current;
555 * Find out number of standard terminal collumns needed for displaying symbol
556 * (glyph) which represents Unicode character c.
558 * TODO: Use wcwidth when it is available. This seems to require:
559 * - Make the configure script check whether <wchar.h> and wcwidth exist.
560 * - Define _XOPEN_SOURCE and include <wchar.h>.
561 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
562 * matches ISO 10646 in all locales.)
563 * However, these do not suffice, because wcwidth depends on LC_CTYPE
564 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
565 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
566 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
567 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
568 * character is apparently not supported in all locales. Why is that?
569 * - Perhaps there is standardese that requires supported characters
570 * to be convertable to multibyte form. Then ELinks could just pick
571 * some UTF-8 locale for its wcwidth purposes.
572 * - Perhaps wcwidth can even return different nonnegative values for
573 * the same ISO 10646 character in different locales. Then ELinks
574 * would have to set LC_CTYPE to match at least the terminal's
575 * charset (which may differ from the LC_CTYPE environment variable,
576 * especially when the master process is serving a slave terminal).
577 * But there is no guarantee that the libc supports all the same
578 * charsets as ELinks does.
579 * For now, it seems safest to avoid the potentially locale-dependent
580 * libc version of wcwidth, and instead use a hardcoded mapping.
582 * @return 2 for double-width glyph, 1 for others.
583 * TODO: May be extended to return 0 for zero-width glyphs
584 * (like composing, maybe unprintable too).
586 inline int
587 unicode_to_cell(unicode_val_T c)
589 if (c >= 0x1100
590 && (c <= 0x115f /* Hangul Jamo */
591 || c == 0x2329
592 || c == 0x232a
593 || (c >= 0x2e80 && c <= 0xa4cf
594 && c != 0x303f) /* CJK ... Yi */
595 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
596 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
597 Ideographs */
598 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
599 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
600 || (c >= 0xffe0 && c <= 0xffe6)
601 || (c >= 0x20000 && c <= 0x2fffd)
602 || (c >= 0x30000 && c <= 0x3fffd)))
603 return 2;
605 return 1;
608 /* Fold the case of a Unicode character, so that hotkeys in labels can
609 * be compared case-insensitively. It is unspecified whether the
610 * result will be in upper or lower case. */
611 unicode_val_T
612 unicode_fold_label_case(unicode_val_T c)
614 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
615 return towlower(c);
616 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
617 /* For now, this supports only ASCII. It would be possible to
618 * use code generated from CaseFolding.txt of Unicode if the
619 * acknowledgements required by http://www.unicode.org/copyright.html
620 * were added to associated documentation of ELinks. */
621 if (c >= 0x41 && c <= 0x5A)
622 return c + 0x20;
623 else
624 return c;
625 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
628 inline unicode_val_T
629 utf8_to_unicode(unsigned char **string, unsigned char *end)
631 unsigned char *str = *string;
632 unicode_val_T u;
633 int length;
635 length = utf8char_len_tab[str[0]];
637 if (str + length > end) {
638 return UCS_NO_CHAR;
641 switch (length) {
642 case 1:
643 u = str[0];
644 break;
645 case 2:
646 u = (str[0] & 0x1f) << 6;
647 u += (str[1] & 0x3f);
648 break;
649 case 3:
650 u = (str[0] & 0x0f) << 12;
651 u += ((str[1] & 0x3f) << 6);
652 u += (str[2] & 0x3f);
653 break;
654 case 4:
655 u = (str[0] & 0x0f) << 18;
656 u += ((str[1] & 0x3f) << 12);
657 u += ((str[2] & 0x3f) << 6);
658 u += (str[3] & 0x3f);
659 break;
660 case 5:
661 u = (str[0] & 0x0f) << 24;
662 u += ((str[1] & 0x3f) << 18);
663 u += ((str[2] & 0x3f) << 12);
664 u += ((str[3] & 0x3f) << 6);
665 u += (str[4] & 0x3f);
666 break;
667 case 6:
668 default:
669 u = (str[0] & 0x01) << 30;
670 u += ((str[1] & 0x3f) << 24);
671 u += ((str[2] & 0x3f) << 18);
672 u += ((str[3] & 0x3f) << 12);
673 u += ((str[4] & 0x3f) << 6);
674 u += (str[5] & 0x3f);
675 break;
677 *string = str + length;
678 return u;
680 #endif /* CONFIG_UTF8 */
682 /* The common part of cp2u and cp2utf_8. */
683 static unicode_val_T
684 cp2u_shared(const struct codepage_desc *from, unsigned char c)
686 unicode_val_T u = from->highhalf[c - 0x80];
688 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
689 return u;
692 /* Used for converting input from the terminal. */
693 unicode_val_T
694 cp2u(int from, unsigned char c)
696 from &= ~SYSTEM_CHARSET_FLAG;
698 /* UTF-8 is a multibyte codepage and cannot be handled with
699 * this function. */
700 assert(!is_cp_ptr_utf8(&codepages[from]));
701 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
703 if (c < 0x80) return c;
704 else return cp2u_shared(&codepages[from], c);
707 /* This slow and ugly code is used by the terminal utf_8_io */
708 unsigned char *
709 cp2utf8(int from, int c)
711 from &= ~SYSTEM_CHARSET_FLAG;
713 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
714 return strings[c];
716 return encode_utf8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF8
/* Decode one character of @codepage at *@string (bounded by @end),
 * advance *@string past it and return its Unicode value.  Returns
 * UCS_NO_CHAR if nothing can be decoded before @end. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unsigned char *pos;

	if (is_cp_utf8(codepage))
		return utf8_to_unicode(string, end);

	pos = *string;
	if (pos >= end)
		return UCS_NO_CHAR;

	/* Single-byte codepage: always consume exactly one byte. */
	*string = pos + 1;
	return cp2u(codepage, *pos);
}
#endif /* CONFIG_UTF8 */
738 static void
739 add_utf8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
741 unsigned char *p = encode_utf8(u);
743 while (p[1]) {
744 if (ct[*p].t) ct = ct[*p].u.tbl;
745 else {
746 struct conv_table *nct;
748 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
749 if_assert_failed return;
751 nct = mem_calloc(256, sizeof(*nct));
752 if (!nct) return;
753 new_translation_table(nct);
754 ct[*p].t = 1;
755 ct[*p].u.tbl = nct;
756 ct = nct;
758 p++;
761 assertm(!ct[*p].t, "bad utf encoding #2");
762 if_assert_failed return;
764 if (ct[*p].u.str == no_str)
765 ct[*p].u.str = str;
768 struct conv_table utf_table[256];
769 int utf_table_init = 1;
771 static void
772 free_utf_table(void)
774 int i;
776 for (i = 128; i < 256; i++)
777 mem_free(utf_table[i].u.str);
780 static struct conv_table *
781 get_translation_table_to_utf8(int from)
783 int i;
784 static int lfr = -1;
786 if (from == -1) return NULL;
787 from &= ~SYSTEM_CHARSET_FLAG;
788 if (from == lfr) return utf_table;
789 lfr = from;
790 if (utf_table_init)
791 memset(utf_table, 0, sizeof(utf_table)),
792 utf_table_init = 0;
793 else
794 free_utf_table();
796 for (i = 0; i < 128; i++)
797 utf_table[i].u.str = strings[i];
799 if (is_cp_ptr_utf8(&codepages[from])) {
800 for (i = 128; i < 256; i++)
801 utf_table[i].u.str = stracpy(strings[i]);
802 return utf_table;
805 for (i = 128; i < 256; i++) {
806 unicode_val_T u = codepages[from].highhalf[i - 0x80];
808 if (u == 0xFFFF)
809 utf_table[i].u.str = NULL;
810 else
811 utf_table[i].u.str = stracpy(encode_utf8(u));
814 for (i = 0; codepages[from].table[i].c; i++) {
815 unicode_val_T u = codepages[from].table[i].u;
817 if (!utf_table[codepages[from].table[i].c].u.str)
818 utf_table[codepages[from].table[i].c].u.str =
819 stracpy(encode_utf8(u));
822 for (i = 128; i < 256; i++)
823 if (!utf_table[i].u.str)
824 utf_table[i].u.str = stracpy(no_str);
826 return utf_table;
829 struct conv_table table[256];
830 static int first = 1;
832 void
833 free_conv_table(void)
835 if (!utf_table_init) free_utf_table();
836 if (first) {
837 memset(table, 0, sizeof(table));
838 first = 0;
840 new_translation_table(table);
844 struct conv_table *
845 get_translation_table(int from, int to)
847 static int lfr = -1;
848 static int lto = -1;
850 from &= ~SYSTEM_CHARSET_FLAG;
851 to &= ~SYSTEM_CHARSET_FLAG;
852 if (first) {
853 memset(table, 0, sizeof(table));
854 first = 0;
856 if (/*from == to ||*/ from == -1 || to == -1)
857 return NULL;
858 if (is_cp_ptr_utf8(&codepages[to]))
859 return get_translation_table_to_utf8(from);
860 if (from == lfr && to == lto)
861 return table;
862 lfr = from;
863 lto = to;
864 new_translation_table(table);
866 if (is_cp_ptr_utf8(&codepages[from])) {
867 int i;
869 for (i = 0x80; i <= 0xFF; i++)
870 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
871 add_utf8(table,
872 codepages[to].highhalf[i - 0x80],
873 strings[i]);
875 for (i = 0; codepages[to].table[i].c; i++)
876 add_utf8(table, codepages[to].table[i].u,
877 strings[codepages[to].table[i].c]);
879 for (i = 0; unicode_7b[i].x != -1; i++)
880 if (unicode_7b[i].x >= 0x80)
881 add_utf8(table, unicode_7b[i].x,
882 unicode_7b[i].s);
884 } else {
885 int i;
887 for (i = 128; i < 256; i++) {
888 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
889 unsigned char *u;
891 u = u2cp(codepages[from].highhalf[i - 0x80], to);
892 if (u) table[i].u.str = u;
897 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first differing byte.  If all @l2 bytes
 * match, returns 0 when @s2 ends right there and -1 when @s2 is
 * longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 == *s2) continue;

		return (*s1 > *s2) ? 1 : -1;
	}

	if (*s2) return -1;

	return 0;
}
914 /* Entity cache debugging purpose. */
915 #if 0
916 #define DEBUG_ENTITY_CACHE
917 #else
918 #undef DEBUG_ENTITY_CACHE
919 #endif
/* One entry of the entity cache kept by get_entity_string(). */
struct entity_cache {
	unsigned int hits;	/* Number of times the entry was used. */
	int strlen;		/* Length of @str. */
	int encoding;		/* Codepage that @result is valid for. */
	unsigned char *result;	/* Cached translation; may be NULL. */
	unsigned char str[20];	/* Entity name.  Suffice in any case. */
};
929 static int
930 hits_cmp(struct entity_cache *a, struct entity_cache *b)
932 if (a->hits == b->hits) return 0;
933 if (a->hits > b->hits) return -1;
934 else return 1;
937 static int
938 compare_entities(const void *key_, const void *element_)
940 struct string *key = (struct string *) key_;
941 struct entity *element = (struct entity *) element_;
942 int length = key->length;
943 unsigned char *first = key->source;
944 unsigned char *second = element->s;
946 return xxstrcmp(first, second, length);
949 unsigned char *
950 get_entity_string(const unsigned char *str, const int strlen, int encoding)
952 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
953 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
954 will go in [0] table */
955 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
956 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
957 static int first_time = 1;
958 unsigned int slen = 0;
959 unsigned char *result = NULL;
961 if (strlen <= 0) return NULL;
963 #ifdef CONFIG_UTF8
964 /* TODO: caching UTF-8 */
965 encoding &= ~SYSTEM_CHARSET_FLAG;
966 if (is_cp_ptr_utf8(&codepages[encoding]))
967 goto skip;
968 #endif /* CONFIG_UTF8 */
970 if (first_time) {
971 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
972 first_time = 0;
975 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
976 * + google + slashdot + websites that result from a search for test on google,
977 * + various ones) show a quite impressive improvment:
978 * Top ten is:
979 * 0: hits=2459 l=4 st='nbsp'
980 * 1: hits=2152 l=6 st='eacute'
981 * 2: hits=235 l=6 st='egrave'
982 * 3: hits=136 l=6 st='agrave'
983 * 4: hits=100 l=3 st='amp'
984 * 5: hits=40 l=5 st='laquo'
985 * 6: hits=8 l=4 st='copy'
986 * 7: hits=5 l=2 st='gt'
987 * 8: hits=2 l=2 st='lt'
988 * 9: hits=1 l=6 st='middot'
990 * Most of the time cache hit ratio is near 95%.
992 * A long test shows: 15186 hits vs. 24 misses and mean iteration
993 * count is kept < 2 (worst case 1.58). Not so bad ;)
995 * --Zas */
997 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
998 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1000 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1001 int i;
1003 for (i = 0; i < nb_entity_cache[slen]; i++) {
1004 if (entity_cache[slen][i].encoding == encoding
1005 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1006 #ifdef DEBUG_ENTITY_CACHE
1007 static double total_iter = 0;
1008 static unsigned long hit_count = 0;
1010 total_iter += i + 1;
1011 hit_count++;
1012 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1013 #endif
1014 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1015 entity_cache[slen][i].hits++;
1016 return entity_cache[slen][i].result;
1019 #ifdef DEBUG_ENTITY_CACHE
1020 fprintf(stderr, "miss\n");
1021 #endif
1023 #ifdef CONFIG_UTF8
1024 skip:
1025 #endif /* CONFIG_UTF8 */
1026 if (*str == '#') { /* Numeric entity. */
1027 int l = (int) strlen;
1028 unsigned char *st = (unsigned char *) str;
1029 unicode_val_T n = 0;
1031 if (l == 1) goto end; /* &#; ? */
1032 st++, l--;
1033 if ((*st | 32) == 'x') { /* Hexadecimal */
1035 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1036 st++, l--;
1037 do {
1038 unsigned char c = (*(st++) | 32);
1040 if (isdigit(c))
1041 n = (n << 4) | (c - '0');
1042 else if (isxdigit(c))
1043 n = (n << 4) | (c - 'a' + 10);
1044 else
1045 goto end; /* Bad char. */
1046 } while (--l);
1047 } else { /* Decimal */
1048 if (l > 10) goto end; /* 4294967295 max. */
1049 do {
1050 unsigned char c = *(st++);
1052 if (isdigit(c))
1053 n = n * 10 + c - '0';
1054 else
1055 goto end; /* Bad char. */
1056 /* Limit to 0xFFFFFFFF. */
1057 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1058 goto end;
1059 } while (--l);
1062 result = u2cp(n, encoding);
1064 #ifdef DEBUG_ENTITY_CACHE
1065 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1066 #endif
1067 } else { /* Text entity. */
1068 struct string key = INIT_STRING((unsigned char *) str, strlen);
1069 struct entity *element = bsearch((void *) &key, entities,
1070 N_ENTITIES,
1071 sizeof(*element),
1072 compare_entities);
1074 if (element) result = u2cp(element->c, encoding);
1077 #ifdef CONFIG_UTF8
1078 if (is_cp_ptr_utf8(&codepages[encoding])) {
1079 return result;
1081 #endif /* CONFIG_UTF8 */
1082 end:
1083 /* Take care of potential buffer overflow. */
1084 if (strlen < sizeof(entity_cache[slen][0].str)) {
1085 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1087 /* Copy new entry to cache. */
1088 ece->hits = 1;
1089 ece->strlen = strlen;
1090 ece->encoding = encoding;
1091 ece->result = result;
1092 memcpy(ece->str, str, strlen);
1093 ece->str[strlen] = '\0';
1095 /* Increment number of cache entries if possible. */
1096 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1098 #ifdef DEBUG_ENTITY_CACHE
1099 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1100 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1102 #endif
1104 /* Sort entries by hit order. */
1105 if (nb_entity_cache[slen] > 1)
1106 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1107 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1109 #ifdef DEBUG_ENTITY_CACHE
1111 unsigned int i;
1113 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1114 for (i = 0; i < nb_entity_cache[slen] ; i++)
1115 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1116 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1117 entity_cache[slen][i].str);
1118 fprintf(stderr, "-----------------\n");
1120 #endif
1122 return result;
1125 unsigned char *
1126 convert_string(struct conv_table *convert_table,
1127 unsigned char *chars, int charslen, int cp,
1128 enum convert_string_mode mode, int *length,
1129 void (*callback)(void *data, unsigned char *buf, int buflen),
1130 void *callback_data)
1132 unsigned char *buffer;
1133 int bufferpos = 0;
1134 int charspos = 0;
1136 if (!convert_table && !memchr(chars, '&', charslen)) {
1137 if (callback) {
1138 if (charslen) callback(callback_data, chars, charslen);
1139 return NULL;
1140 } else {
1141 return memacpy(chars, charslen);
1145 /* Buffer allocation */
1147 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1148 if (!buffer) return NULL;
1150 /* Iterate ;-) */
1152 while (charspos < charslen) {
1153 unsigned char *translit;
1155 #define PUTC do { \
1156 buffer[bufferpos++] = chars[charspos++]; \
1157 translit = ""; \
1158 goto flush; \
1159 } while (0)
1161 if (chars[charspos] != '&') {
1162 struct conv_table *t;
1163 int i;
1165 if (chars[charspos] < 128 || !convert_table) PUTC;
1167 t = convert_table;
1168 i = charspos;
1170 while (t[chars[i]].t) {
1171 t = t[chars[i++]].u.tbl;
1172 if (i >= charslen) PUTC;
1175 translit = t[chars[i]].u.str;
1176 charspos = i + 1;
1178 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1179 PUTC;
1181 } else {
1182 int start = charspos + 1;
1183 int i = start;
1185 while (i < charslen
1186 && (isasciialpha(chars[i])
1187 || isdigit(chars[i])
1188 || (chars[i] == '#')))
1189 i++;
1191 /* This prevents bug 213: we were expanding "entities"
1192 * in URL query strings. */
1193 /* XXX: But this disables &nbsp&nbsp usage, which
1194 * appears to be relatively common! --pasky */
1195 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1196 && i > start
1197 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1198 translit = get_entity_string(&chars[start], i - start,
1199 cp);
1200 if (chars[i] != ';') {
1201 /* Eat &nbsp &nbsp<foo> happily, but
1202 * pull back from the character after
1203 * entity string if it is not the valid
1204 * terminator. */
1205 i--;
1208 if (!translit) PUTC;
1209 charspos = i + (i < charslen);
1210 } else PUTC;
1213 if (!translit[0]) continue;
1215 if (!translit[1]) {
1216 buffer[bufferpos++] = translit[0];
1217 translit = "";
1218 goto flush;
1221 while (*translit) {
1222 unsigned char *new;
1224 buffer[bufferpos++] = *(translit++);
1225 flush:
1226 if (bufferpos & (ALLOC_GR - 1)) continue;
1228 if (callback) {
1229 buffer[bufferpos] = 0;
1230 callback(callback_data, buffer, bufferpos);
1231 bufferpos = 0;
1232 } else {
1233 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1234 if (!new) {
1235 mem_free(buffer);
1236 return NULL;
1238 buffer = new;
1241 #undef PUTC
1244 /* Say bye */
1246 buffer[bufferpos] = 0;
1247 if (length) *length = bufferpos;
1249 if (callback) {
1250 if (bufferpos) callback(callback_data, buffer, bufferpos);
1251 mem_free(buffer);
1252 return NULL;
1253 } else {
1254 return buffer;
1259 #ifndef USE_FASTFIND
1261 get_cp_index(unsigned char *name)
1263 int i, a;
1264 int syscp = 0;
1266 if (!strcasecmp(name, "System")) {
1267 #if HAVE_LANGINFO_CODESET
1268 name = nl_langinfo(CODESET);
1269 syscp = SYSTEM_CHARSET_FLAG;
1270 #else
1271 name = "us-ascii";
1272 #endif
1275 for (i = 0; codepages[i].name; i++) {
1276 for (a = 0; codepages[i].aliases[a]; a++) {
1277 /* In the past, we looked for the longest substring
1278 * in all the names; it is way too expensive, though:
1280 * % cumulative self self total
1281 * time seconds seconds calls us/call us/call name
1282 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1284 * Anything called from redraw_screen() is in fact
1285 * relatively expensive, even if it's called just
1286 * once. So we will do a simple strcasecmp() here.
1289 if (!strcasecmp(name, codepages[i].aliases[a]))
1290 return i | syscp;
1294 if (syscp) {
1295 return get_cp_index("us-ascii") | syscp;
1296 } else {
1297 return -1;
1301 #else
/* Iteration state shared by charsets_list_reset() and charsets_list_next():
 * @i_name indexes the current entry of codepages[]. */
static unsigned int i_name = 0;
/* @i_alias indexes the current element of that entry's aliases[] array. */
static unsigned int i_alias = 0;
1306 /* Reset internal list pointer */
1307 void
1308 charsets_list_reset(void)
1310 i_name = 0;
1311 i_alias = 0;
1314 /* Returns a pointer to a struct that contains current key and data pointers
1315 * and increment internal pointer. It returns NULL when key is NULL. */
1316 struct fastfind_key_value *
1317 charsets_list_next(void)
1319 static struct fastfind_key_value kv;
1321 if (!codepages[i_name].name) return NULL;
1323 kv.key = codepages[i_name].aliases[i_alias];
1324 kv.data = (void *) &codepages[i_name]; /* cast away const */
1326 if (codepages[i_name].aliases[i_alias + 1])
1327 i_alias++;
1328 else {
1329 i_name++;
1330 i_alias = 0;
1333 return &kv;
/* Fastfind index labelled "charsets_lookup"; its keys are supplied by the
 * charsets_list_reset()/charsets_list_next() iterator pair, i.e. the
 * aliases of every codepage. */
static struct fastfind_index ff_charsets_index
	= INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1339 /* It searchs for a charset named @name or one of its aliases and
1340 * returns index for it or -1 if not found. */
1342 get_cp_index(unsigned char *name)
1344 const struct codepage_desc *codepage;
1345 int syscp = 0;
1347 if (!strcasecmp(name, "System")) {
1348 #if HAVE_LANGINFO_CODESET
1349 name = nl_langinfo(CODESET);
1350 syscp = SYSTEM_CHARSET_FLAG;
1351 #else
1352 name = "us-ascii";
1353 #endif
1356 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1357 if (codepage) {
1358 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1359 return (codepage - codepages) | syscp;
1361 } else if (syscp) {
1362 return get_cp_index("us-ascii") | syscp;
1364 } else {
1365 return -1;
1369 #endif /* USE_FASTFIND */
/* Prepare charset name lookup.  With USE_FASTFIND this builds the fastfind
 * index (passing FF_COMPRESS); otherwise it does nothing. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Tear down charset name lookup.  With USE_FASTFIND this hands the fastfind
 * index to fastfind_done(); otherwise it does nothing. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif
}
1387 unsigned char *
1388 get_cp_name(int cp_index)
1390 if (cp_index < 0) return "none";
1391 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1393 return codepages[cp_index].name;
1396 unsigned char *
1397 get_cp_mime_name(int cp_index)
1399 if (cp_index < 0) return "none";
1400 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1401 if (!codepages[cp_index].aliases) return NULL;
1403 return codepages[cp_index].aliases[0];
1407 is_cp_utf8(int cp_index)
1409 cp_index &= ~SYSTEM_CHARSET_FLAG;
1410 return is_cp_ptr_utf8(&codepages[cp_index]);