Bug 381: Halve sizeof(struct table_entry).
[elinks.git] / src / intl / charsets.c
blobaee850fd1f27f0e3359140b0b01cd71f4ad64b6c
1 /* Charsets convertor */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #if HAVE_LANGINFO_CODESET
8 #include <langinfo.h>
9 #endif
11 #include <ctype.h>
12 #include <stdlib.h>
13 #if HAVE_WCTYPE_H
14 #include <wctype.h>
15 #endif
17 #include "elinks.h"
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
/* One row of a codepage table: byte @c of the codepage corresponds to the
 * Unicode character @u. */
struct table_entry {
	unsigned char c;
	/* In principle this member would be unicode_val_T.  Every value
	 * currently listed in codepage.inc fits in 16 bits, though, so
	 * uint16_t is enough and shrinks sizeof(struct table_entry) from
	 * 8 bytes to 4.  If wider characters are ever needed,
	 * "unicode_val_T u : 24" is one possibility, although it looks
	 * somewhat unportable: bitfields are in principle restricted to
	 * int, which may be only 16 bits wide. */
	uint16_t u;
};
/* Description of one codepage known to this file. */
struct codepage_desc {
	/* Name of the codepage; a NULL name ends the codepages[] list
	 * when it is scanned. */
	unsigned char *name;
	/* List of aliases, terminated by a NULL pointer. */
	unsigned char *const *aliases;
	/* Byte <-> Unicode mapping; scans stop at the first entry
	 * whose @c member is zero. */
	const struct table_entry *table;
};
49 #include "intl/codepage.inc"
50 #include "intl/uni_7b.inc"
51 #include "intl/entity.inc"
/* One-character strings, indexed by byte value: strings[b] is the
 * NUL-terminated string holding just the byte b.
 *
 * NOTE(review): the entries at indexes 0x17 and 0x1f both hold "\033"
 * (ESC) rather than "\027" and "\037".  This is reproduced as found;
 * confirm whether it is intentional before changing it. */
static char strings[256][2] = {
	/* 0x00 */ "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	/* 0x08 */ "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	/* 0x10 */ "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	/* 0x18 */ "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	/* 0x20 */ "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	/* 0x28 */ "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	/* 0x30 */ "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	/* 0x38 */ "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	/* 0x40 */ "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	/* 0x48 */ "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	/* 0x50 */ "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	/* 0x58 */ "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	/* 0x60 */ "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	/* 0x68 */ "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	/* 0x70 */ "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	/* 0x78 */ "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	/* 0x80 */ "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	/* 0x88 */ "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	/* 0x90 */ "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	/* 0x98 */ "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	/* 0xa0 */ "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	/* 0xa8 */ "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	/* 0xb0 */ "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	/* 0xb8 */ "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	/* 0xc0 */ "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	/* 0xc8 */ "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	/* 0xd0 */ "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	/* 0xd8 */ "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	/* 0xe0 */ "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	/* 0xe8 */ "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	/* 0xf0 */ "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	/* 0xf8 */ "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
89 static void
90 free_translation_table(struct conv_table *p)
92 int i;
94 for (i = 0; i < 256; i++)
95 if (p[i].t)
96 free_translation_table(p[i].u.tbl);
98 mem_free(p);
101 static unsigned char *no_str = "*";
103 static void
104 new_translation_table(struct conv_table *p)
106 int i;
108 for (i = 0; i < 256; i++)
109 if (p[i].t)
110 free_translation_table(p[i].u.tbl);
111 for (i = 0; i < 128; i++) {
112 p[i].t = 0;
113 p[i].u.str = strings[i];
115 for (; i < 256; i++) {
116 p[i].t = 0;
117 p[i].u.str = no_str;
/* Binary search over the first @entries elements of the array @table,
 * which must be sorted in ascending order of the member @entry.
 * Stores the index of the element whose @entry equals @key into
 * @result, or -1 if there is none. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long bs_lo_ = 0, bs_hi_ = (entries) - 1;			\
									\
	for (;;) {							\
		long bs_mid_;						\
									\
		if (bs_lo_ > bs_hi_) {					\
			(result) = -1;					\
			break;						\
		}							\
									\
		bs_mid_ = (bs_lo_ + bs_hi_) / 2;			\
									\
		if ((table)[bs_mid_].entry == (key)) {			\
			(result) = bs_mid_;				\
			break;						\
		}							\
		if ((table)[bs_mid_].entry > (key))			\
			bs_hi_ = bs_mid_ - 1;				\
		if ((table)[bs_mid_].entry < (key))			\
			bs_lo_ = bs_mid_ + 1;				\
	}								\
}
137 static const unicode_val_T strange_chars[32] = {
138 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
139 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
140 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
141 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
144 #define SYSTEM_CHARSET_FLAG 128
146 unsigned char *
147 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
149 int j;
150 int s;
152 if (u < 128) return strings[u];
154 to &= ~SYSTEM_CHARSET_FLAG;
156 #ifdef CONFIG_UTF8
157 if (codepages[to].table == table_utf8)
158 return encode_utf8(u);
159 #endif /* CONFIG_UTF8 */
161 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
162 if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
163 if (u == 0xad) return "";
165 if (u < 0xa0) {
166 unicode_val_T strange = strange_chars[u - 0x80];
168 if (!strange) return NULL;
169 return u2cp_(strange, to, no_nbsp_hack);
173 for (j = 0; codepages[to].table[j].c; j++)
174 if (codepages[to].table[j].u == u)
175 return strings[codepages[to].table[j].c];
177 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
178 if (s != -1) return unicode_7b[s].s;
180 return no_str;
183 static unsigned char utf_buffer[7];
185 #ifdef CONFIG_UTF8
186 inline unsigned char *
187 encode_utf8(unicode_val_T u)
188 #else
189 static unsigned char *
190 encode_utf8(unicode_val_T u)
191 #endif /* CONFIG_UTF8 */
193 memset(utf_buffer, 0, 7);
195 if (u < 0x80)
196 utf_buffer[0] = u;
197 else if (u < 0x800)
198 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
199 utf_buffer[1] = 0x80 | (u & 0x3f);
200 else if (u < 0x10000)
201 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
202 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
203 utf_buffer[2] = 0x80 | (u & 0x3f);
204 else if (u < 0x200000)
205 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
206 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
207 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
208 utf_buffer[3] = 0x80 | (u & 0x3f);
209 else if (u < 0x4000000)
210 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
211 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
212 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
213 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
214 utf_buffer[4] = 0x80 | (u & 0x3f);
215 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
216 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
217 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
218 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
219 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
220 utf_buffer[5] = 0x80 | (u & 0x3f);
222 return utf_buffer;
225 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80...0xbf, 0xfe, 0xff) are
 * listed with length 1 and have to be handled separately. */
static char utf8char_len_tab[256] = {
	/* 0x00 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x10 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x20 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x30 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x40 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x50 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x60 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x70 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x90 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xa0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xb0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xc0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xd0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xe0 */ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
	/* 0xf0 */ 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
239 inline int utf8charlen(const unsigned char *p)
241 return p ? utf8char_len_tab[*p] : 0;
/* Count the complete UTF-8 characters in the NUL-terminated string
 * *@str.  On return *@str points just past the last complete character
 * that was counted. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int count = 0;

	while (1) {
		int len = utf8charlen(pos);

		/* Stop at a character that would run past the terminator. */
		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
/* True if the byte @p is a complete single-byte character (top bit clear). */
#define utf8_issingle(p) (!((p) & 0x80))
/* True if the byte @p can start a character, i.e. it is not a
 * continuation byte of the form 10xxxxxx. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
/* Walk back @pos characters from @current, never moving left of
 * @start, and return the resulting pointer.  Returns NULL if either
 * pointer is NULL or @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	unsigned char *here = current;
	int remaining = pos;

	if (!current || !start || pos < 0)
		return NULL;

	while (remaining > 0 && here != start) {
		here--;
		/* Only a lead byte completes one step back. */
		if (!utf8_islead(*here))
			continue;
		remaining--;
	}

	return here;
}
278 /* Count number of standard terminal cells needed for displaying UTF-8
279 * character. */
281 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
283 unicode_val_T u;
285 if (end == NULL)
286 end = strchr(utf8_char, '\0');
288 if(!utf8_char || !end)
289 return -1;
291 u = utf8_to_unicode(&utf8_char, end);
293 return unicode_to_cell(u);
/* Count the standard terminal cells needed to display the UTF-8 text
 * between @string and @end.  When @end is NULL the NUL terminator of
 * @string is used.  A trailing incomplete character is not counted.
 * Returns -1 if @string is NULL or a character width cannot be
 * determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int cells = 0;

	/* Reject a NULL string before strchr() gets to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);
		int cell;

		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the UTF-8 characters between @string and @end.  When @end is
 * NULL the NUL terminator of @string is used.  A trailing incomplete
 * character is not counted.  Returns -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int chars = 0;

	/* Reject a NULL string before strchr() gets to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);

		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that are
 * needed to display at most @max_cells terminal cells.  When @end is
 * NULL the NUL terminator of @string is used.  Returns -1 if @string
 * is NULL or a character width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	/* Signed counters, so that the comparison with @max_cells is not
	 * silently carried out in unsigned arithmetic. */
	int bytes = 0;
	int cells = 0;

	assert(max_cells>=0);

	/* Reject a NULL string before strchr() gets to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Never report more bytes than the text contains. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
386 /* Take @max steps forward from @string in the specified @way, but
387 * not going past @end. Return the resulting address. Store the
388 * number of steps taken to *@count, unless @count is NULL.
390 * This assumes the text is valid UTF-8, and @string and @end point to
391 * character boundaries. If not, it doesn't crash but the results may
392 * be inconsistent.
394 * This function can do some of the same jobs as utf8charlen(),
395 * utf8_cells2bytes(), and strlen_utf8(). */
396 unsigned char *
397 utf8_step_forward(unsigned char *string, unsigned char *end,
398 int max, enum utf8_step way, int *count)
400 int steps = 0;
401 unsigned char *current = string;
403 assert(string);
404 assert(max >= 0);
405 if_assert_failed goto invalid_arg;
406 if (end == NULL)
407 end = strchr(string, '\0');
409 switch (way) {
410 case utf8_step_characters:
411 while (steps < max && current < end) {
412 ++current;
413 if (utf8_islead(*current))
414 ++steps;
416 break;
418 case utf8_step_cells_fewer:
419 case utf8_step_cells_more:
420 while (steps < max) {
421 unicode_val_T u;
422 unsigned char *prev = current;
423 int width;
425 u = utf8_to_unicode(&current, end);
426 if (u == UCS_NO_CHAR) {
427 /* Assume the incomplete sequence
428 * costs one cell. */
429 current = end;
430 ++steps;
431 break;
434 width = unicode_to_cell(u);
435 if (way == utf8_step_cells_fewer
436 && steps + width > max) {
437 /* Back off. */
438 current = prev;
439 break;
441 steps += width;
443 break;
445 default:
446 INTERNAL("impossible enum utf8_step");
449 invalid_arg:
450 if (count)
451 *count = steps;
452 return current;
455 /* Take @max steps backward from @string in the specified @way, but
456 * not going past @start. Return the resulting address. Store the
457 * number of steps taken to *@count, unless @count is NULL.
459 * This assumes the text is valid UTF-8, and @string and @start point
460 * to character boundaries. If not, it doesn't crash but the results
461 * may be inconsistent.
463 * This function can do some of the same jobs as utf8_prevchar(). */
464 unsigned char *
465 utf8_step_backward(unsigned char *string, unsigned char *start,
466 int max, enum utf8_step way, int *count)
468 int steps = 0;
469 unsigned char *current = string;
471 assert(string);
472 assert(start);
473 assert(max >= 0);
474 if_assert_failed goto invalid_arg;
476 switch (way) {
477 case utf8_step_characters:
478 while (steps < max && current > start) {
479 --current;
480 if (utf8_islead(*current))
481 ++steps;
483 break;
485 case utf8_step_cells_fewer:
486 case utf8_step_cells_more:
487 while (steps < max) {
488 unsigned char *prev = current;
489 unsigned char *look;
490 unicode_val_T u;
491 int width;
493 if (current <= start)
494 break;
495 do {
496 --current;
497 } while (current > start && !utf8_islead(*current));
499 look = current;
500 u = utf8_to_unicode(&look, prev);
501 if (u == UCS_NO_CHAR) {
502 /* Assume the incomplete sequence
503 * costs one cell. */
504 width = 1;
505 } else
506 width = unicode_to_cell(u);
508 if (way == utf8_step_cells_fewer
509 && steps + width > max) {
510 /* Back off. */
511 current = prev;
512 break;
514 steps += width;
516 break;
518 default:
519 INTERNAL("impossible enum utf8_step");
522 invalid_arg:
523 if (count)
524 *count = steps;
525 return current;
529 * Find out number of standard terminal collumns needed for displaying symbol
530 * (glyph) which represents Unicode character c.
531 * TODO: Use wcwidth when it is available.
533 * @return 2 for double-width glyph, 1 for others.
534 * TODO: May be extended to return 0 for zero-width glyphs
535 * (like composing, maybe unprintable too).
537 inline int
538 unicode_to_cell(unicode_val_T c)
540 if (c >= 0x1100
541 && (c <= 0x115f /* Hangul Jamo */
542 || c == 0x2329
543 || c == 0x232a
544 || (c >= 0x2e80 && c <= 0xa4cf
545 && c != 0x303f) /* CJK ... Yi */
546 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
547 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
548 Ideographs */
549 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
550 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
551 || (c >= 0xffe0 && c <= 0xffe6)
552 || (c >= 0x20000 && c <= 0x2fffd)
553 || (c >= 0x30000 && c <= 0x3fffd)))
554 return 2;
556 return 1;
559 /* Fold the case of a Unicode character, so that hotkeys in labels can
560 * be compared case-insensitively. It is unspecified whether the
561 * result will be in upper or lower case. */
562 unicode_val_T
563 unicode_fold_label_case(unicode_val_T c)
565 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
566 return towlower(c);
567 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
568 /* For now, this supports only ASCII. It would be possible to
569 * use code generated from CaseFolding.txt of Unicode if the
570 * acknowledgements required by http://www.unicode.org/copyright.html
571 * were added to associated documentation of ELinks. */
572 if (c >= 0x41 && c <= 0x5A)
573 return c + 0x20;
574 else
575 return c;
576 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
579 inline unicode_val_T
580 utf8_to_unicode(unsigned char **string, unsigned char *end)
582 unsigned char *str = *string;
583 unicode_val_T u;
584 int length;
586 length = utf8char_len_tab[str[0]];
588 if (str + length > end) {
589 return UCS_NO_CHAR;
592 switch (length) {
593 case 1:
594 u = str[0];
595 break;
596 case 2:
597 u = (str[0] & 0x1f) << 6;
598 u += (str[1] & 0x3f);
599 break;
600 case 3:
601 u = (str[0] & 0x0f) << 12;
602 u += ((str[1] & 0x3f) << 6);
603 u += (str[2] & 0x3f);
604 break;
605 case 4:
606 u = (str[0] & 0x0f) << 18;
607 u += ((str[1] & 0x3f) << 12);
608 u += ((str[2] & 0x3f) << 6);
609 u += (str[3] & 0x3f);
610 break;
611 case 5:
612 u = (str[0] & 0x0f) << 24;
613 u += ((str[1] & 0x3f) << 18);
614 u += ((str[2] & 0x3f) << 12);
615 u += ((str[3] & 0x3f) << 6);
616 u += (str[4] & 0x3f);
617 break;
618 case 6:
619 default:
620 u = (str[0] & 0x01) << 30;
621 u += ((str[1] & 0x3f) << 24);
622 u += ((str[2] & 0x3f) << 18);
623 u += ((str[3] & 0x3f) << 12);
624 u += ((str[4] & 0x3f) << 6);
625 u += (str[5] & 0x3f);
626 break;
628 *string = str + length;
629 return u;
631 #endif /* CONFIG_UTF8 */
633 /* Slow algorithm, the common part of cp2u and cp2utf8. */
634 static unicode_val_T
635 cp2u_shared(const struct codepage_desc *from, unsigned char c)
637 int j;
639 for (j = 0; from->table[j].c; j++)
640 if (from->table[j].c == c)
641 return from->table[j].u;
643 return UCS_REPLACEMENT_CHARACTER;
646 /* Slow algorithm, used for converting input from the terminal. */
647 unicode_val_T
648 cp2u(int from, unsigned char c)
650 from &= ~SYSTEM_CHARSET_FLAG;
652 /* UTF-8 is a multibyte codepage and cannot be handled with
653 * this function. */
654 assert(codepages[from].table != table_utf8);
655 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
657 if (c < 0x80) return c;
658 else return cp2u_shared(&codepages[from], c);
661 /* This slow and ugly code is used by the terminal utf_8_io */
662 unsigned char *
663 cp2utf8(int from, int c)
665 from &= ~SYSTEM_CHARSET_FLAG;
667 if (codepages[from].table == table_utf8 || c < 128)
668 return strings[c];
670 return encode_utf8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF8
/* Decode one character of @codepage at *@string and advance *@string
 * past it.  UTF-8 codepages are handed to utf8_to_unicode(); all
 * others consume exactly one byte.  Returns UCS_NO_CHAR when *@string
 * has already reached @end. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unsigned char byte;

	if (is_cp_utf8(codepage))
		return utf8_to_unicode(string, end);

	if (*string >= end)
		return UCS_NO_CHAR;

	byte = **string;
	*string += 1;

	return cp2u(codepage, byte);
}
#endif /* CONFIG_UTF8 */
692 static void
693 add_utf8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
695 unsigned char *p = encode_utf8(u);
697 while (p[1]) {
698 if (ct[*p].t) ct = ct[*p].u.tbl;
699 else {
700 struct conv_table *nct;
702 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
703 if_assert_failed return;
705 nct = mem_calloc(256, sizeof(*nct));
706 if (!nct) return;
707 new_translation_table(nct);
708 ct[*p].t = 1;
709 ct[*p].u.tbl = nct;
710 ct = nct;
712 p++;
715 assertm(!ct[*p].t, "bad utf encoding #2");
716 if_assert_failed return;
718 if (ct[*p].u.str == no_str)
719 ct[*p].u.str = str;
722 struct conv_table utf_table[256];
723 int utf_table_init = 1;
725 static void
726 free_utf_table(void)
728 int i;
730 for (i = 128; i < 256; i++)
731 mem_free(utf_table[i].u.str);
734 static struct conv_table *
735 get_translation_table_to_utf8(int from)
737 int i;
738 static int lfr = -1;
740 if (from == -1) return NULL;
741 from &= ~SYSTEM_CHARSET_FLAG;
742 if (from == lfr) return utf_table;
743 lfr = from;
744 if (utf_table_init)
745 memset(utf_table, 0, sizeof(utf_table)),
746 utf_table_init = 0;
747 else
748 free_utf_table();
750 for (i = 0; i < 128; i++)
751 utf_table[i].u.str = strings[i];
753 if (codepages[from].table == table_utf8) {
754 for (i = 128; i < 256; i++)
755 utf_table[i].u.str = stracpy(strings[i]);
756 return utf_table;
759 for (i = 128; i < 256; i++)
760 utf_table[i].u.str = NULL;
762 for (i = 0; codepages[from].table[i].c; i++) {
763 unicode_val_T u = codepages[from].table[i].u;
765 if (!utf_table[codepages[from].table[i].c].u.str)
766 utf_table[codepages[from].table[i].c].u.str =
767 stracpy(encode_utf8(u));
770 for (i = 128; i < 256; i++)
771 if (!utf_table[i].u.str)
772 utf_table[i].u.str = stracpy(no_str);
774 return utf_table;
777 struct conv_table table[256];
778 static int first = 1;
780 void
781 free_conv_table(void)
783 if (!utf_table_init) free_utf_table();
784 if (first) {
785 memset(table, 0, sizeof(table));
786 first = 0;
788 new_translation_table(table);
792 struct conv_table *
793 get_translation_table(int from, int to)
795 static int lfr = -1;
796 static int lto = -1;
798 from &= ~SYSTEM_CHARSET_FLAG;
799 to &= ~SYSTEM_CHARSET_FLAG;
800 if (first) {
801 memset(table, 0, sizeof(table));
802 first = 0;
804 if (/*from == to ||*/ from == -1 || to == -1)
805 return NULL;
806 if (codepages[to].table == table_utf8)
807 return get_translation_table_to_utf8(from);
808 if (from == lfr && to == lto)
809 return table;
810 lfr = from;
811 lto = to;
812 new_translation_table(table);
814 if (codepages[from].table == table_utf8) {
815 int i;
817 for (i = 0; codepages[to].table[i].c; i++)
818 add_utf8(table, codepages[to].table[i].u,
819 strings[codepages[to].table[i].c]);
821 for (i = 0; unicode_7b[i].x != -1; i++)
822 if (unicode_7b[i].x >= 0x80)
823 add_utf8(table, unicode_7b[i].x,
824 unicode_7b[i].s);
826 } else {
827 int i;
829 for (i = 128; i < 256; i++) {
830 int j;
832 for (j = 0; codepages[from].table[j].c; j++) {
833 if (codepages[from].table[j].c == i) {
834 unsigned char *u;
836 u = u2cp(codepages[from].table[j].u, to);
837 if (u) table[i].u.str = u;
838 break;
844 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first differing byte.  If all @l2 bytes
 * match, returns 0 when @s2 ends there too and -1 when @s2 is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
/* Change "#if 0" to "#if 1" to get entity cache debugging output. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif

/* One remembered result of get_entity_string(). */
struct entity_cache {
	unsigned int hits;	/* Times this entry was found, plus one. */
	int strlen;		/* Length of the entity text in @str. */
	int encoding;		/* Codepage index the result is for. */
	unsigned char *result;	/* Converted string; may be NULL. */
	unsigned char str[20];	/* Suffice in any case. */
};
876 static int
877 hits_cmp(struct entity_cache *a, struct entity_cache *b)
879 if (a->hits == b->hits) return 0;
880 if (a->hits > b->hits) return -1;
881 else return 1;
884 static int
885 compare_entities(const void *key_, const void *element_)
887 struct string *key = (struct string *) key_;
888 struct entity *element = (struct entity *) element_;
889 int length = key->length;
890 unsigned char *first = key->source;
891 unsigned char *second = element->s;
893 return xxstrcmp(first, second, length);
896 unsigned char *
897 get_entity_string(const unsigned char *str, const int strlen, int encoding)
899 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
900 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
901 will go in [0] table */
902 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
903 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
904 static int first_time = 1;
905 unsigned int slen = 0;
906 unsigned char *result = NULL;
908 if (strlen <= 0) return NULL;
910 #ifdef CONFIG_UTF8
911 /* TODO: caching UTF-8 */
912 encoding &= ~SYSTEM_CHARSET_FLAG;
913 if (codepages[encoding].table == table_utf8)
914 goto skip;
915 #endif /* CONFIG_UTF8 */
917 if (first_time) {
918 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
919 first_time = 0;
922 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
923 * + google + slashdot + websites that result from a search for test on google,
924 * + various ones) show a quite impressive improvment:
925 * Top ten is:
926 * 0: hits=2459 l=4 st='nbsp'
927 * 1: hits=2152 l=6 st='eacute'
928 * 2: hits=235 l=6 st='egrave'
929 * 3: hits=136 l=6 st='agrave'
930 * 4: hits=100 l=3 st='amp'
931 * 5: hits=40 l=5 st='laquo'
932 * 6: hits=8 l=4 st='copy'
933 * 7: hits=5 l=2 st='gt'
934 * 8: hits=2 l=2 st='lt'
935 * 9: hits=1 l=6 st='middot'
937 * Most of the time cache hit ratio is near 95%.
939 * A long test shows: 15186 hits vs. 24 misses and mean iteration
940 * count is kept < 2 (worst case 1.58). Not so bad ;)
942 * --Zas */
944 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
945 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
947 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
948 int i;
950 for (i = 0; i < nb_entity_cache[slen]; i++) {
951 if (entity_cache[slen][i].encoding == encoding
952 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
953 #ifdef DEBUG_ENTITY_CACHE
954 static double total_iter = 0;
955 static unsigned long hit_count = 0;
957 total_iter += i + 1;
958 hit_count++;
959 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
960 #endif
961 if (entity_cache[slen][i].hits < (unsigned int) ~0)
962 entity_cache[slen][i].hits++;
963 return entity_cache[slen][i].result;
966 #ifdef DEBUG_ENTITY_CACHE
967 fprintf(stderr, "miss\n");
968 #endif
970 #ifdef CONFIG_UTF8
971 skip:
972 #endif /* CONFIG_UTF8 */
973 if (*str == '#') { /* Numeric entity. */
974 int l = (int) strlen;
975 unsigned char *st = (unsigned char *) str;
976 unicode_val_T n = 0;
978 if (l == 1) goto end; /* &#; ? */
979 st++, l--;
980 if ((*st | 32) == 'x') { /* Hexadecimal */
982 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
983 st++, l--;
984 do {
985 unsigned char c = (*(st++) | 32);
987 if (isdigit(c))
988 n = (n << 4) | (c - '0');
989 else if (isxdigit(c))
990 n = (n << 4) | (c - 'a' + 10);
991 else
992 goto end; /* Bad char. */
993 } while (--l);
994 } else { /* Decimal */
995 if (l > 10) goto end; /* 4294967295 max. */
996 do {
997 unsigned char c = *(st++);
999 if (isdigit(c))
1000 n = n * 10 + c - '0';
1001 else
1002 goto end; /* Bad char. */
1003 /* Limit to 0xFFFFFFFF. */
1004 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1005 goto end;
1006 } while (--l);
1009 result = u2cp(n, encoding);
1011 #ifdef DEBUG_ENTITY_CACHE
1012 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1013 #endif
1014 } else { /* Text entity. */
1015 struct string key = INIT_STRING((unsigned char *) str, strlen);
1016 struct entity *element = bsearch((void *) &key, entities,
1017 N_ENTITIES,
1018 sizeof(*element),
1019 compare_entities);
1021 if (element) result = u2cp(element->c, encoding);
1024 #ifdef CONFIG_UTF8
1025 if (codepages[encoding].table == table_utf8) {
1026 return result;
1028 #endif /* CONFIG_UTF8 */
1029 end:
1030 /* Take care of potential buffer overflow. */
1031 if (strlen < sizeof(entity_cache[slen][0].str)) {
1032 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1034 /* Copy new entry to cache. */
1035 ece->hits = 1;
1036 ece->strlen = strlen;
1037 ece->encoding = encoding;
1038 ece->result = result;
1039 memcpy(ece->str, str, strlen);
1040 ece->str[strlen] = '\0';
1042 /* Increment number of cache entries if possible. */
1043 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1045 #ifdef DEBUG_ENTITY_CACHE
1046 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1047 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1049 #endif
1051 /* Sort entries by hit order. */
1052 if (nb_entity_cache[slen] > 1)
1053 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1054 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1056 #ifdef DEBUG_ENTITY_CACHE
1058 unsigned int i;
1060 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1061 for (i = 0; i < nb_entity_cache[slen] ; i++)
1062 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1063 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1064 entity_cache[slen][i].str);
1065 fprintf(stderr, "-----------------\n");
1067 #endif
1069 return result;
1072 unsigned char *
1073 convert_string(struct conv_table *convert_table,
1074 unsigned char *chars, int charslen, int cp,
1075 enum convert_string_mode mode, int *length,
1076 void (*callback)(void *data, unsigned char *buf, int buflen),
1077 void *callback_data)
1079 unsigned char *buffer;
1080 int bufferpos = 0;
1081 int charspos = 0;
1083 if (!convert_table && !memchr(chars, '&', charslen)) {
1084 if (callback) {
1085 if (charslen) callback(callback_data, chars, charslen);
1086 return NULL;
1087 } else {
1088 return memacpy(chars, charslen);
1092 /* Buffer allocation */
1094 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1095 if (!buffer) return NULL;
1097 /* Iterate ;-) */
1099 while (charspos < charslen) {
1100 unsigned char *translit;
1102 #define PUTC do { \
1103 buffer[bufferpos++] = chars[charspos++]; \
1104 translit = ""; \
1105 goto flush; \
1106 } while (0)
1108 if (chars[charspos] != '&') {
1109 struct conv_table *t;
1110 int i;
1112 if (chars[charspos] < 128 || !convert_table) PUTC;
1114 t = convert_table;
1115 i = charspos;
1117 while (t[chars[i]].t) {
1118 t = t[chars[i++]].u.tbl;
1119 if (i >= charslen) PUTC;
1122 translit = t[chars[i]].u.str;
1123 charspos = i + 1;
1125 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1126 PUTC;
1128 } else {
1129 int start = charspos + 1;
1130 int i = start;
1132 while (i < charslen
1133 && (isasciialpha(chars[i])
1134 || isdigit(chars[i])
1135 || (chars[i] == '#')))
1136 i++;
1138 /* This prevents bug 213: we were expanding "entities"
1139 * in URL query strings. */
1140 /* XXX: But this disables &nbsp&nbsp usage, which
1141 * appears to be relatively common! --pasky */
1142 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1143 && i > start
1144 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1145 translit = get_entity_string(&chars[start], i - start,
1146 cp);
1147 if (chars[i] != ';') {
1148 /* Eat &nbsp &nbsp<foo> happily, but
1149 * pull back from the character after
1150 * entity string if it is not the valid
1151 * terminator. */
1152 i--;
1155 if (!translit) PUTC;
1156 charspos = i + (i < charslen);
1157 } else PUTC;
1160 if (!translit[0]) continue;
1162 if (!translit[1]) {
1163 buffer[bufferpos++] = translit[0];
1164 translit = "";
1165 goto flush;
1168 while (*translit) {
1169 unsigned char *new;
1171 buffer[bufferpos++] = *(translit++);
1172 flush:
1173 if (bufferpos & (ALLOC_GR - 1)) continue;
1175 if (callback) {
1176 buffer[bufferpos] = 0;
1177 callback(callback_data, buffer, bufferpos);
1178 bufferpos = 0;
1179 } else {
1180 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1181 if (!new) {
1182 mem_free(buffer);
1183 return NULL;
1185 buffer = new;
1188 #undef PUTC
1191 /* Say bye */
1193 buffer[bufferpos] = 0;
1194 if (length) *length = bufferpos;
1196 if (callback) {
1197 if (bufferpos) callback(callback_data, buffer, bufferpos);
1198 mem_free(buffer);
1199 return NULL;
1200 } else {
1201 return buffer;
1206 #ifndef USE_FASTFIND
1208 get_cp_index(unsigned char *name)
1210 int i, a;
1211 int syscp = 0;
1213 if (!strcasecmp(name, "System")) {
1214 #if HAVE_LANGINFO_CODESET
1215 name = nl_langinfo(CODESET);
1216 syscp = SYSTEM_CHARSET_FLAG;
1217 #else
1218 name = "us-ascii";
1219 #endif
1222 for (i = 0; codepages[i].name; i++) {
1223 for (a = 0; codepages[i].aliases[a]; a++) {
1224 /* In the past, we looked for the longest substring
1225 * in all the names; it is way too expensive, though:
1227 * % cumulative self self total
1228 * time seconds seconds calls us/call us/call name
1229 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1231 * Anything called from redraw_screen() is in fact
1232 * relatively expensive, even if it's called just
1233 * once. So we will do a simple strcasecmp() here.
1236 if (!strcasecmp(name, codepages[i].aliases[a]))
1237 return i | syscp;
1241 if (syscp) {
1242 return get_cp_index("us-ascii") | syscp;
1243 } else {
1244 return -1;
1248 #else
/* Cursor of charsets_list_next(): current codepage and current alias
 * within that codepage. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the internal list cursor to the first alias of the first
 * codepage. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
1261 /* Returns a pointer to a struct that contains current key and data pointers
1262 * and increment internal pointer. It returns NULL when key is NULL. */
1263 struct fastfind_key_value *
1264 charsets_list_next(void)
1266 static struct fastfind_key_value kv;
1268 if (!codepages[i_name].name) return NULL;
1270 kv.key = codepages[i_name].aliases[i_alias];
1271 kv.data = (void *) &codepages[i_name]; /* cast away const */
1273 if (codepages[i_name].aliases[i_alias + 1])
1274 i_alias++;
1275 else {
1276 i_name++;
1277 i_alias = 0;
1280 return &kv;
1283 static struct fastfind_index ff_charsets_index
1284 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1286 /* It searchs for a charset named @name or one of its aliases and
1287 * returns index for it or -1 if not found. */
1289 get_cp_index(unsigned char *name)
1291 const struct codepage_desc *codepage;
1292 int syscp = 0;
1294 if (!strcasecmp(name, "System")) {
1295 #if HAVE_LANGINFO_CODESET
1296 name = nl_langinfo(CODESET);
1297 syscp = SYSTEM_CHARSET_FLAG;
1298 #else
1299 name = "us-ascii";
1300 #endif
1303 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1304 if (codepage) {
1305 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1306 return (codepage - codepages) | syscp;
1308 } else if (syscp) {
1309 return get_cp_index("us-ascii") | syscp;
1311 } else {
1312 return -1;
1316 #endif /* USE_FASTFIND */
/* Build the fastfind index of charset names.  Does nothing when
 * USE_FASTFIND is not defined. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the fastfind index of charset names.  Does nothing when
 * USE_FASTFIND is not defined. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1334 unsigned char *
1335 get_cp_name(int cp_index)
1337 if (cp_index < 0) return "none";
1338 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1340 return codepages[cp_index].name;
1343 unsigned char *
1344 get_cp_mime_name(int cp_index)
1346 if (cp_index < 0) return "none";
1347 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1348 if (!codepages[cp_index].aliases) return NULL;
1350 return codepages[cp_index].aliases[0];
1354 is_cp_utf8(int cp_index)
1356 cp_index &= ~SYSTEM_CHARSET_FLAG;
1357 return codepages[cp_index].table == table_utf8;