UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR.
[elinks.git] / src / intl / charsets.c
blob11aedf286ad3bce4b345a8fb15012af199c72333
1 /* Charsets convertor */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #if HAVE_LANGINFO_CODESET
8 #include <langinfo.h>
9 #endif
11 #include <ctype.h>
12 #include <stdlib.h>
13 #if HAVE_WCTYPE_H
14 #include <wctype.h>
15 #endif
17 #include "elinks.h"
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
31 struct table_entry {
32 unsigned char c;
33 unicode_val_T u;
/* Description of one codepage known to the convertor. */
struct codepage_desc {
	/* Name handed out by get_cp_name(). */
	unsigned char *name;
	/* NULL-terminated list of aliases; the first one is handed out by
	 * get_cp_mime_name(). */
	unsigned char **aliases;
	/* Byte <-> Unicode rows, ended by a row whose byte is zero. */
	struct table_entry *table;
};
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
/* Table of one-character, NUL-terminated strings indexed by byte value.
 * It is used wherever a single byte has to be returned as a string, e.g.
 * u2cp_() returns strings[u] for u < 128.
 * NOTE(review): the slots for 027 and 037 hold "\033" instead of their own
 * value; this looks deliberate but is not explained here - confirm before
 * changing. */
47 static char strings[256][2] = {
48 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
49 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
50 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
51 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
52 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
53 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
54 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
55 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
56 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
57 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
58 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
59 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
60 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
61 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
62 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
63 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
64 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
65 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
66 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
67 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
68 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
69 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
70 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
71 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
72 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
73 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
74 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
75 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
76 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
77 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
78 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
79 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
82 static void
83 free_translation_table(struct conv_table *p)
85 int i;
87 for (i = 0; i < 256; i++)
88 if (p[i].t)
89 free_translation_table(p[i].u.tbl);
91 mem_free(p);
/* Placeholder ("*") stored for characters that have no translation.  Table
 * code also compares against this pointer to recognise slots that have not
 * been assigned a real string yet. */
94 static unsigned char *no_str = "*";
96 static void
97 new_translation_table(struct conv_table *p)
99 int i;
101 for (i = 0; i < 256; i++)
102 if (p[i].t)
103 free_translation_table(p[i].u.tbl);
104 for (i = 0; i < 128; i++) {
105 p[i].t = 0;
106 p[i].u.str = strings[i];
108 for (; i < 256; i++) {
109 p[i].t = 0;
110 p[i].u.str = no_str;
/* Binary search in the sorted array @table, which holds @entries elements,
 * for the element whose member @entry equals @key.  @result is set to the
 * index of the match, or to -1 when there is none.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. in an unbraced if/else), and @result is only
 * written once the outcome is known. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
do {									\
	long bs_lo_ = 0, bs_hi_ = (entries) - 1;			\
									\
	while (bs_lo_ <= bs_hi_) {					\
		long bs_mid_ = (bs_lo_ + bs_hi_) / 2;			\
									\
		if ((table)[bs_mid_].entry == (key)) {			\
			(result) = bs_mid_;				\
			break;						\
		}							\
		if ((table)[bs_mid_].entry > (key))			\
			bs_hi_ = bs_mid_ - 1;				\
		else							\
			bs_lo_ = bs_mid_ + 1;				\
	}								\
	/* The range collapsed without a match. */			\
	if (bs_lo_ > bs_hi_) (result) = -1;				\
} while (0)
/* Substitutes for the Unicode values 0x80..0x9f, indexed by (u - 0x80).
 * u2cp_() converts the substitute instead of the original value; a zero
 * entry means there is no substitute and makes u2cp_() return NULL. */
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that get_cp_index() ORs into a codepage index when the codepage was
 * looked up through the "System" name.  The functions in this file mask it
 * off before using the index with codepages[]. */
137 #define SYSTEM_CHARSET_FLAG 128
139 unsigned char *
140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
142 int j;
143 int s;
145 if (u < 128) return strings[u];
147 to &= ~SYSTEM_CHARSET_FLAG;
149 #ifdef CONFIG_UTF_8
150 if (codepages[to].table == table_utf_8)
151 return encode_utf_8(u);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
156 if (u == 0xad) return "";
158 if (u < 0xa0) {
159 unicode_val_T strange = strange_chars[u - 0x80];
161 if (!strange) return NULL;
162 return u2cp_(strange, to, no_nbsp_hack);
166 for (j = 0; codepages[to].table[j].c; j++)
167 if (codepages[to].table[j].u == u)
168 return strings[codepages[to].table[j].c];
170 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
171 if (s != -1) return unicode_7b[s].s;
173 return no_str;
176 static unsigned char utf_buffer[7];
178 #ifdef CONFIG_UTF_8
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u)
181 #else
182 static unsigned char *
183 encode_utf_8(unicode_val_T u)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer, 0, 7);
188 if (u < 0x80)
189 utf_buffer[0] = u;
190 else if (u < 0x800)
191 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
192 utf_buffer[1] = 0x80 | (u & 0x3f);
193 else if (u < 0x10000)
194 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
195 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
196 utf_buffer[2] = 0x80 | (u & 0x3f);
197 else if (u < 0x200000)
198 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
199 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
200 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
201 utf_buffer[3] = 0x80 | (u & 0x3f);
202 else if (u < 0x4000000)
203 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
204 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
205 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
206 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
207 utf_buffer[4] = 0x80 | (u & 0x3f);
208 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
209 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
210 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
211 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
212 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
213 utf_buffer[5] = 0x80 | (u & 0x3f);
215 return utf_buffer;
218 #ifdef CONFIG_UTF_8
219 /* Length in bytes of a UTF-8 character, indexed by its first byte. Bytes
220 * that cannot start a character (0x80-0xbf, 0xfe, 0xff) get length 1. */
221 static char utf8char_len_tab[256] = {
222 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
223 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
224 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
225 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
226 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
227 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
228 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
229 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
232 inline int utf8charlen(const unsigned char *p)
234 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters of the NUL-terminated string *@str.
 * Counting stops at the terminator, or earlier when the next character
 * would extend past it.  On return *@str points at the place where
 * counting stopped. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int len = utf8charlen(pos);

		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
/* A byte with the top bit clear is a complete one-byte character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* A byte starts a character when it is single or has its two top bits set. */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)

/* Step back @pos characters from @current without going before @start and
 * return the resulting pointer.  Returns NULL when @current or @start is
 * NULL or @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	unsigned char *cursor = current;
	int remaining = pos;

	if (!cursor || !start || remaining < 0)
		return NULL;

	while (remaining > 0) {
		if (cursor == start)
			break;

		/* Only bytes that start a character count as a step. */
		cursor--;
		if (utf8_islead(*cursor))
			remaining--;
	}

	return cursor;
}
271 /* Count number of standard terminal cells needed for displaying UTF-8
272 * character. */
274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
276 unicode_val_T u;
278 if (end == NULL)
279 end = strchr(utf8_char, '\0');
281 if(!utf8_char || !end)
282 return -1;
284 u = utf_8_to_unicode(&utf8_char, end);
286 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying the
 * UTF-8 text from @string up to @end.
 *
 * When @end is NULL the text is taken to be NUL-terminated.  A character
 * that would extend past @end is not counted.  Returns -1 when @string is
 * NULL or a character's width cannot be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Reject NULL before the pointer is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	} while (1);

	return cells;
}
/* Count the number of UTF-8 characters in the text from @string up to
 * @end.
 *
 * When @end is NULL the text is taken to be NUL-terminated.  A character
 * that would extend past @end is not counted.  Returns -1 when @string is
 * NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Reject NULL before the pointer is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	} while (1);

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that can be
 * displayed within @max_cells terminal cells.
 *
 * When @end is NULL the text is taken to be NUL-terminated.  The result
 * never exceeds @end - @string.  Returns -1 when @string is NULL or a
 * character's width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Reject NULL before the pointer is handed to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	/* Stop as soon as @end is reached: whatever is measured there, the
	 * answer is @end - @string, so the byte at @end need not be read. */
	while (string + bytes < end) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > (unsigned int) max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);
	}

	/* The last character may have claimed bytes beyond @end. */
	if (string + bytes > end)
		bytes = end - string;

	return bytes;
}
380 * Find out number of standard terminal collumns needed for displaying symbol
381 * (glyph) which represents Unicode character c.
382 * TODO: Use wcwidth when it is available.
384 * @return 2 for double-width glyph, 1 for others.
385 * TODO: May be extended to return 0 for zero-width glyphs
386 * (like composing, maybe unprintable too).
388 inline int
389 unicode_to_cell(unicode_val_T c)
391 if (c >= 0x1100
392 && (c <= 0x115f /* Hangul Jamo */
393 || c == 0x2329
394 || c == 0x232a
395 || (c >= 0x2e80 && c <= 0xa4cf
396 && c != 0x303f) /* CJK ... Yi */
397 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
398 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
399 Ideographs */
400 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
401 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
402 || (c >= 0xffe0 && c <= 0xffe6)
403 || (c >= 0x20000 && c <= 0x2fffd)
404 || (c >= 0x30000 && c <= 0x3fffd)))
405 return 2;
407 return 1;
410 /* Fold the case of a Unicode character, so that hotkeys in labels can
411 * be compared case-insensitively. It is unspecified whether the
412 * result will be in upper or lower case. */
413 unicode_val_T
414 unicode_fold_label_case(unicode_val_T c)
416 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
417 return towlower(c);
418 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
419 /* For now, this supports only ASCII. It would be possible to
420 * use code generated from CaseFolding.txt of Unicode if the
421 * acknowledgements required by http://www.unicode.org/copyright.html
422 * were added to associated documentation of ELinks. */
423 if (c >= 0x41 && c <= 0x5A)
424 return c + 0x20;
425 else
426 return c;
427 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
430 inline unicode_val_T
431 utf_8_to_unicode(unsigned char **string, unsigned char *end)
433 unsigned char *str = *string;
434 unicode_val_T u;
435 int length;
437 length = utf8char_len_tab[str[0]];
439 if (str + length > end) {
440 return UCS_NO_CHAR;
443 switch (length) {
444 case 1:
445 u = str[0];
446 break;
447 case 2:
448 u = (str[0] & 0x1f) << 6;
449 u += (str[1] & 0x3f);
450 break;
451 case 3:
452 u = (str[0] & 0x0f) << 12;
453 u += ((str[1] & 0x3f) << 6);
454 u += (str[2] & 0x3f);
455 break;
456 case 4:
457 u = (str[0] & 0x0f) << 18;
458 u += ((str[1] & 0x3f) << 12);
459 u += ((str[2] & 0x3f) << 6);
460 u += (str[3] & 0x3f);
461 break;
462 case 5:
463 u = (str[0] & 0x0f) << 24;
464 u += ((str[1] & 0x3f) << 18);
465 u += ((str[2] & 0x3f) << 12);
466 u += ((str[3] & 0x3f) << 6);
467 u += (str[4] & 0x3f);
468 break;
469 case 6:
470 default:
471 u = (str[0] & 0x01) << 30;
472 u += ((str[1] & 0x3f) << 24);
473 u += ((str[2] & 0x3f) << 18);
474 u += ((str[3] & 0x3f) << 12);
475 u += ((str[4] & 0x3f) << 6);
476 u += (str[5] & 0x3f);
477 break;
479 *string = str + length;
480 return u;
482 #endif /* CONFIG_UTF_8 */
484 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
485 static unicode_val_T
486 cp2u_shared(const struct codepage_desc *from, unsigned char c)
488 int j;
490 for (j = 0; from->table[j].c; j++)
491 if (from->table[j].c == c)
492 return from->table[j].u;
494 return UCS_REPLACEMENT_CHARACTER;
497 /* Slow algorithm, used for converting input from the terminal. */
498 unicode_val_T
499 cp2u(int from, unsigned char c)
501 from &= ~SYSTEM_CHARSET_FLAG;
503 /* UTF-8 is a multibyte codepage and cannot be handled with
504 * this function. */
505 assert(codepages[from].table != table_utf_8);
506 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
508 if (c < 0x80) return c;
509 else return cp2u_shared(&codepages[from], c);
512 /* This slow and ugly code is used by the terminal utf_8_io */
513 unsigned char *
514 cp2utf_8(int from, int c)
516 from &= ~SYSTEM_CHARSET_FLAG;
518 if (codepages[from].table == table_utf_8 || c < 128)
519 return strings[c];
521 return encode_utf_8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF_8
/* Read one character of codepage @codepage from *@string, advance *@string
 * past it and return its Unicode value.  UTF-8 text is decoded with
 * utf_8_to_unicode(); for other codepages one byte is consumed.  Returns
 * UCS_NO_CHAR when *@string has reached @end. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unsigned char *pos;

	if (is_cp_utf8(codepage))
		return utf_8_to_unicode(string, end);

	pos = *string;
	if (pos >= end)
		return UCS_NO_CHAR;

	*string = pos + 1;
	return cp2u(codepage, *pos);
}
#endif /* CONFIG_UTF_8 */
543 static void
544 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
546 unsigned char *p = encode_utf_8(u);
548 while (p[1]) {
549 if (ct[*p].t) ct = ct[*p].u.tbl;
550 else {
551 struct conv_table *nct;
553 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
554 if_assert_failed return;
556 nct = mem_calloc(256, sizeof(*nct));
557 if (!nct) return;
558 new_translation_table(nct);
559 ct[*p].t = 1;
560 ct[*p].u.tbl = nct;
561 ct = nct;
563 p++;
566 assertm(!ct[*p].t, "bad utf encoding #2");
567 if_assert_failed return;
569 if (ct[*p].u.str == no_str)
570 ct[*p].u.str = str;
573 struct conv_table utf_table[256];
574 int utf_table_init = 1;
576 static void
577 free_utf_table(void)
579 int i;
581 for (i = 128; i < 256; i++)
582 mem_free(utf_table[i].u.str);
585 static struct conv_table *
586 get_translation_table_to_utf_8(int from)
588 int i;
589 static int lfr = -1;
591 if (from == -1) return NULL;
592 from &= ~SYSTEM_CHARSET_FLAG;
593 if (from == lfr) return utf_table;
594 lfr = from;
595 if (utf_table_init)
596 memset(utf_table, 0, sizeof(utf_table)),
597 utf_table_init = 0;
598 else
599 free_utf_table();
601 for (i = 0; i < 128; i++)
602 utf_table[i].u.str = strings[i];
604 if (codepages[from].table == table_utf_8) {
605 for (i = 128; i < 256; i++)
606 utf_table[i].u.str = stracpy(strings[i]);
607 return utf_table;
610 for (i = 128; i < 256; i++)
611 utf_table[i].u.str = NULL;
613 for (i = 0; codepages[from].table[i].c; i++) {
614 unicode_val_T u = codepages[from].table[i].u;
616 if (!utf_table[codepages[from].table[i].c].u.str)
617 utf_table[codepages[from].table[i].c].u.str =
618 stracpy(encode_utf_8(u));
621 for (i = 128; i < 256; i++)
622 if (!utf_table[i].u.str)
623 utf_table[i].u.str = stracpy(no_str);
625 return utf_table;
628 struct conv_table table[256];
629 static int first = 1;
631 void
632 free_conv_table(void)
634 if (!utf_table_init) free_utf_table();
635 if (first) {
636 memset(table, 0, sizeof(table));
637 first = 0;
639 new_translation_table(table);
643 struct conv_table *
644 get_translation_table(int from, int to)
646 static int lfr = -1;
647 static int lto = -1;
649 from &= ~SYSTEM_CHARSET_FLAG;
650 to &= ~SYSTEM_CHARSET_FLAG;
651 if (first) {
652 memset(table, 0, sizeof(table));
653 first = 0;
655 if (/*from == to ||*/ from == -1 || to == -1)
656 return NULL;
657 if (codepages[to].table == table_utf_8)
658 return get_translation_table_to_utf_8(from);
659 if (from == lfr && to == lto)
660 return table;
661 lfr = from;
662 lto = to;
663 new_translation_table(table);
665 if (codepages[from].table == table_utf_8) {
666 int i;
668 for (i = 0; codepages[to].table[i].c; i++)
669 add_utf_8(table, codepages[to].table[i].u,
670 strings[codepages[to].table[i].c]);
672 for (i = 0; unicode_7b[i].x != -1; i++)
673 if (unicode_7b[i].x >= 0x80)
674 add_utf_8(table, unicode_7b[i].x,
675 unicode_7b[i].s);
677 } else {
678 int i;
680 for (i = 128; i < 256; i++) {
681 int j;
683 for (j = 0; codepages[from].table[j].c; j++) {
684 if (codepages[from].table[j].c == i) {
685 unsigned char *u;
687 u = u2cp(codepages[from].table[j].u, to);
688 if (u) table[i].u.str = u;
689 break;
695 return table;
/* Compare the first @l2 bytes of @s1 (which need not be NUL-terminated)
 * with the NUL-terminated string @s2.
 *
 * Returns 1 when @s1 sorts after @s2, -1 when it sorts before it or is a
 * proper prefix of it, and 0 when @s2 consists of exactly those @l2 bytes. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2 > 0; s1++, s2++, l2--) {
		/* @s2 ended while @s1 still has bytes left, so @s1 is the
		 * longer string.  Stopping here also keeps us from reading
		 * past the terminator of @s2 when @s1 contains a NUL. */
		if (!*s2) return 1;

		if (*s1 > *s2) return 1;
		if (*s1 < *s2) return -1;
	}

	/* All @l2 bytes matched; @s2 must end here for equality. */
	return *s2 ? -1 : 0;
}
712 /* Entity cache debugging purpose. */
713 #if 0
714 #define DEBUG_ENTITY_CACHE
715 #else
716 #undef DEBUG_ENTITY_CACHE
717 #endif
/* One remembered result of get_entity_string(). */
struct entity_cache {
	unsigned int hits;	/* how often this entry was found */
	int strlen;		/* length of @str */
	int encoding;		/* codepage the result was computed for */
	unsigned char *result;	/* cached return value, may be NULL */
	unsigned char str[20]; /* Suffice in any case. */
};

/* qsort() comparison function that orders entity_cache entries by
 * descending hit count.  It has the exact prototype qsort() expects, so it
 * can be passed without casting the function pointer. */
static int
hits_cmp(const void *a_, const void *b_)
{
	const struct entity_cache *a = a_;
	const struct entity_cache *b = b_;

	if (a->hits == b->hits) return 0;
	if (a->hits > b->hits) return -1;
	else return 1;
}
735 static int
736 compare_entities(const void *key_, const void *element_)
738 struct string *key = (struct string *) key_;
739 struct entity *element = (struct entity *) element_;
740 int length = key->length;
741 unsigned char *first = key->source;
742 unsigned char *second = element->s;
744 return xxstrcmp(first, second, length);
747 unsigned char *
748 get_entity_string(const unsigned char *str, const int strlen, int encoding)
750 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
751 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
752 will go in [0] table */
753 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
754 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
755 static int first_time = 1;
756 unsigned int slen = 0;
757 unsigned char *result = NULL;
759 if (strlen <= 0) return NULL;
761 #ifdef CONFIG_UTF_8
762 /* TODO: caching UTF-8 */
763 encoding &= ~SYSTEM_CHARSET_FLAG;
764 if (codepages[encoding].table == table_utf_8)
765 goto skip;
766 #endif /* CONFIG_UTF_8 */
768 if (first_time) {
769 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
770 first_time = 0;
773 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
774 * + google + slashdot + websites that result from a search for test on google,
775 * + various ones) show a quite impressive improvment:
776 * Top ten is:
777 * 0: hits=2459 l=4 st='nbsp'
778 * 1: hits=2152 l=6 st='eacute'
779 * 2: hits=235 l=6 st='egrave'
780 * 3: hits=136 l=6 st='agrave'
781 * 4: hits=100 l=3 st='amp'
782 * 5: hits=40 l=5 st='laquo'
783 * 6: hits=8 l=4 st='copy'
784 * 7: hits=5 l=2 st='gt'
785 * 8: hits=2 l=2 st='lt'
786 * 9: hits=1 l=6 st='middot'
788 * Most of the time cache hit ratio is near 95%.
790 * A long test shows: 15186 hits vs. 24 misses and mean iteration
791 * count is kept < 2 (worst case 1.58). Not so bad ;)
793 * --Zas */
795 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
796 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
798 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
799 int i;
801 for (i = 0; i < nb_entity_cache[slen]; i++) {
802 if (entity_cache[slen][i].encoding == encoding
803 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
804 #ifdef DEBUG_ENTITY_CACHE
805 static double total_iter = 0;
806 static unsigned long hit_count = 0;
808 total_iter += i + 1;
809 hit_count++;
810 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
811 #endif
812 if (entity_cache[slen][i].hits < (unsigned int) ~0)
813 entity_cache[slen][i].hits++;
814 return entity_cache[slen][i].result;
817 #ifdef DEBUG_ENTITY_CACHE
818 fprintf(stderr, "miss\n");
819 #endif
821 #ifdef CONFIG_UTF_8
822 skip:
823 #endif /* CONFIG_UTF_8 */
824 if (*str == '#') { /* Numeric entity. */
825 int l = (int) strlen;
826 unsigned char *st = (unsigned char *) str;
827 unicode_val_T n = 0;
829 if (l == 1) goto end; /* &#; ? */
830 st++, l--;
831 if ((*st | 32) == 'x') { /* Hexadecimal */
833 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
834 st++, l--;
835 do {
836 unsigned char c = (*(st++) | 32);
838 if (isdigit(c))
839 n = (n << 4) | (c - '0');
840 else if (isxdigit(c))
841 n = (n << 4) | (c - 'a' + 10);
842 else
843 goto end; /* Bad char. */
844 } while (--l);
845 } else { /* Decimal */
846 if (l > 10) goto end; /* 4294967295 max. */
847 do {
848 unsigned char c = *(st++);
850 if (isdigit(c))
851 n = n * 10 + c - '0';
852 else
853 goto end; /* Bad char. */
854 /* Limit to 0xFFFFFFFF. */
855 if (n >= (unicode_val_T) 0xFFFFFFFFu)
856 goto end;
857 } while (--l);
860 result = u2cp(n, encoding);
862 #ifdef DEBUG_ENTITY_CACHE
863 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
864 #endif
865 } else { /* Text entity. */
866 struct string key = INIT_STRING((unsigned char *) str, strlen);
867 struct entity *element = bsearch((void *) &key, entities,
868 N_ENTITIES,
869 sizeof(*element),
870 compare_entities);
872 if (element) result = u2cp(element->c, encoding);
875 #ifdef CONFIG_UTF_8
876 if (codepages[encoding].table == table_utf_8) {
877 return result;
879 #endif /* CONFIG_UTF_8 */
880 end:
881 /* Take care of potential buffer overflow. */
882 if (strlen < sizeof(entity_cache[slen][0].str)) {
883 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
885 /* Copy new entry to cache. */
886 ece->hits = 1;
887 ece->strlen = strlen;
888 ece->encoding = encoding;
889 ece->result = result;
890 memcpy(ece->str, str, strlen);
891 ece->str[strlen] = '\0';
893 /* Increment number of cache entries if possible. */
894 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
896 #ifdef DEBUG_ENTITY_CACHE
897 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
898 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
900 #endif
902 /* Sort entries by hit order. */
903 if (nb_entity_cache[slen] > 1)
904 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
905 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
907 #ifdef DEBUG_ENTITY_CACHE
909 unsigned int i;
911 fprintf(stderr, "- Cache entries [%u] -\n", slen);
912 for (i = 0; i < nb_entity_cache[slen] ; i++)
913 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
914 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
915 entity_cache[slen][i].str);
916 fprintf(stderr, "-----------------\n");
918 #endif
920 return result;
/* Convert the @charslen bytes at @chars through @convert_table and, unless
 * @mode is CSM_FORM or CSM_NONE, replace entity references ("&name;") by
 * the string get_entity_string() returns for the codepage @cp.
 *
 * Without @callback the converted text is returned in a newly allocated,
 * NUL-terminated buffer.  @length, when not NULL, receives its length on
 * the conversion path; the shortcut that merely copies @chars does not
 * touch @length.  NULL is returned when an allocation fails.
 *
 * With @callback the converted text is delivered to it in pieces together
 * with @callback_data, and the function itself returns NULL. */
923 unsigned char *
924 convert_string(struct conv_table *convert_table,
925 unsigned char *chars, int charslen, int cp,
926 enum convert_string_mode mode, int *length,
927 void (*callback)(void *data, unsigned char *buf, int buflen),
928 void *callback_data)
930 unsigned char *buffer;
931 int bufferpos = 0;
932 int charspos = 0;
/* Shortcut: with no table and no '&' in the input there is nothing to
 * convert, so the input is passed on or copied as it is. */
934 if (!convert_table && !memchr(chars, '&', charslen)) {
935 if (callback) {
936 if (charslen) callback(callback_data, chars, charslen);
937 return NULL;
938 } else {
939 return memacpy(chars, charslen);
943 /* Buffer allocation */
945 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
946 if (!buffer) return NULL;
948 /* Iterate ;-) */
950 while (charspos < charslen) {
951 unsigned char *translit;
/* PUTC copies the current input byte unchanged and jumps to the "flush"
 * label inside the output loop below, so that buffer growth is handled in
 * one place.  It leaves translit empty, which ends that loop. */
953 #define PUTC do { \
954 buffer[bufferpos++] = chars[charspos++]; \
955 translit = ""; \
956 goto flush; \
957 } while (0)
959 if (chars[charspos] != '&') {
960 struct conv_table *t;
961 int i;
963 if (chars[charspos] < 128 || !convert_table) PUTC;
965 t = convert_table;
966 i = charspos;
/* Follow nested tables for multi-byte input until a string slot is
 * reached; a sequence cut off by the end of input is copied as it is. */
968 while (t[chars[i]].t) {
969 t = t[chars[i++]].u.tbl;
970 if (i >= charslen) PUTC;
973 translit = t[chars[i]].u.str;
974 charspos = i + 1;
976 } else if (mode == CSM_FORM || mode == CSM_NONE) {
977 PUTC;
979 } else {
980 int start = charspos + 1;
981 int i = start;
/* Collect the candidate entity name: letters, digits and '#'. */
983 while (i < charslen
984 && (isasciialpha(chars[i])
985 || isdigit(chars[i])
986 || (chars[i] == '#')))
987 i++;
989 /* This prevents bug 213: we were expanding "entities"
990 * in URL query strings. */
991 /* XXX: But this disables &nbsp&nbsp usage, which
992 * appears to be relatively common! --pasky */
/* NOTE(review): when the name runs up to the end of the input, chars[i]
 * below is chars[charslen]; presumably callers guarantee that this byte
 * is readable - confirm. */
993 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
994 && i > start
995 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
996 translit = get_entity_string(&chars[start], i - start,
997 cp);
998 if (chars[i] != ';') {
999 /* Eat &nbsp &nbsp<foo> happily, but
1000 * pull back from the character after
1001 * entity string if it is not the valid
1002 * terminator. */
1003 i--;
1006 if (!translit) PUTC;
1007 charspos = i + (i < charslen);
1008 } else PUTC;
/* Append the replacement string: nothing for an empty one, a single byte
 * through the shortcut below, otherwise byte by byte. */
1011 if (!translit[0]) continue;
1013 if (!translit[1]) {
1014 buffer[bufferpos++] = translit[0];
1015 translit = "";
1016 goto flush;
1019 while (*translit) {
1020 unsigned char *new;
1022 buffer[bufferpos++] = *(translit++);
1023 flush:
/* Act only when bufferpos has reached a multiple of ALLOC_GR (presumably
 * a power of two): hand the filled buffer to the callback, or enlarge it
 * by another ALLOC_GR bytes. */
1024 if (bufferpos & (ALLOC_GR - 1)) continue;
1026 if (callback) {
1027 buffer[bufferpos] = 0;
1028 callback(callback_data, buffer, bufferpos);
1029 bufferpos = 0;
1030 } else {
1031 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1032 if (!new) {
1033 mem_free(buffer);
1034 return NULL;
1036 buffer = new;
1039 #undef PUTC
1042 /* Say bye */
1044 buffer[bufferpos] = 0;
1045 if (length) *length = bufferpos;
1047 if (callback) {
1048 if (bufferpos) callback(callback_data, buffer, bufferpos);
1049 mem_free(buffer);
1050 return NULL;
1051 } else {
1052 return buffer;
1057 #ifndef USE_FASTFIND
1059 get_cp_index(unsigned char *name)
1061 int i, a;
1062 int syscp = 0;
1064 if (!strcasecmp(name, "System")) {
1065 #if HAVE_LANGINFO_CODESET
1066 name = nl_langinfo(CODESET);
1067 syscp = SYSTEM_CHARSET_FLAG;
1068 #else
1069 name = "us-ascii";
1070 #endif
1073 for (i = 0; codepages[i].name; i++) {
1074 for (a = 0; codepages[i].aliases[a]; a++) {
1075 /* In the past, we looked for the longest substring
1076 * in all the names; it is way too expensive, though:
1078 * % cumulative self self total
1079 * time seconds seconds calls us/call us/call name
1080 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1082 * Anything called from redraw_screen() is in fact
1083 * relatively expensive, even if it's called just
1084 * once. So we will do a simple strcasecmp() here.
1087 if (!strcasecmp(name, codepages[i].aliases[a]))
1088 return i | syscp;
1092 if (syscp) {
1093 return get_cp_index("us-ascii") | syscp;
1094 } else {
1095 return -1;
1099 #else
/* Cursor of the alias iteration: index of the current codepage and of the
 * current alias within it. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Reset internal list pointer, so that the next charsets_list_next() call
 * starts again with the first alias of the first codepage. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
1112 /* Returns a pointer to a struct that contains current key and data pointers
1113 * and increment internal pointer. It returns NULL when key is NULL. */
1114 struct fastfind_key_value *
1115 charsets_list_next(void)
1117 static struct fastfind_key_value kv;
1119 if (!codepages[i_name].name) return NULL;
1121 kv.key = codepages[i_name].aliases[i_alias];
1122 kv.data = &codepages[i_name];
1124 if (codepages[i_name].aliases[i_alias + 1])
1125 i_alias++;
1126 else {
1127 i_name++;
1128 i_alias = 0;
1131 return &kv;
/* Fastfind index used for charset lookup.  It is set up with
 * charsets_list_reset() and charsets_list_next() as its iteration
 * callbacks and is searched by get_cp_index(). */
1134 static struct fastfind_index ff_charsets_index
1135 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1137 /* It searchs for a charset named @name or one of its aliases and
1138 * returns index for it or -1 if not found. */
1140 get_cp_index(unsigned char *name)
1142 struct codepage_desc *codepage;
1143 int syscp = 0;
1145 if (!strcasecmp(name, "System")) {
1146 #if HAVE_LANGINFO_CODESET
1147 name = nl_langinfo(CODESET);
1148 syscp = SYSTEM_CHARSET_FLAG;
1149 #else
1150 name = "us-ascii";
1151 #endif
1154 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1155 if (codepage) {
1156 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1157 return (codepage - codepages) | syscp;
1159 } else if (syscp) {
1160 return get_cp_index("us-ascii") | syscp;
1162 } else {
1163 return -1;
1167 #endif /* USE_FASTFIND */
/* Prepare charset lookup.  With USE_FASTFIND this builds the fastfind
 * index over the charset aliases; otherwise there is nothing to do. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	struct fastfind_index *index = &ff_charsets_index;

	fastfind_index(index, FF_COMPRESS);
#endif
}
/* Undo init_charsets_lookup().  With USE_FASTFIND this releases the
 * fastfind index; otherwise there is nothing to do. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	struct fastfind_index *index = &ff_charsets_index;

	fastfind_done(index);
#endif
}
1185 unsigned char *
1186 get_cp_name(int cp_index)
1188 if (cp_index < 0) return "none";
1189 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1191 return codepages[cp_index].name;
1194 unsigned char *
1195 get_cp_mime_name(int cp_index)
1197 if (cp_index < 0) return "none";
1198 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1199 if (!codepages[cp_index].aliases) return NULL;
1201 return codepages[cp_index].aliases[0];
1205 is_cp_utf8(int cp_index)
1207 cp_index &= ~SYSTEM_CHARSET_FLAG;
1208 return codepages[cp_index].table == table_utf_8;