UTF-8: New function cp_to_unicode().
[elinks.git] / src / intl / charsets.c
blob5809403e8668045e54bfc504e3823c9bfcd5bcda
1 /* Charsets convertor */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #if HAVE_LANGINFO_CODESET
8 #include <langinfo.h>
9 #endif
11 #include <ctype.h>
12 #include <stdlib.h>
13 #if HAVE_WCTYPE_H
14 #include <wctype.h>
15 #endif
17 #include "elinks.h"
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
31 struct table_entry {
32 unsigned char c;
33 unicode_val_T u;
/* Description of one supported codepage. */
struct codepage_desc {
	/* Name handed out by get_cp_name(). */
	unsigned char *name;
	/* NULL-terminated alias list; aliases[0] is what get_cp_mime_name()
	 * returns. */
	unsigned char **aliases;
	/* Mapping rows, terminated by a row with c == 0. */
	struct table_entry *table;
};
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
/* strings[b] is a ready-made, NUL-terminated one-byte string for byte value
 * b, so single characters can be returned as strings without allocating.
 * NOTE(review): slots 027 and 037 hold "\033" instead of "\027" / "\037";
 * kept exactly as found -- confirm whether that is intentional. */
static char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
82 static void
83 free_translation_table(struct conv_table *p)
85 int i;
87 for (i = 0; i < 256; i++)
88 if (p[i].t)
89 free_translation_table(p[i].u.tbl);
91 mem_free(p);
94 static unsigned char *no_str = "*";
96 static void
97 new_translation_table(struct conv_table *p)
99 int i;
101 for (i = 0; i < 256; i++)
102 if (p[i].t)
103 free_translation_table(p[i].u.tbl);
104 for (i = 0; i < 128; i++) {
105 p[i].t = 0;
106 p[i].u.str = strings[i];
108 for (; i < 256; i++) {
109 p[i].t = 0;
110 p[i].u.str = no_str;
/* Binary search over the first @entries elements of @table, which must be
 * sorted ascending by member @entry.  @result receives the index of the
 * element whose @entry equals @key, or -1 when there is none.  The arguments
 * are evaluated more than once. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	(result) = -1;							\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((table)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		}							\
		if ((table)[_mid].entry > (key))			\
			_hi = _mid - 1;					\
		else							\
			_lo = _mid + 1;					\
	}								\
}
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that get_cp_index() ORs into a codepage index when the charset was
 * requested as "System".  It is masked off before the index is used to
 * address codepages[]. */
#define SYSTEM_CHARSET_FLAG 0x80
139 unsigned char *
140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
142 int j;
143 int s;
145 if (u < 128) return strings[u];
147 to &= ~SYSTEM_CHARSET_FLAG;
149 #ifdef CONFIG_UTF_8
150 if (codepages[to].table == table_utf_8)
151 return encode_utf_8(u);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
156 if (u == 0xad) return "";
158 if (u < 0xa0) {
159 unicode_val_T strange = strange_chars[u - 0x80];
161 if (!strange) return NULL;
162 return u2cp_(strange, to, no_nbsp_hack);
166 for (j = 0; codepages[to].table[j].c; j++)
167 if (codepages[to].table[j].u == u)
168 return strings[codepages[to].table[j].c];
170 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
171 if (s != -1) return unicode_7b[s].s;
173 return no_str;
176 static unsigned char utf_buffer[7];
178 #ifdef CONFIG_UTF_8
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u)
181 #else
182 static unsigned char *
183 encode_utf_8(unicode_val_T u)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer, 0, 7);
188 if (u < 0x80)
189 utf_buffer[0] = u;
190 else if (u < 0x800)
191 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
192 utf_buffer[1] = 0x80 | (u & 0x3f);
193 else if (u < 0x10000)
194 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
195 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
196 utf_buffer[2] = 0x80 | (u & 0x3f);
197 else if (u < 0x200000)
198 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
199 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
200 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
201 utf_buffer[3] = 0x80 | (u & 0x3f);
202 else if (u < 0x4000000)
203 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
204 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
205 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
206 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
207 utf_buffer[4] = 0x80 | (u & 0x3f);
208 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
209 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
210 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
211 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
212 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
213 utf_buffer[5] = 0x80 | (u & 0x3f);
215 return utf_buffer;
218 #ifdef CONFIG_UTF_8
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.  Bytes
 * that cannot start a multi-byte sequence (0x80..0xbf, 0xfe, 0xff) are
 * listed with length 1, like plain ASCII. */
static char utf8char_len_tab[256] = {
	/* 0x00..0xbf */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xc0..0xdf */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xe0..0xff */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
232 inline int utf8charlen(const unsigned char *p)
234 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters of the NUL-terminated string *@str.  Only
 * characters that fit completely before the terminator are counted.  On
 * return *@str points just past the last counted character. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *cursor = *str;
	unsigned char *end = strchr(cursor, '\0');
	int count = 0;

	for (;;) {
		int len = utf8charlen(cursor);

		if (cursor + len > end) break;
		cursor += len;
		count++;
	}

	*str = cursor;
	return count;
}
/* True for a single-byte (ASCII) character: top bit clear. */
#define utf8_issingle(p) (!((p) & 0x80))
/* True for any byte that starts a character, i.e. anything but a 10xxxxxx
 * continuation byte. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
/* Step back @pos characters from @current, never going left of @start, and
 * return the resulting pointer.  NULL is returned when @current or @start
 * is NULL or @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (current == NULL || start == NULL || pos < 0)
		return NULL;

	for (; pos > 0 && current != start; ) {
		--current;
		/* Only bytes that begin a character count as a step. */
		if (utf8_islead(*current))
			--pos;
	}

	return current;
}
271 /* Count number of standard terminal cells needed for displaying UTF-8
272 * character. */
274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
276 unicode_val_T u;
278 if (end == NULL)
279 end = strchr(utf8_char, '\0');
281 if(!utf8_char || !end)
282 return -1;
284 u = utf_8_to_unicode(&utf8_char, end);
286 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying a
 * string of UTF-8 characters.
 *
 * @end marks the end of the buffer; when it is NULL the string is taken to
 * be NUL-terminated.  A trailing character that does not fit completely
 * before @end is not counted.  Returns -1 when @string is NULL or a
 * character's width cannot be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Must be tested before strchr() below dereferences the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the number of UTF-8 characters in a string.
 *
 * @end marks the end of the buffer; when it is NULL the string is taken to
 * be NUL-terminated.  A trailing character that does not fit completely
 * before @end is not counted.  Returns -1 when @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Must be tested before strchr() below dereferences the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the number of bytes from the beginning of the string needed for
 * displaying the specified number of cells.
 *
 * @end marks the end of the buffer; when it is NULL the string is taken to
 * be NUL-terminated.  Returns -1 when @string is NULL or a character's
 * width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	/* Signed on purpose: `cells' is compared with the signed @max_cells,
	 * and an unsigned counter would turn a negative limit into a huge
	 * one. */
	int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Must be tested before strchr() below dereferences the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Never report more bytes than the buffer holds. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
380 * Find out number of standard terminal collumns needed for displaying symbol
381 * (glyph) which represents Unicode character c.
382 * TODO: Use wcwidth when it is available.
384 * @return 2 for double-width glyph, 1 for others.
385 * TODO: May be extended to return 0 for zero-width glyphs
386 * (like composing, maybe unprintable too).
388 inline int
389 unicode_to_cell(unicode_val_T c)
391 if (c >= 0x1100
392 && (c <= 0x115f /* Hangul Jamo */
393 || c == 0x2329
394 || c == 0x232a
395 || (c >= 0x2e80 && c <= 0xa4cf
396 && c != 0x303f) /* CJK ... Yi */
397 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
398 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
399 Ideographs */
400 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
401 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
402 || (c >= 0xffe0 && c <= 0xffe6)
403 || (c >= 0x20000 && c <= 0x2fffd)
404 || (c >= 0x30000 && c <= 0x3fffd)))
405 return 2;
407 return 1;
410 /* Fold the case of a Unicode character, so that hotkeys in labels can
411 * be compared case-insensitively. This should be called only if
412 * check_kbd_label_key(c) is true. It is unspecified whether the
413 * result will be in upper or lower case. */
414 unicode_val_T
415 unicode_fold_label_case(unicode_val_T c)
417 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
418 return towlower(c);
419 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
420 /* For now, this supports only ASCII. It would be possible to
421 * use code generated from CaseFolding.txt of Unicode if the
422 * acknowledgements required by http://www.unicode.org/copyright.html
423 * were added to associated documentation of ELinks. */
424 if (c >= 0x41 && c <= 0x5A)
425 return c + 0x20;
426 else
427 return c;
428 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
431 inline unicode_val_T
432 utf_8_to_unicode(unsigned char **string, unsigned char *end)
434 unsigned char *str = *string;
435 unicode_val_T u;
436 int length;
438 length = utf8char_len_tab[str[0]];
440 if (str + length > end) {
441 return UCS_NO_CHAR;
444 switch (length) {
445 case 1:
446 u = str[0];
447 break;
448 case 2:
449 u = (str[0] & 0x1f) << 6;
450 u += (str[1] & 0x3f);
451 break;
452 case 3:
453 u = (str[0] & 0x0f) << 12;
454 u += ((str[1] & 0x3f) << 6);
455 u += (str[2] & 0x3f);
456 break;
457 case 4:
458 u = (str[0] & 0x0f) << 18;
459 u += ((str[1] & 0x3f) << 12);
460 u += ((str[2] & 0x3f) << 6);
461 u += (str[3] & 0x3f);
462 break;
463 case 5:
464 u = (str[0] & 0x0f) << 24;
465 u += ((str[1] & 0x3f) << 18);
466 u += ((str[2] & 0x3f) << 12);
467 u += ((str[3] & 0x3f) << 6);
468 u += (str[4] & 0x3f);
469 break;
470 case 6:
471 default:
472 u = (str[0] & 0x01) << 30;
473 u += ((str[1] & 0x3f) << 24);
474 u += ((str[2] & 0x3f) << 18);
475 u += ((str[3] & 0x3f) << 12);
476 u += ((str[4] & 0x3f) << 6);
477 u += (str[5] & 0x3f);
478 break;
480 *string = str + length;
481 return u;
483 #endif /* CONFIG_UTF_8 */
485 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
486 static unicode_val_T
487 cp2u_shared(const struct codepage_desc *from, unsigned char c)
489 int j;
491 for (j = 0; from->table[j].c; j++)
492 if (from->table[j].c == c)
493 return from->table[j].u;
495 return UCS_NO_CHAR;
498 /* Slow algorithm, used for converting input from the terminal. */
499 unicode_val_T
500 cp2u(int from, unsigned char c)
502 from &= ~SYSTEM_CHARSET_FLAG;
504 /* UTF-8 is a multibyte codepage and cannot be handled with
505 * this function. */
506 assert(codepages[from].table != table_utf_8);
507 if_assert_failed return UCS_NO_CHAR;
509 if (c < 0x80) return c;
510 else return cp2u_shared(&codepages[from], c);
513 /* This slow and ugly code is used by the terminal utf_8_io */
514 unsigned char *
515 cp2utf_8(int from, int c)
517 from &= ~SYSTEM_CHARSET_FLAG;
519 if (codepages[from].table == table_utf_8 || c < 128)
520 return strings[c];
522 return encode_utf_8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF_8
/* Read one character from *@string, which is encoded in @codepage, and
 * advance *@string past it.  Returns the character's Unicode value, or
 * UCS_NO_CHAR when nothing (complete) is left before @end. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unicode_val_T value;

	if (is_cp_utf8(codepage))
		return utf_8_to_unicode(string, end);

	if (*string >= end)
		return UCS_NO_CHAR;

	/* Single-byte codepage: translate one byte and step over it. */
	value = cp2u(codepage, **string);
	++*string;
	return value;
}
#endif	/* CONFIG_UTF_8 */
544 static void
545 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
547 unsigned char *p = encode_utf_8(u);
549 while (p[1]) {
550 if (ct[*p].t) ct = ct[*p].u.tbl;
551 else {
552 struct conv_table *nct;
554 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
555 if_assert_failed return;
557 nct = mem_calloc(256, sizeof(*nct));
558 if (!nct) return;
559 new_translation_table(nct);
560 ct[*p].t = 1;
561 ct[*p].u.tbl = nct;
562 ct = nct;
564 p++;
567 assertm(!ct[*p].t, "bad utf encoding #2");
568 if_assert_failed return;
570 if (ct[*p].u.str == no_str)
571 ct[*p].u.str = str;
574 struct conv_table utf_table[256];
575 int utf_table_init = 1;
577 static void
578 free_utf_table(void)
580 int i;
582 for (i = 128; i < 256; i++)
583 mem_free(utf_table[i].u.str);
586 static struct conv_table *
587 get_translation_table_to_utf_8(int from)
589 int i;
590 static int lfr = -1;
592 if (from == -1) return NULL;
593 from &= ~SYSTEM_CHARSET_FLAG;
594 if (from == lfr) return utf_table;
595 lfr = from;
596 if (utf_table_init)
597 memset(utf_table, 0, sizeof(utf_table)),
598 utf_table_init = 0;
599 else
600 free_utf_table();
602 for (i = 0; i < 128; i++)
603 utf_table[i].u.str = strings[i];
605 if (codepages[from].table == table_utf_8) {
606 for (i = 128; i < 256; i++)
607 utf_table[i].u.str = stracpy(strings[i]);
608 return utf_table;
611 for (i = 128; i < 256; i++)
612 utf_table[i].u.str = NULL;
614 for (i = 0; codepages[from].table[i].c; i++) {
615 unicode_val_T u = codepages[from].table[i].u;
617 if (!utf_table[codepages[from].table[i].c].u.str)
618 utf_table[codepages[from].table[i].c].u.str =
619 stracpy(encode_utf_8(u));
622 for (i = 128; i < 256; i++)
623 if (!utf_table[i].u.str)
624 utf_table[i].u.str = stracpy(no_str);
626 return utf_table;
629 struct conv_table table[256];
630 static int first = 1;
632 void
633 free_conv_table(void)
635 if (!utf_table_init) free_utf_table();
636 if (first) {
637 memset(table, 0, sizeof(table));
638 first = 0;
640 new_translation_table(table);
644 struct conv_table *
645 get_translation_table(int from, int to)
647 static int lfr = -1;
648 static int lto = -1;
650 from &= ~SYSTEM_CHARSET_FLAG;
651 to &= ~SYSTEM_CHARSET_FLAG;
652 if (first) {
653 memset(table, 0, sizeof(table));
654 first = 0;
656 if (/*from == to ||*/ from == -1 || to == -1)
657 return NULL;
658 if (codepages[to].table == table_utf_8)
659 return get_translation_table_to_utf_8(from);
660 if (from == lfr && to == lto)
661 return table;
662 lfr = from;
663 lto = to;
664 new_translation_table(table);
666 if (codepages[from].table == table_utf_8) {
667 int i;
669 for (i = 0; codepages[to].table[i].c; i++)
670 add_utf_8(table, codepages[to].table[i].u,
671 strings[codepages[to].table[i].c]);
673 for (i = 0; unicode_7b[i].x != -1; i++)
674 if (unicode_7b[i].x >= 0x80)
675 add_utf_8(table, unicode_7b[i].x,
676 unicode_7b[i].s);
678 } else {
679 int i;
681 for (i = 128; i < 256; i++) {
682 int j;
684 for (j = 0; codepages[from].table[j].c; j++) {
685 if (codepages[from].table[j].c == i) {
686 unsigned char *u;
688 u = u2cp(codepages[from].table[j].u, to);
689 if (u) table[i].u.str = u;
690 break;
696 return table;
/* Compare the @l2 bytes at @s1 (not NUL-terminated) with the NUL-terminated
 * string @s2.  Returns 1 when @s1 sorts after @s2, -1 when it sorts before
 * it or is merely a proper prefix of it, and 0 when they are equal. */
static inline int
xxstrcmp(const unsigned char *s1, const unsigned char *s2, int l2)
{
	for (; l2 > 0; s1++, s2++, l2--) {
		if (*s1 > *s2) return 1;
		if (*s1 < *s2) return -1;
		/* The bytes are equal here.  If that byte is the terminator
		 * of @s2, @s1 is the longer one; stop instead of reading
		 * past the end of @s2. */
		if (!*s2) return 1;
	}

	return *s2 ? -1 : 0;
}
/* Switch for the entity cache debugging output: change the 0 to 1 to get
 * the fprintf() traces in get_entity_string(). */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif

/* One cached entity lookup of get_entity_string(). */
struct entity_cache {
	/* How often this entry has been returned. */
	unsigned int hits;
	/* Length of the entity name held in `str'. */
	int strlen;
	/* Codepage index the result was computed for. */
	int encoding;
	/* The cached conversion result; may be NULL. */
	unsigned char *result;
	/* The entity name, NUL-terminated. */
	unsigned char str[20]; /* Suffice in any case. */
};
728 static int
729 hits_cmp(struct entity_cache *a, struct entity_cache *b)
731 if (a->hits == b->hits) return 0;
732 if (a->hits > b->hits) return -1;
733 else return 1;
736 static int
737 compare_entities(const void *key_, const void *element_)
739 struct string *key = (struct string *) key_;
740 struct entity *element = (struct entity *) element_;
741 int length = key->length;
742 unsigned char *first = key->source;
743 unsigned char *second = element->s;
745 return xxstrcmp(first, second, length);
748 unsigned char *
749 get_entity_string(const unsigned char *str, const int strlen, int encoding)
751 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
752 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
753 will go in [0] table */
754 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
755 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
756 static int first_time = 1;
757 unsigned int slen = 0;
758 unsigned char *result = NULL;
760 if (strlen <= 0) return NULL;
762 #ifdef CONFIG_UTF_8
763 /* TODO: caching UTF-8 */
764 encoding &= ~SYSTEM_CHARSET_FLAG;
765 if (codepages[encoding].table == table_utf_8)
766 goto skip;
767 #endif /* CONFIG_UTF_8 */
769 if (first_time) {
770 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
771 first_time = 0;
774 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
775 * + google + slashdot + websites that result from a search for test on google,
776 * + various ones) show a quite impressive improvment:
777 * Top ten is:
778 * 0: hits=2459 l=4 st='nbsp'
779 * 1: hits=2152 l=6 st='eacute'
780 * 2: hits=235 l=6 st='egrave'
781 * 3: hits=136 l=6 st='agrave'
782 * 4: hits=100 l=3 st='amp'
783 * 5: hits=40 l=5 st='laquo'
784 * 6: hits=8 l=4 st='copy'
785 * 7: hits=5 l=2 st='gt'
786 * 8: hits=2 l=2 st='lt'
787 * 9: hits=1 l=6 st='middot'
789 * Most of the time cache hit ratio is near 95%.
791 * A long test shows: 15186 hits vs. 24 misses and mean iteration
792 * count is kept < 2 (worst case 1.58). Not so bad ;)
794 * --Zas */
796 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
797 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
799 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
800 int i;
802 for (i = 0; i < nb_entity_cache[slen]; i++) {
803 if (entity_cache[slen][i].encoding == encoding
804 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
805 #ifdef DEBUG_ENTITY_CACHE
806 static double total_iter = 0;
807 static unsigned long hit_count = 0;
809 total_iter += i + 1;
810 hit_count++;
811 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
812 #endif
813 if (entity_cache[slen][i].hits < (unsigned int) ~0)
814 entity_cache[slen][i].hits++;
815 return entity_cache[slen][i].result;
818 #ifdef DEBUG_ENTITY_CACHE
819 fprintf(stderr, "miss\n");
820 #endif
822 #ifdef CONFIG_UTF_8
823 skip:
824 #endif /* CONFIG_UTF_8 */
825 if (*str == '#') { /* Numeric entity. */
826 int l = (int) strlen;
827 unsigned char *st = (unsigned char *) str;
828 unicode_val_T n = 0;
830 if (l == 1) goto end; /* &#; ? */
831 st++, l--;
832 if ((*st | 32) == 'x') { /* Hexadecimal */
834 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
835 st++, l--;
836 do {
837 unsigned char c = (*(st++) | 32);
839 if (isdigit(c))
840 n = (n << 4) | (c - '0');
841 else if (isxdigit(c))
842 n = (n << 4) | (c - 'a' + 10);
843 else
844 goto end; /* Bad char. */
845 } while (--l);
846 } else { /* Decimal */
847 if (l > 10) goto end; /* 4294967295 max. */
848 do {
849 unsigned char c = *(st++);
851 if (isdigit(c))
852 n = n * 10 + c - '0';
853 else
854 goto end; /* Bad char. */
855 /* Limit to 0xFFFFFFFF. */
856 if (n >= (unicode_val_T) 0xFFFFFFFFu)
857 goto end;
858 } while (--l);
861 result = u2cp(n, encoding);
863 #ifdef DEBUG_ENTITY_CACHE
864 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
865 #endif
866 } else { /* Text entity. */
867 struct string key = INIT_STRING((unsigned char *) str, strlen);
868 struct entity *element = bsearch((void *) &key, entities,
869 N_ENTITIES,
870 sizeof(*element),
871 compare_entities);
873 if (element) result = u2cp(element->c, encoding);
876 #ifdef CONFIG_UTF_8
877 if (codepages[encoding].table == table_utf_8) {
878 return result;
880 #endif /* CONFIG_UTF_8 */
881 end:
882 /* Take care of potential buffer overflow. */
883 if (strlen < sizeof(entity_cache[slen][0].str)) {
884 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
886 /* Copy new entry to cache. */
887 ece->hits = 1;
888 ece->strlen = strlen;
889 ece->encoding = encoding;
890 ece->result = result;
891 memcpy(ece->str, str, strlen);
892 ece->str[strlen] = '\0';
894 /* Increment number of cache entries if possible. */
895 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
897 #ifdef DEBUG_ENTITY_CACHE
898 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
899 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
901 #endif
903 /* Sort entries by hit order. */
904 if (nb_entity_cache[slen] > 1)
905 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
906 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
908 #ifdef DEBUG_ENTITY_CACHE
910 unsigned int i;
912 fprintf(stderr, "- Cache entries [%u] -\n", slen);
913 for (i = 0; i < nb_entity_cache[slen] ; i++)
914 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
915 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
916 entity_cache[slen][i].str);
917 fprintf(stderr, "-----------------\n");
919 #endif
921 return result;
924 unsigned char *
925 convert_string(struct conv_table *convert_table,
926 unsigned char *chars, int charslen, int cp,
927 enum convert_string_mode mode, int *length,
928 void (*callback)(void *data, unsigned char *buf, int buflen),
929 void *callback_data)
931 unsigned char *buffer;
932 int bufferpos = 0;
933 int charspos = 0;
935 if (!convert_table && !memchr(chars, '&', charslen)) {
936 if (callback) {
937 if (charslen) callback(callback_data, chars, charslen);
938 return NULL;
939 } else {
940 return memacpy(chars, charslen);
944 /* Buffer allocation */
946 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
947 if (!buffer) return NULL;
949 /* Iterate ;-) */
951 while (charspos < charslen) {
952 unsigned char *translit;
954 #define PUTC do { \
955 buffer[bufferpos++] = chars[charspos++]; \
956 translit = ""; \
957 goto flush; \
958 } while (0)
960 if (chars[charspos] != '&') {
961 struct conv_table *t;
962 int i;
964 if (chars[charspos] < 128 || !convert_table) PUTC;
966 t = convert_table;
967 i = charspos;
969 while (t[chars[i]].t) {
970 t = t[chars[i++]].u.tbl;
971 if (i >= charslen) PUTC;
974 translit = t[chars[i]].u.str;
975 charspos = i + 1;
977 } else if (mode == CSM_FORM || mode == CSM_NONE) {
978 PUTC;
980 } else {
981 int start = charspos + 1;
982 int i = start;
984 while (i < charslen
985 && (isasciialpha(chars[i])
986 || isdigit(chars[i])
987 || (chars[i] == '#')))
988 i++;
990 /* This prevents bug 213: we were expanding "entities"
991 * in URL query strings. */
992 /* XXX: But this disables &nbsp&nbsp usage, which
993 * appears to be relatively common! --pasky */
994 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
995 && i > start
996 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
997 translit = get_entity_string(&chars[start], i - start,
998 cp);
999 if (chars[i] != ';') {
1000 /* Eat &nbsp &nbsp<foo> happily, but
1001 * pull back from the character after
1002 * entity string if it is not the valid
1003 * terminator. */
1004 i--;
1007 if (!translit) PUTC;
1008 charspos = i + (i < charslen);
1009 } else PUTC;
1012 if (!translit[0]) continue;
1014 if (!translit[1]) {
1015 buffer[bufferpos++] = translit[0];
1016 translit = "";
1017 goto flush;
1020 while (*translit) {
1021 unsigned char *new;
1023 buffer[bufferpos++] = *(translit++);
1024 flush:
1025 if (bufferpos & (ALLOC_GR - 1)) continue;
1027 if (callback) {
1028 buffer[bufferpos] = 0;
1029 callback(callback_data, buffer, bufferpos);
1030 bufferpos = 0;
1031 } else {
1032 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1033 if (!new) {
1034 mem_free(buffer);
1035 return NULL;
1037 buffer = new;
1040 #undef PUTC
1043 /* Say bye */
1045 buffer[bufferpos] = 0;
1046 if (length) *length = bufferpos;
1048 if (callback) {
1049 if (bufferpos) callback(callback_data, buffer, bufferpos);
1050 mem_free(buffer);
1051 return NULL;
1052 } else {
1053 return buffer;
1058 #ifndef USE_FASTFIND
1060 get_cp_index(unsigned char *name)
1062 int i, a;
1063 int syscp = 0;
1065 if (!strcasecmp(name, "System")) {
1066 #if HAVE_LANGINFO_CODESET
1067 name = nl_langinfo(CODESET);
1068 syscp = SYSTEM_CHARSET_FLAG;
1069 #else
1070 name = "us-ascii";
1071 #endif
1074 for (i = 0; codepages[i].name; i++) {
1075 for (a = 0; codepages[i].aliases[a]; a++) {
1076 /* In the past, we looked for the longest substring
1077 * in all the names; it is way too expensive, though:
1079 * % cumulative self self total
1080 * time seconds seconds calls us/call us/call name
1081 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1083 * Anything called from redraw_screen() is in fact
1084 * relatively expensive, even if it's called just
1085 * once. So we will do a simple strcasecmp() here.
1088 if (!strcasecmp(name, codepages[i].aliases[a]))
1089 return i | syscp;
1093 if (syscp) {
1094 return get_cp_index("us-ascii") | syscp;
1095 } else {
1096 return -1;
1100 #else
/* Cursor of charsets_list_next(): the current codepage and the current
 * alias within it. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the cursor of charsets_list_next() to the first alias of the first
 * codepage. */
void
charsets_list_reset(void)
{
	i_name = i_alias = 0;
}
1113 /* Returns a pointer to a struct that contains current key and data pointers
1114 * and increment internal pointer. It returns NULL when key is NULL. */
1115 struct fastfind_key_value *
1116 charsets_list_next(void)
1118 static struct fastfind_key_value kv;
1120 if (!codepages[i_name].name) return NULL;
1122 kv.key = codepages[i_name].aliases[i_alias];
1123 kv.data = &codepages[i_name];
1125 if (codepages[i_name].aliases[i_alias + 1])
1126 i_alias++;
1127 else {
1128 i_name++;
1129 i_alias = 0;
1132 return &kv;
1135 static struct fastfind_index ff_charsets_index
1136 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1138 /* It searchs for a charset named @name or one of its aliases and
1139 * returns index for it or -1 if not found. */
1141 get_cp_index(unsigned char *name)
1143 struct codepage_desc *codepage;
1144 int syscp = 0;
1146 if (!strcasecmp(name, "System")) {
1147 #if HAVE_LANGINFO_CODESET
1148 name = nl_langinfo(CODESET);
1149 syscp = SYSTEM_CHARSET_FLAG;
1150 #else
1151 name = "us-ascii";
1152 #endif
1155 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1156 if (codepage) {
1157 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1158 return (codepage - codepages) | syscp;
1160 } else if (syscp) {
1161 return get_cp_index("us-ascii") | syscp;
1163 } else {
1164 return -1;
1168 #endif /* USE_FASTFIND */
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1186 unsigned char *
1187 get_cp_name(int cp_index)
1189 if (cp_index < 0) return "none";
1190 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1192 return codepages[cp_index].name;
1195 unsigned char *
1196 get_cp_mime_name(int cp_index)
1198 if (cp_index < 0) return "none";
1199 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1200 if (!codepages[cp_index].aliases) return NULL;
1202 return codepages[cp_index].aliases[0];
1206 is_cp_utf8(int cp_index)
1208 cp_index &= ~SYSTEM_CHARSET_FLAG;
1209 return codepages[cp_index].table == table_utf_8;