UTF-8: Stepping functions set *count even if an assertion fails.
[elinks.git] / src / intl / charsets.c
blobd8ecaa2dabd1eddcc61b64fe3e6bb45bc2ee5e92
1 /* Charsets convertor */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #if HAVE_LANGINFO_CODESET
8 #include <langinfo.h>
9 #endif
11 #include <ctype.h>
12 #include <stdlib.h>
13 #if HAVE_WCTYPE_H
14 #include <wctype.h>
15 #endif
17 #include "elinks.h"
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
31 struct table_entry {
32 unsigned char c;
33 unicode_val_T u;
/* Description of one supported codepage.
 *
 * @name	name returned by get_cp_name().
 * @aliases	NULL-terminated list of names accepted by get_cp_index();
 * 		the first one is what get_cp_mime_name() returns.
 * @table	byte <-> Unicode mapping, terminated by a row with c == 0. */
struct codepage_desc {
	unsigned char *name;
	unsigned char **aliases;
	struct table_entry *table;
};
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
/* One-character, NUL-terminated strings indexed by byte value, so that a
 * single byte can be returned as a string without allocating.
 *
 * NOTE(review): entries 027 and 037 hold "\033" (ESC) instead of their own
 * value.  This is kept exactly as found; confirm whether it is deliberate. */
static char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
82 static void
83 free_translation_table(struct conv_table *p)
85 int i;
87 for (i = 0; i < 256; i++)
88 if (p[i].t)
89 free_translation_table(p[i].u.tbl);
91 mem_free(p);
94 static unsigned char *no_str = "*";
96 static void
97 new_translation_table(struct conv_table *p)
99 int i;
101 for (i = 0; i < 256; i++)
102 if (p[i].t)
103 free_translation_table(p[i].u.tbl);
104 for (i = 0; i < 128; i++) {
105 p[i].t = 0;
106 p[i].u.str = strings[i];
108 for (; i < 256; i++) {
109 p[i].t = 0;
110 p[i].u.str = no_str;
/* Binary search in @table, an array of @entries structures sorted in
 * ascending order of their member @entry.  Stores the index of the element
 * whose @entry equals @key into @result, or -1 when there is none.
 *
 * The odd-looking loop condition does the "not found" assignment: once the
 * range is empty, `(result) = -1` is evaluated and its negation ends the
 * loop.  The macro expands to a brace block, so it is used as a statement. */
#define BIN_SEARCH(table, entry, entries, key, result)				\
{										\
	long _s = 0, _e = (entries) - 1;					\
										\
	while (_s <= _e || !((result) = -1)) {					\
		long _m = (_s + _e) / 2;					\
										\
		if ((table)[_m].entry == (key)) {				\
			(result) = _m;						\
			break;							\
		}								\
		if ((table)[_m].entry > (key)) _e = _m - 1;			\
		if ((table)[_m].entry < (key)) _s = _m + 1;			\
	}									\
}
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
137 #define SYSTEM_CHARSET_FLAG 128
139 unsigned char *
140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
142 int j;
143 int s;
145 if (u < 128) return strings[u];
147 to &= ~SYSTEM_CHARSET_FLAG;
149 #ifdef CONFIG_UTF_8
150 if (codepages[to].table == table_utf_8)
151 return encode_utf_8(u);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
156 if (u == 0xad) return "";
158 if (u < 0xa0) {
159 unicode_val_T strange = strange_chars[u - 0x80];
161 if (!strange) return NULL;
162 return u2cp_(strange, to, no_nbsp_hack);
166 for (j = 0; codepages[to].table[j].c; j++)
167 if (codepages[to].table[j].u == u)
168 return strings[codepages[to].table[j].c];
170 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
171 if (s != -1) return unicode_7b[s].s;
173 return no_str;
176 static unsigned char utf_buffer[7];
178 #ifdef CONFIG_UTF_8
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u)
181 #else
182 static unsigned char *
183 encode_utf_8(unicode_val_T u)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer, 0, 7);
188 if (u < 0x80)
189 utf_buffer[0] = u;
190 else if (u < 0x800)
191 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
192 utf_buffer[1] = 0x80 | (u & 0x3f);
193 else if (u < 0x10000)
194 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
195 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
196 utf_buffer[2] = 0x80 | (u & 0x3f);
197 else if (u < 0x200000)
198 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
199 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
200 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
201 utf_buffer[3] = 0x80 | (u & 0x3f);
202 else if (u < 0x4000000)
203 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
204 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
205 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
206 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
207 utf_buffer[4] = 0x80 | (u & 0x3f);
208 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
209 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
210 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
211 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
212 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
213 utf_buffer[5] = 0x80 | (u & 0x3f);
215 return utf_buffer;
218 #ifdef CONFIG_UTF_8
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.  Bytes
 * that cannot start a sequence (continuation bytes 0x80..0xbf and 0xfe,
 * 0xff) are given length 1 so that scanning always makes progress. */
static char utf8char_len_tab[256] = {
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};

/* Return the number of bytes of the UTF-8 character starting at @p,
 * judged from its first byte only, or 0 when @p is NULL.
 *
 * Not declared `inline': a plain `inline' definition of a function with
 * external linkage provides no external definition under C99/C11. */
int
utf8charlen(const unsigned char *p)
{
	return p ? utf8char_len_tab[*p] : 0;
}
/* Count the UTF-8 characters of the NUL-terminated string *@str.
 *
 * Only characters whose whole byte sequence fits before the terminating
 * NUL are counted.  On return *@str points just past the last counted
 * character: at the NUL, or at the start of a truncated trailing sequence.
 *
 * Not declared `inline' so that an external definition is emitted under
 * C99/C11 inline semantics. */
int
strlen_utf8(unsigned char **str)
{
	unsigned char *s = *str;
	unsigned char *end = strchr(s, '\0');
	int x;
	int len;

	for (x = 0;; x++, s += len) {
		len = utf8charlen(s);
		if (s + len > end) break;
	}

	*str = s;
	return x;
}
/* True for a byte that is a complete one-byte (ASCII) character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True for a byte that starts a character, i.e. anything but 10xxxxxx. */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)

/* Start from @current and move back by @pos characters, never going left
 * of @start.  Returns the resulting pointer, or NULL when @current or
 * @start is NULL or @pos is negative.
 *
 * Not declared `inline' so that an external definition is emitted under
 * C99/C11 inline semantics. */
unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (current == NULL || start == NULL || pos < 0)
		return NULL;

	while (pos > 0 && current != start) {
		current--;
		/* Only reaching a lead byte completes one character. */
		if (utf8_islead(*current))
			pos--;
	}

	return current;
}
271 /* Count number of standard terminal cells needed for displaying UTF-8
272 * character. */
274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
276 unicode_val_T u;
278 if (end == NULL)
279 end = strchr(utf8_char, '\0');
281 if(!utf8_char || !end)
282 return -1;
284 u = utf_8_to_unicode(&utf8_char, end);
286 return unicode_to_cell(u);
/* Count number of standard terminal cells needed for displaying string
 * with UTF-8 characters.  @end limits the string; when it is NULL the
 * string is taken to be NUL-terminated.  A trailing sequence that does not
 * fit before @end is not counted.
 *
 * Returns -1 when @string is NULL or a character width cannot be
 * determined.  The NULL test now precedes strchr(), and the loop no longer
 * looks at the byte at @end. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	while (string < end) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count number of characters in string.  @end limits the string; when it
 * is NULL the string is taken to be NUL-terminated.  A trailing sequence
 * that does not fit before @end is not counted.
 *
 * Returns -1 when @string is NULL.  The NULL test now precedes strchr(),
 * and the loop no longer looks at the byte at @end. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	while (string < end) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count number of bytes from beginning of the string needed for displaying
 * specified number of cells.
 *
 * @end limits the string; when it is NULL the string is taken to be
 * NUL-terminated.  The result never exceeds (@end - @string).  Returns -1
 * when @string is NULL or a character width cannot be determined.
 *
 * The NULL test now precedes strchr(), and the loop stops on reaching @end
 * instead of examining the byte stored there; the returned value is the
 * same as before for every well-defined input.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	while (string + bytes < end) {
		int cell = utf8_char2cells(&string[bytes], end);
		if (cell < 0)
			return -1;

		cells += cell;
		/* Same unsigned comparison as before, now spelled out. */
		if (cells > (unsigned int) max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		if (string + bytes > end) {
			/* Truncated trailing sequence: clamp to @end. */
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
379 /* Take @max steps forward from @string in the specified @way, but
380 * not going past @end. Return the resulting address. Store the
381 * number of steps taken to *@count, unless @count is NULL.
383 * This assumes the text is valid UTF-8, and @string and @end point to
384 * character boundaries. If not, it doesn't crash but the results may
385 * be inconsistent.
387 * This function can do some of the same jobs as utf8charlen(),
388 * utf8_cells2bytes(), and strlen_utf8(). */
389 unsigned char *
390 utf8_step_forward(unsigned char *string, unsigned char *end,
391 int max, enum utf8_step way, int *count)
393 int steps = 0;
394 unsigned char *current = string;
396 assert(string);
397 assert(max >= 0);
398 if_assert_failed goto invalid_arg;
399 if (end == NULL)
400 end = strchr(string, '\0');
402 switch (way) {
403 case utf8_step_characters:
404 while (steps < max && current < end) {
405 ++current;
406 if (utf8_islead(*current))
407 ++steps;
409 break;
411 case utf8_step_cells_fewer:
412 case utf8_step_cells_more:
413 while (steps < max) {
414 unicode_val_T u;
415 unsigned char *prev = current;
416 int width;
418 u = utf_8_to_unicode(&current, end);
419 if (u == UCS_NO_CHAR) {
420 /* Assume the incomplete sequence
421 * costs one cell. */
422 current = end;
423 ++steps;
424 break;
427 width = unicode_to_cell(u);
428 if (way == utf8_step_cells_fewer
429 && steps + width > max) {
430 /* Back off. */
431 current = prev;
432 break;
434 steps += width;
436 break;
438 default:
439 INTERNAL("impossible enum utf8_step");
442 invalid_arg:
443 if (count)
444 *count = steps;
445 return current;
448 /* Take @max steps backward from @string in the specified @way, but
449 * not going past @start. Return the resulting address. Store the
450 * number of steps taken to *@count, unless @count is NULL.
452 * This assumes the text is valid UTF-8, and @string and @start point
453 * to character boundaries. If not, it doesn't crash but the results
454 * may be inconsistent.
456 * This function can do some of the same jobs as utf8_prevchar(). */
457 unsigned char *
458 utf8_step_backward(unsigned char *string, unsigned char *start,
459 int max, enum utf8_step way, int *count)
461 int steps = 0;
462 unsigned char *current = string;
464 assert(string);
465 assert(start);
466 assert(max >= 0);
467 if_assert_failed goto invalid_arg;
469 switch (way) {
470 case utf8_step_characters:
471 while (steps < max && current > start) {
472 --current;
473 if (utf8_islead(*current))
474 ++steps;
476 break;
478 case utf8_step_cells_fewer:
479 case utf8_step_cells_more:
480 while (steps < max) {
481 unsigned char *prev = current;
482 unsigned char *look;
483 unicode_val_T u;
484 int width;
486 if (current <= start)
487 break;
488 do {
489 --current;
490 } while (current > start && !utf8_islead(*current));
492 look = current;
493 u = utf_8_to_unicode(&look, prev);
494 if (u == UCS_NO_CHAR) {
495 /* Assume the incomplete sequence
496 * costs one cell. */
497 width = 1;
498 } else
499 width = unicode_to_cell(u);
501 if (way == utf8_step_cells_fewer
502 && steps + width > max) {
503 /* Back off. */
504 current = prev;
505 break;
507 steps += width;
509 break;
511 default:
512 INTERNAL("impossible enum utf8_step");
515 invalid_arg:
516 if (count)
517 *count = steps;
518 return current;
522 * Find out number of standard terminal collumns needed for displaying symbol
523 * (glyph) which represents Unicode character c.
524 * TODO: Use wcwidth when it is available.
526 * @return 2 for double-width glyph, 1 for others.
527 * TODO: May be extended to return 0 for zero-width glyphs
528 * (like composing, maybe unprintable too).
530 inline int
531 unicode_to_cell(unicode_val_T c)
533 if (c >= 0x1100
534 && (c <= 0x115f /* Hangul Jamo */
535 || c == 0x2329
536 || c == 0x232a
537 || (c >= 0x2e80 && c <= 0xa4cf
538 && c != 0x303f) /* CJK ... Yi */
539 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
540 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
541 Ideographs */
542 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
543 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
544 || (c >= 0xffe0 && c <= 0xffe6)
545 || (c >= 0x20000 && c <= 0x2fffd)
546 || (c >= 0x30000 && c <= 0x3fffd)))
547 return 2;
549 return 1;
552 /* Fold the case of a Unicode character, so that hotkeys in labels can
553 * be compared case-insensitively. It is unspecified whether the
554 * result will be in upper or lower case. */
555 unicode_val_T
556 unicode_fold_label_case(unicode_val_T c)
558 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
559 return towlower(c);
560 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
561 /* For now, this supports only ASCII. It would be possible to
562 * use code generated from CaseFolding.txt of Unicode if the
563 * acknowledgements required by http://www.unicode.org/copyright.html
564 * were added to associated documentation of ELinks. */
565 if (c >= 0x41 && c <= 0x5A)
566 return c + 0x20;
567 else
568 return c;
569 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
572 inline unicode_val_T
573 utf_8_to_unicode(unsigned char **string, unsigned char *end)
575 unsigned char *str = *string;
576 unicode_val_T u;
577 int length;
579 length = utf8char_len_tab[str[0]];
581 if (str + length > end) {
582 return UCS_NO_CHAR;
585 switch (length) {
586 case 1:
587 u = str[0];
588 break;
589 case 2:
590 u = (str[0] & 0x1f) << 6;
591 u += (str[1] & 0x3f);
592 break;
593 case 3:
594 u = (str[0] & 0x0f) << 12;
595 u += ((str[1] & 0x3f) << 6);
596 u += (str[2] & 0x3f);
597 break;
598 case 4:
599 u = (str[0] & 0x0f) << 18;
600 u += ((str[1] & 0x3f) << 12);
601 u += ((str[2] & 0x3f) << 6);
602 u += (str[3] & 0x3f);
603 break;
604 case 5:
605 u = (str[0] & 0x0f) << 24;
606 u += ((str[1] & 0x3f) << 18);
607 u += ((str[2] & 0x3f) << 12);
608 u += ((str[3] & 0x3f) << 6);
609 u += (str[4] & 0x3f);
610 break;
611 case 6:
612 default:
613 u = (str[0] & 0x01) << 30;
614 u += ((str[1] & 0x3f) << 24);
615 u += ((str[2] & 0x3f) << 18);
616 u += ((str[3] & 0x3f) << 12);
617 u += ((str[4] & 0x3f) << 6);
618 u += (str[5] & 0x3f);
619 break;
621 *string = str + length;
622 return u;
624 #endif /* CONFIG_UTF_8 */
626 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
627 static unicode_val_T
628 cp2u_shared(const struct codepage_desc *from, unsigned char c)
630 int j;
632 for (j = 0; from->table[j].c; j++)
633 if (from->table[j].c == c)
634 return from->table[j].u;
636 return UCS_REPLACEMENT_CHARACTER;
639 /* Slow algorithm, used for converting input from the terminal. */
640 unicode_val_T
641 cp2u(int from, unsigned char c)
643 from &= ~SYSTEM_CHARSET_FLAG;
645 /* UTF-8 is a multibyte codepage and cannot be handled with
646 * this function. */
647 assert(codepages[from].table != table_utf_8);
648 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
650 if (c < 0x80) return c;
651 else return cp2u_shared(&codepages[from], c);
654 /* This slow and ugly code is used by the terminal utf_8_io */
655 unsigned char *
656 cp2utf_8(int from, int c)
658 from &= ~SYSTEM_CHARSET_FLAG;
660 if (codepages[from].table == table_utf_8 || c < 128)
661 return strings[c];
663 return encode_utf_8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF_8
/* Read one character in @codepage from *@string, not going past @end.
 *
 * Returns its code point and advances *@string past it.  For a UTF-8
 * codepage the work is delegated to utf_8_to_unicode(); otherwise exactly
 * one byte is consumed.  Returns UCS_NO_CHAR when no byte is available. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unicode_val_T ret;

	if (is_cp_utf8(codepage))
		return utf_8_to_unicode(string, end);

	if (*string >= end)
		return UCS_NO_CHAR;

	ret = cp2u(codepage, **string);
	++*string;
	return ret;
}
#endif /* CONFIG_UTF_8 */
685 static void
686 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
688 unsigned char *p = encode_utf_8(u);
690 while (p[1]) {
691 if (ct[*p].t) ct = ct[*p].u.tbl;
692 else {
693 struct conv_table *nct;
695 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
696 if_assert_failed return;
698 nct = mem_calloc(256, sizeof(*nct));
699 if (!nct) return;
700 new_translation_table(nct);
701 ct[*p].t = 1;
702 ct[*p].u.tbl = nct;
703 ct = nct;
705 p++;
708 assertm(!ct[*p].t, "bad utf encoding #2");
709 if_assert_failed return;
711 if (ct[*p].u.str == no_str)
712 ct[*p].u.str = str;
715 struct conv_table utf_table[256];
716 int utf_table_init = 1;
718 static void
719 free_utf_table(void)
721 int i;
723 for (i = 128; i < 256; i++)
724 mem_free(utf_table[i].u.str);
727 static struct conv_table *
728 get_translation_table_to_utf_8(int from)
730 int i;
731 static int lfr = -1;
733 if (from == -1) return NULL;
734 from &= ~SYSTEM_CHARSET_FLAG;
735 if (from == lfr) return utf_table;
736 lfr = from;
737 if (utf_table_init)
738 memset(utf_table, 0, sizeof(utf_table)),
739 utf_table_init = 0;
740 else
741 free_utf_table();
743 for (i = 0; i < 128; i++)
744 utf_table[i].u.str = strings[i];
746 if (codepages[from].table == table_utf_8) {
747 for (i = 128; i < 256; i++)
748 utf_table[i].u.str = stracpy(strings[i]);
749 return utf_table;
752 for (i = 128; i < 256; i++)
753 utf_table[i].u.str = NULL;
755 for (i = 0; codepages[from].table[i].c; i++) {
756 unicode_val_T u = codepages[from].table[i].u;
758 if (!utf_table[codepages[from].table[i].c].u.str)
759 utf_table[codepages[from].table[i].c].u.str =
760 stracpy(encode_utf_8(u));
763 for (i = 128; i < 256; i++)
764 if (!utf_table[i].u.str)
765 utf_table[i].u.str = stracpy(no_str);
767 return utf_table;
770 struct conv_table table[256];
771 static int first = 1;
773 void
774 free_conv_table(void)
776 if (!utf_table_init) free_utf_table();
777 if (first) {
778 memset(table, 0, sizeof(table));
779 first = 0;
781 new_translation_table(table);
785 struct conv_table *
786 get_translation_table(int from, int to)
788 static int lfr = -1;
789 static int lto = -1;
791 from &= ~SYSTEM_CHARSET_FLAG;
792 to &= ~SYSTEM_CHARSET_FLAG;
793 if (first) {
794 memset(table, 0, sizeof(table));
795 first = 0;
797 if (/*from == to ||*/ from == -1 || to == -1)
798 return NULL;
799 if (codepages[to].table == table_utf_8)
800 return get_translation_table_to_utf_8(from);
801 if (from == lfr && to == lto)
802 return table;
803 lfr = from;
804 lto = to;
805 new_translation_table(table);
807 if (codepages[from].table == table_utf_8) {
808 int i;
810 for (i = 0; codepages[to].table[i].c; i++)
811 add_utf_8(table, codepages[to].table[i].u,
812 strings[codepages[to].table[i].c]);
814 for (i = 0; unicode_7b[i].x != -1; i++)
815 if (unicode_7b[i].x >= 0x80)
816 add_utf_8(table, unicode_7b[i].x,
817 unicode_7b[i].s);
819 } else {
820 int i;
822 for (i = 128; i < 256; i++) {
823 int j;
825 for (j = 0; codepages[from].table[j].c; j++) {
826 if (codepages[from].table[j].c == i) {
827 unsigned char *u;
829 u = u2cp(codepages[from].table[j].u, to);
830 if (u) table[i].u.str = u;
831 break;
837 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string @s2.
 *
 * Returns 1 or -1 at the first differing byte.  If all @l2 bytes match,
 * returns 0 when @s2 ends right there and -1 when @s2 is longer.
 * Neither string is modified, hence the const qualifiers. */
static inline int
xxstrcmp(const unsigned char *s1, const unsigned char *s2, int l2)
{
	while (l2) {
		if (*s1 > *s2) return 1;
		if (*s1 < *s2) return -1;
		s1++;
		s2++;
		l2--;
	}

	return *s2 ? -1 : 0;
}
854 /* Entity cache debugging purpose. */
855 #if 0
856 #define DEBUG_ENTITY_CACHE
857 #else
858 #undef DEBUG_ENTITY_CACHE
859 #endif
/* One cached result of get_entity_string().
 *
 * @hits	how often the entry has been used.
 * @strlen	length of the entity name in @str.
 * @encoding	codepage the result was computed for.
 * @result	what get_entity_string() returned for it (may be NULL).
 * @str		the entity name, NUL-terminated. */
struct entity_cache {
	unsigned int hits;
	int strlen;
	int encoding;
	unsigned char *result;
	unsigned char str[20]; /* Suffice in any case. */
};

/* qsort() comparison function ordering cache entries by descending hit
 * count.  It has the exact prototype qsort() expects, so it is called
 * through a compatible function type. */
static int
hits_cmp(const void *a_, const void *b_)
{
	const struct entity_cache *a = a_;
	const struct entity_cache *b = b_;

	if (a->hits == b->hits) return 0;
	if (a->hits > b->hits) return -1;
	else return 1;
}
877 static int
878 compare_entities(const void *key_, const void *element_)
880 struct string *key = (struct string *) key_;
881 struct entity *element = (struct entity *) element_;
882 int length = key->length;
883 unsigned char *first = key->source;
884 unsigned char *second = element->s;
886 return xxstrcmp(first, second, length);
889 unsigned char *
890 get_entity_string(const unsigned char *str, const int strlen, int encoding)
892 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
893 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
894 will go in [0] table */
895 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
896 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
897 static int first_time = 1;
898 unsigned int slen = 0;
899 unsigned char *result = NULL;
901 if (strlen <= 0) return NULL;
903 #ifdef CONFIG_UTF_8
904 /* TODO: caching UTF-8 */
905 encoding &= ~SYSTEM_CHARSET_FLAG;
906 if (codepages[encoding].table == table_utf_8)
907 goto skip;
908 #endif /* CONFIG_UTF_8 */
910 if (first_time) {
911 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
912 first_time = 0;
915 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
916 * + google + slashdot + websites that result from a search for test on google,
917 * + various ones) show a quite impressive improvment:
918 * Top ten is:
919 * 0: hits=2459 l=4 st='nbsp'
920 * 1: hits=2152 l=6 st='eacute'
921 * 2: hits=235 l=6 st='egrave'
922 * 3: hits=136 l=6 st='agrave'
923 * 4: hits=100 l=3 st='amp'
924 * 5: hits=40 l=5 st='laquo'
925 * 6: hits=8 l=4 st='copy'
926 * 7: hits=5 l=2 st='gt'
927 * 8: hits=2 l=2 st='lt'
928 * 9: hits=1 l=6 st='middot'
930 * Most of the time cache hit ratio is near 95%.
932 * A long test shows: 15186 hits vs. 24 misses and mean iteration
933 * count is kept < 2 (worst case 1.58). Not so bad ;)
935 * --Zas */
937 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
938 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
940 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
941 int i;
943 for (i = 0; i < nb_entity_cache[slen]; i++) {
944 if (entity_cache[slen][i].encoding == encoding
945 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
946 #ifdef DEBUG_ENTITY_CACHE
947 static double total_iter = 0;
948 static unsigned long hit_count = 0;
950 total_iter += i + 1;
951 hit_count++;
952 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
953 #endif
954 if (entity_cache[slen][i].hits < (unsigned int) ~0)
955 entity_cache[slen][i].hits++;
956 return entity_cache[slen][i].result;
959 #ifdef DEBUG_ENTITY_CACHE
960 fprintf(stderr, "miss\n");
961 #endif
963 #ifdef CONFIG_UTF_8
964 skip:
965 #endif /* CONFIG_UTF_8 */
966 if (*str == '#') { /* Numeric entity. */
967 int l = (int) strlen;
968 unsigned char *st = (unsigned char *) str;
969 unicode_val_T n = 0;
971 if (l == 1) goto end; /* &#; ? */
972 st++, l--;
973 if ((*st | 32) == 'x') { /* Hexadecimal */
975 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
976 st++, l--;
977 do {
978 unsigned char c = (*(st++) | 32);
980 if (isdigit(c))
981 n = (n << 4) | (c - '0');
982 else if (isxdigit(c))
983 n = (n << 4) | (c - 'a' + 10);
984 else
985 goto end; /* Bad char. */
986 } while (--l);
987 } else { /* Decimal */
988 if (l > 10) goto end; /* 4294967295 max. */
989 do {
990 unsigned char c = *(st++);
992 if (isdigit(c))
993 n = n * 10 + c - '0';
994 else
995 goto end; /* Bad char. */
996 /* Limit to 0xFFFFFFFF. */
997 if (n >= (unicode_val_T) 0xFFFFFFFFu)
998 goto end;
999 } while (--l);
1002 result = u2cp(n, encoding);
1004 #ifdef DEBUG_ENTITY_CACHE
1005 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1006 #endif
1007 } else { /* Text entity. */
1008 struct string key = INIT_STRING((unsigned char *) str, strlen);
1009 struct entity *element = bsearch((void *) &key, entities,
1010 N_ENTITIES,
1011 sizeof(*element),
1012 compare_entities);
1014 if (element) result = u2cp(element->c, encoding);
1017 #ifdef CONFIG_UTF_8
1018 if (codepages[encoding].table == table_utf_8) {
1019 return result;
1021 #endif /* CONFIG_UTF_8 */
1022 end:
1023 /* Take care of potential buffer overflow. */
1024 if (strlen < sizeof(entity_cache[slen][0].str)) {
1025 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1027 /* Copy new entry to cache. */
1028 ece->hits = 1;
1029 ece->strlen = strlen;
1030 ece->encoding = encoding;
1031 ece->result = result;
1032 memcpy(ece->str, str, strlen);
1033 ece->str[strlen] = '\0';
1035 /* Increment number of cache entries if possible. */
1036 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1038 #ifdef DEBUG_ENTITY_CACHE
1039 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1040 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1042 #endif
1044 /* Sort entries by hit order. */
1045 if (nb_entity_cache[slen] > 1)
1046 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1047 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1049 #ifdef DEBUG_ENTITY_CACHE
1051 unsigned int i;
1053 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1054 for (i = 0; i < nb_entity_cache[slen] ; i++)
1055 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1056 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1057 entity_cache[slen][i].str);
1058 fprintf(stderr, "-----------------\n");
1060 #endif
1062 return result;
1065 unsigned char *
1066 convert_string(struct conv_table *convert_table,
1067 unsigned char *chars, int charslen, int cp,
1068 enum convert_string_mode mode, int *length,
1069 void (*callback)(void *data, unsigned char *buf, int buflen),
1070 void *callback_data)
1072 unsigned char *buffer;
1073 int bufferpos = 0;
1074 int charspos = 0;
1076 if (!convert_table && !memchr(chars, '&', charslen)) {
1077 if (callback) {
1078 if (charslen) callback(callback_data, chars, charslen);
1079 return NULL;
1080 } else {
1081 return memacpy(chars, charslen);
1085 /* Buffer allocation */
1087 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1088 if (!buffer) return NULL;
1090 /* Iterate ;-) */
1092 while (charspos < charslen) {
1093 unsigned char *translit;
1095 #define PUTC do { \
1096 buffer[bufferpos++] = chars[charspos++]; \
1097 translit = ""; \
1098 goto flush; \
1099 } while (0)
1101 if (chars[charspos] != '&') {
1102 struct conv_table *t;
1103 int i;
1105 if (chars[charspos] < 128 || !convert_table) PUTC;
1107 t = convert_table;
1108 i = charspos;
1110 while (t[chars[i]].t) {
1111 t = t[chars[i++]].u.tbl;
1112 if (i >= charslen) PUTC;
1115 translit = t[chars[i]].u.str;
1116 charspos = i + 1;
1118 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1119 PUTC;
1121 } else {
1122 int start = charspos + 1;
1123 int i = start;
1125 while (i < charslen
1126 && (isasciialpha(chars[i])
1127 || isdigit(chars[i])
1128 || (chars[i] == '#')))
1129 i++;
1131 /* This prevents bug 213: we were expanding "entities"
1132 * in URL query strings. */
1133 /* XXX: But this disables &nbsp&nbsp usage, which
1134 * appears to be relatively common! --pasky */
1135 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1136 && i > start
1137 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1138 translit = get_entity_string(&chars[start], i - start,
1139 cp);
1140 if (chars[i] != ';') {
1141 /* Eat &nbsp &nbsp<foo> happily, but
1142 * pull back from the character after
1143 * entity string if it is not the valid
1144 * terminator. */
1145 i--;
1148 if (!translit) PUTC;
1149 charspos = i + (i < charslen);
1150 } else PUTC;
1153 if (!translit[0]) continue;
1155 if (!translit[1]) {
1156 buffer[bufferpos++] = translit[0];
1157 translit = "";
1158 goto flush;
1161 while (*translit) {
1162 unsigned char *new;
1164 buffer[bufferpos++] = *(translit++);
1165 flush:
1166 if (bufferpos & (ALLOC_GR - 1)) continue;
1168 if (callback) {
1169 buffer[bufferpos] = 0;
1170 callback(callback_data, buffer, bufferpos);
1171 bufferpos = 0;
1172 } else {
1173 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1174 if (!new) {
1175 mem_free(buffer);
1176 return NULL;
1178 buffer = new;
1181 #undef PUTC
1184 /* Say bye */
1186 buffer[bufferpos] = 0;
1187 if (length) *length = bufferpos;
1189 if (callback) {
1190 if (bufferpos) callback(callback_data, buffer, bufferpos);
1191 mem_free(buffer);
1192 return NULL;
1193 } else {
1194 return buffer;
1199 #ifndef USE_FASTFIND
1201 get_cp_index(unsigned char *name)
1203 int i, a;
1204 int syscp = 0;
1206 if (!strcasecmp(name, "System")) {
1207 #if HAVE_LANGINFO_CODESET
1208 name = nl_langinfo(CODESET);
1209 syscp = SYSTEM_CHARSET_FLAG;
1210 #else
1211 name = "us-ascii";
1212 #endif
1215 for (i = 0; codepages[i].name; i++) {
1216 for (a = 0; codepages[i].aliases[a]; a++) {
1217 /* In the past, we looked for the longest substring
1218 * in all the names; it is way too expensive, though:
1220 * % cumulative self self total
1221 * time seconds seconds calls us/call us/call name
1222 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1224 * Anything called from redraw_screen() is in fact
1225 * relatively expensive, even if it's called just
1226 * once. So we will do a simple strcasecmp() here.
1229 if (!strcasecmp(name, codepages[i].aliases[a]))
1230 return i | syscp;
1234 if (syscp) {
1235 return get_cp_index("us-ascii") | syscp;
1236 } else {
1237 return -1;
1241 #else
/* Cursor of the charset alias enumeration: index of the codepage and of
 * the alias within it that charsets_list_next() returns next. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Reset internal list pointer */
void
charsets_list_reset(void)
{
	i_name = 0;
	i_alias = 0;
}
1254 /* Returns a pointer to a struct that contains current key and data pointers
1255 * and increment internal pointer. It returns NULL when key is NULL. */
1256 struct fastfind_key_value *
1257 charsets_list_next(void)
1259 static struct fastfind_key_value kv;
1261 if (!codepages[i_name].name) return NULL;
1263 kv.key = codepages[i_name].aliases[i_alias];
1264 kv.data = &codepages[i_name];
1266 if (codepages[i_name].aliases[i_alias + 1])
1267 i_alias++;
1268 else {
1269 i_name++;
1270 i_alias = 0;
1273 return &kv;
1276 static struct fastfind_index ff_charsets_index
1277 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1279 /* It searchs for a charset named @name or one of its aliases and
1280 * returns index for it or -1 if not found. */
1282 get_cp_index(unsigned char *name)
1284 struct codepage_desc *codepage;
1285 int syscp = 0;
1287 if (!strcasecmp(name, "System")) {
1288 #if HAVE_LANGINFO_CODESET
1289 name = nl_langinfo(CODESET);
1290 syscp = SYSTEM_CHARSET_FLAG;
1291 #else
1292 name = "us-ascii";
1293 #endif
1296 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1297 if (codepage) {
1298 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1299 return (codepage - codepages) | syscp;
1301 } else if (syscp) {
1302 return get_cp_index("us-ascii") | syscp;
1304 } else {
1305 return -1;
1309 #endif /* USE_FASTFIND */
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Release the fastfind index built by init_charsets_lookup().  Does
 * nothing when ELinks is built without USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_charsets_index);
#endif
}
1327 unsigned char *
1328 get_cp_name(int cp_index)
1330 if (cp_index < 0) return "none";
1331 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1333 return codepages[cp_index].name;
1336 unsigned char *
1337 get_cp_mime_name(int cp_index)
1339 if (cp_index < 0) return "none";
1340 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1341 if (!codepages[cp_index].aliases) return NULL;
1343 return codepages[cp_index].aliases[0];
1347 is_cp_utf8(int cp_index)
1349 cp_index &= ~SYSTEM_CHARSET_FLAG;
1350 return codepages[cp_index].table == table_utf_8;