test: The CGI script showing POST DATA sent to it.
[elinks.git] / src / intl / charsets.c
blobc8702927faa6886a3bb82f5da403eb77a9a48484
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #include "elinks.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
/* One additional (codepage byte, Unicode character) pair.  Arrays of these
 * are walked until an element whose @c is zero is reached. */
struct table_entry {
	/* The byte value in the codepage. */
	unsigned char c;

	/* The Unicode character paired with @c.  In principle this should be
	 * unicode_val_T, but every value currently in codepage.inc fits in
	 * 16 bits, so uint16_t is used to halve sizeof(struct table_entry)
	 * from 8 bytes to 4.  Should larger characters ever be needed,
	 * "unicode_val_T u : 24" might work, although it is a little
	 * unportable: bit-fields are in principle restricted to int, which
	 * may be only 16 bits wide. */
	uint16_t u;
};
/* Description of one codepage known to this file. */
struct codepage_desc {
	unsigned char *name;
	unsigned char *const *aliases;

	/* Unicode mappings of the codepage bytes 0x80...0xFF; bytes
	 * 0x00...0x7F are assumed to be ASCII in every codepage.  All
	 * current values fit in 16 bits, hence uint16_t instead of
	 * unicode_val_T.  A byte the codepage does not use is mapped to
	 * 0xFFFF, which the C code turns into UCS_REPLACEMENT_CHARACTER
	 * where appropriate.  (U+FFFF is reserved and will never be
	 * assigned as a character.) */
	const uint16_t *highhalf;

	/* When one codepage byte corresponds to several Unicode characters,
	 * the preferred character is stored in @highhalf and the remaining
	 * ones are listed here.  This table is not consulted when
	 * translating from the codepage to Unicode. */
	const struct table_entry *table;
};
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
/* strings[b] is the one-character, NUL-terminated string consisting of the
 * byte b.  The conversion code uses it as the identity mapping for ASCII and
 * as the output string for a single codepage byte.
 *
 * Entries 0x17 ("\027") and 0x1F ("\037") previously both held "\033" (ESC),
 * which silently turned those two control characters into ESC; every entry
 * now maps to its own index. */
static const char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\027",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\037",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
108 static void
109 free_translation_table(struct conv_table *p)
111 int i;
113 for (i = 0; i < 256; i++)
114 if (p[i].t)
115 free_translation_table(p[i].u.tbl);
117 mem_free(p);
120 /* A string used in conversion tables when there is no correct
121 * conversion. This is compared by address and therefore should be a
122 * named array rather than a pointer so that it won't share storage
123 * with any other string literal that happens to have the same
124 * characters. */
125 static const unsigned char no_str[] = "*";
127 static void
128 new_translation_table(struct conv_table *p)
130 int i;
132 for (i = 0; i < 256; i++)
133 if (p[i].t)
134 free_translation_table(p[i].u.tbl);
135 for (i = 0; i < 128; i++) {
136 p[i].t = 0;
137 p[i].u.str = strings[i];
139 for (; i < 256; i++) {
140 p[i].t = 0;
141 p[i].u.str = no_str;
/* Binary search in @table, an array of @entries structures sorted in
 * ascending order by their member @entry.  On success @result receives the
 * index of an element whose @entry equals @key; otherwise it receives -1.
 * @table and @key are evaluated several times. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
	long _hit = -1;							\
									\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((table)[_mid].entry == (key)) {			\
			_hit = _mid;					\
			break;						\
		} else if ((table)[_mid].entry > (key)) {		\
			_hi = _mid - 1;					\
		} else if ((table)[_mid].entry < (key)) {		\
			_lo = _mid + 1;					\
		}							\
	}								\
	(result) = _hit;						\
}
161 static const unicode_val_T strange_chars[32] = {
162 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
163 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
164 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
165 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
168 #define SYSTEM_CHARSET_FLAG 128
169 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
171 const unsigned char *
172 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
174 int j;
175 int s;
177 if (u < 128) return strings[u];
179 to &= ~SYSTEM_CHARSET_FLAG;
181 #ifdef CONFIG_UTF8
182 if (is_cp_ptr_utf8(&codepages[to]))
183 return encode_utf8(u);
184 #endif /* CONFIG_UTF8 */
186 /* To mark non breaking spaces in non-UTF-8 strings, we use a
187 * special char NBSP_CHAR. */
188 if (u == 0xa0) {
189 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
190 else /* NBSP_MODE_ASCII */ return " ";
192 if (u == 0xad) return "";
194 if (u < 0xa0) {
195 unicode_val_T strange = strange_chars[u - 0x80];
197 if (!strange) return NULL;
198 return u2cp_(strange, to, nbsp_mode);
201 if (u < 0xFFFF)
202 for (j = 0; j < 0x80; j++)
203 if (codepages[to].highhalf[j] == u)
204 return strings[0x80 + j];
205 for (j = 0; codepages[to].table[j].c; j++)
206 if (codepages[to].table[j].u == u)
207 return strings[codepages[to].table[j].c];
209 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
210 if (s != -1) return unicode_7b[s].s;
212 return no_str;
215 static unsigned char utf_buffer[7];
217 #ifdef CONFIG_UTF8
218 inline unsigned char *
219 encode_utf8(unicode_val_T u)
220 #else
221 static unsigned char *
222 encode_utf8(unicode_val_T u)
223 #endif /* CONFIG_UTF8 */
225 memset(utf_buffer, 0, 7);
227 if (u < 0x80)
228 utf_buffer[0] = u;
229 else if (u < 0x800)
230 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
231 utf_buffer[1] = 0x80 | (u & 0x3f);
232 else if (u < 0x10000)
233 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
234 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
235 utf_buffer[2] = 0x80 | (u & 0x3f);
236 else if (u < 0x200000)
237 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
238 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
239 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
240 utf_buffer[3] = 0x80 | (u & 0x3f);
241 else if (u < 0x4000000)
242 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
243 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
244 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
245 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
246 utf_buffer[4] = 0x80 | (u & 0x3f);
247 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
248 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
249 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
250 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
251 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
252 utf_buffer[5] = 0x80 | (u & 0x3f);
254 return utf_buffer;
257 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.  Bytes
 * that cannot start a character (0x80...0xBF, 0xFE and 0xFF) are given the
 * length 1 here; code that needs to detect them has to check separately. */
static const char utf8char_len_tab[256] = {
	/* 0x00...0x7F: single-byte characters. */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80...0xBF: continuation bytes. */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xC0...0xDF: lead bytes of two-byte characters. */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xE0...0xFF: lead bytes of three- to six-byte characters, then
	 * the two bytes 0xFE and 0xFF. */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
271 inline int utf8charlen(const unsigned char *p)
273 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters in the NUL-terminated string *@str.  On return
 * *@str points just past the last character that fits completely before
 * the terminating NUL. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *limit = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int step = utf8charlen(pos);

		if (pos + step > limit)
			break;
		pos += step;
		count++;
	}

	*str = pos;
	return count;
}
/* True if byte @p is a complete single-byte (ASCII) character. */
#define utf8_issingle(p) (!((p) & 0x80))
/* True if byte @p can begin a character, i.e. it is not a continuation
 * byte of the form 10xxxxxx. */
#define utf8_islead(p) (utf8_issingle(p) || !(((p) & 0xc0) ^ 0xc0))
/* Move back @pos characters from @current, never going further left than
 * @start, and return the resulting pointer.  Returns NULL if @current or
 * @start is NULL or @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (!current || !start || pos < 0)
		return NULL;

	for (; pos > 0 && current != start; ) {
		--current;
		/* Only lead bytes count as a character. */
		if (utf8_islead(*current))
			--pos;
	}

	return current;
}
310 /* Count number of standard terminal cells needed for displaying UTF-8
311 * character. */
313 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
315 unicode_val_T u;
317 if (end == NULL)
318 end = strchr(utf8_char, '\0');
320 if(!utf8_char || !end)
321 return -1;
323 u = utf8_to_unicode(&utf8_char, end);
325 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed for displaying the
 * UTF-8 string from @string up to @end.
 *
 * If @end is NULL, @string is assumed to be NUL-terminated.  A trailing
 * character that does not fit completely before @end is not counted.
 * Returns -1 if @string is NULL or a character width cannot be
 * determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* This check must come before strchr(), which dereferences the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	} while (1);

	return cells;
}
/* Count the number of characters in the UTF-8 string from @string up to
 * @end.
 *
 * If @end is NULL, @string is assumed to be NUL-terminated.  A trailing
 * character that does not fit completely before @end is not counted.
 * Returns -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* This check must come before strchr(), which dereferences the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	} while (1);

	return chars;
}
/*
 * Count the number of bytes, from the beginning of @string, needed for
 * displaying the specified number of cells.
 *
 * If @end is NULL, @string is assumed to be NUL-terminated.  The result
 * never exceeds @end - @string.  Returns -1 if @string is NULL or a
 * character width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* This check must come before strchr(), which dereferences the
	 * pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	do {
		int cell = utf8_char2cells(&string[bytes], end);
		if (cell < 0)
			return -1;

		cells += cell;
		/* Same unsigned comparison as before, with the conversion
		 * of @max_cells spelled out. */
		if (cells > (unsigned int) max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	} while(1);

	return bytes;
}
418 /* Take @max steps forward from @string in the specified @way, but
419 * not going past @end. Return the resulting address. Store the
420 * number of steps taken to *@count, unless @count is NULL.
422 * This assumes the text is valid UTF-8, and @string and @end point to
423 * character boundaries. If not, it doesn't crash but the results may
424 * be inconsistent.
426 * This function can do some of the same jobs as utf8charlen(),
427 * utf8_cells2bytes(), and strlen_utf8(). */
428 unsigned char *
429 utf8_step_forward(unsigned char *string, unsigned char *end,
430 int max, enum utf8_step way, int *count)
432 int steps = 0;
433 unsigned char *current = string;
435 assert(string);
436 assert(max >= 0);
437 if_assert_failed goto invalid_arg;
438 if (end == NULL)
439 end = strchr(string, '\0');
441 switch (way) {
442 case UTF8_STEP_CHARACTERS:
443 while (steps < max && current < end) {
444 ++current;
445 if (utf8_islead(*current))
446 ++steps;
448 break;
450 case UTF8_STEP_CELLS_FEWER:
451 case UTF8_STEP_CELLS_MORE:
452 while (steps < max) {
453 unicode_val_T u;
454 unsigned char *prev = current;
455 int width;
457 u = utf8_to_unicode(&current, end);
458 if (u == UCS_NO_CHAR) {
459 /* Assume the incomplete sequence
460 * costs one cell. */
461 current = end;
462 ++steps;
463 break;
466 width = unicode_to_cell(u);
467 if (way == UTF8_STEP_CELLS_FEWER
468 && steps + width > max) {
469 /* Back off. */
470 current = prev;
471 break;
473 steps += width;
475 break;
477 default:
478 INTERNAL("impossible enum utf8_step");
481 invalid_arg:
482 if (count)
483 *count = steps;
484 return current;
487 /* Take @max steps backward from @string in the specified @way, but
488 * not going past @start. Return the resulting address. Store the
489 * number of steps taken to *@count, unless @count is NULL.
491 * This assumes the text is valid UTF-8, and @string and @start point
492 * to character boundaries. If not, it doesn't crash but the results
493 * may be inconsistent.
495 * This function can do some of the same jobs as utf8_prevchar(). */
496 unsigned char *
497 utf8_step_backward(unsigned char *string, unsigned char *start,
498 int max, enum utf8_step way, int *count)
500 int steps = 0;
501 unsigned char *current = string;
503 assert(string);
504 assert(start);
505 assert(max >= 0);
506 if_assert_failed goto invalid_arg;
508 switch (way) {
509 case UTF8_STEP_CHARACTERS:
510 while (steps < max && current > start) {
511 --current;
512 if (utf8_islead(*current))
513 ++steps;
515 break;
517 case UTF8_STEP_CELLS_FEWER:
518 case UTF8_STEP_CELLS_MORE:
519 while (steps < max) {
520 unsigned char *prev = current;
521 unsigned char *look;
522 unicode_val_T u;
523 int width;
525 if (current <= start)
526 break;
527 do {
528 --current;
529 } while (current > start && !utf8_islead(*current));
531 look = current;
532 u = utf8_to_unicode(&look, prev);
533 if (u == UCS_NO_CHAR) {
534 /* Assume the incomplete sequence
535 * costs one cell. */
536 width = 1;
537 } else
538 width = unicode_to_cell(u);
540 if (way == UTF8_STEP_CELLS_FEWER
541 && steps + width > max) {
542 /* Back off. */
543 current = prev;
544 break;
546 steps += width;
548 break;
550 default:
551 INTERNAL("impossible enum utf8_step");
554 invalid_arg:
555 if (count)
556 *count = steps;
557 return current;
561 * Find out number of standard terminal collumns needed for displaying symbol
562 * (glyph) which represents Unicode character c.
564 * TODO: Use wcwidth when it is available. This seems to require:
565 * - Make the configure script check whether <wchar.h> and wcwidth exist.
566 * - Define _XOPEN_SOURCE and include <wchar.h>.
567 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
568 * matches ISO 10646 in all locales.)
569 * However, these do not suffice, because wcwidth depends on LC_CTYPE
570 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
571 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
572 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
573 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
574 * character is apparently not supported in all locales. Why is that?
575 * - Perhaps there is standardese that requires supported characters
576 * to be convertable to multibyte form. Then ELinks could just pick
577 * some UTF-8 locale for its wcwidth purposes.
578 * - Perhaps wcwidth can even return different nonnegative values for
579 * the same ISO 10646 character in different locales. Then ELinks
580 * would have to set LC_CTYPE to match at least the terminal's
581 * charset (which may differ from the LC_CTYPE environment variable,
582 * especially when the master process is serving a slave terminal).
583 * But there is no guarantee that the libc supports all the same
584 * charsets as ELinks does.
585 * For now, it seems safest to avoid the potentially locale-dependent
586 * libc version of wcwidth, and instead use a hardcoded mapping.
588 * @return 2 for double-width glyph, 1 for others.
589 * TODO: May be extended to return 0 for zero-width glyphs
590 * (like composing, maybe unprintable too).
592 inline int
593 unicode_to_cell(unicode_val_T c)
595 if (c >= 0x1100
596 && (c <= 0x115f /* Hangul Jamo */
597 || c == 0x2329
598 || c == 0x232a
599 || (c >= 0x2e80 && c <= 0xa4cf
600 && c != 0x303f) /* CJK ... Yi */
601 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
602 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
603 Ideographs */
604 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
605 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
606 || (c >= 0xffe0 && c <= 0xffe6)
607 || (c >= 0x20000 && c <= 0x2fffd)
608 || (c >= 0x30000 && c <= 0x3fffd)))
609 return 2;
611 return 1;
614 /* Fold the case of a Unicode character, so that hotkeys in labels can
615 * be compared case-insensitively. It is unspecified whether the
616 * result will be in upper or lower case. */
617 unicode_val_T
618 unicode_fold_label_case(unicode_val_T c)
620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
621 return towlower(c);
622 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
623 /* For now, this supports only ASCII. It would be possible to
624 * use code generated from CaseFolding.txt of Unicode if the
625 * acknowledgements required by http://www.unicode.org/copyright.html
626 * were added to associated documentation of ELinks. */
627 if (c >= 0x41 && c <= 0x5A)
628 return c + 0x20;
629 else
630 return c;
631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
634 inline unicode_val_T
635 utf8_to_unicode(unsigned char **string, unsigned char *end)
637 unsigned char *str = *string;
638 unicode_val_T u;
639 int length;
641 length = utf8char_len_tab[str[0]];
643 if (str + length > end) {
644 return UCS_NO_CHAR;
647 switch (length) {
648 case 1: /* U+0000 to U+007F */
649 if (str[0] >= 0x80) {
650 invalid_utf8:
651 ++*string;
652 return UCS_REPLACEMENT_CHARACTER;
654 u = str[0];
655 break;
656 case 2: /* U+0080 to U+07FF */
657 if ((str[1] & 0xc0) != 0x80)
658 goto invalid_utf8;
659 u = (str[0] & 0x1f) << 6;
660 u += (str[1] & 0x3f);
661 if (u < 0x80)
662 goto invalid_utf8;
663 break;
664 case 3: /* U+0800 to U+FFFF, except surrogates */
665 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
666 goto invalid_utf8;
667 u = (str[0] & 0x0f) << 12;
668 u += ((str[1] & 0x3f) << 6);
669 u += (str[2] & 0x3f);
670 if (u < 0x800 || is_utf16_surrogate(u))
671 goto invalid_utf8;
672 break;
673 case 4: /* U+10000 to U+1FFFFF */
674 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
675 || (str[3] & 0xc0) != 0x80)
676 goto invalid_utf8;
677 u = (str[0] & 0x0f) << 18;
678 u += ((str[1] & 0x3f) << 12);
679 u += ((str[2] & 0x3f) << 6);
680 u += (str[3] & 0x3f);
681 if (u < 0x10000)
682 goto invalid_utf8;
683 break;
684 case 5: /* U+200000 to U+3FFFFFF */
685 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
686 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
687 goto invalid_utf8;
688 u = (str[0] & 0x0f) << 24;
689 u += ((str[1] & 0x3f) << 18);
690 u += ((str[2] & 0x3f) << 12);
691 u += ((str[3] & 0x3f) << 6);
692 u += (str[4] & 0x3f);
693 if (u < 0x200000)
694 goto invalid_utf8;
695 break;
696 case 6: /* U+4000000 to U+7FFFFFFF */
697 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
698 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
699 || (str[5] & 0xc0) != 0x80)
700 goto invalid_utf8;
701 u = (str[0] & 0x01) << 30;
702 u += ((str[1] & 0x3f) << 24);
703 u += ((str[2] & 0x3f) << 18);
704 u += ((str[3] & 0x3f) << 12);
705 u += ((str[4] & 0x3f) << 6);
706 u += (str[5] & 0x3f);
707 if (u < 0x4000000)
708 goto invalid_utf8;
709 break;
710 default:
711 INTERNAL("utf8char_len_tab out of range");
712 goto invalid_utf8;
714 *string = str + length;
715 return u;
717 #endif /* CONFIG_UTF8 */
719 /* The common part of cp2u and cp2utf_8. */
720 static unicode_val_T
721 cp2u_shared(const struct codepage_desc *from, unsigned char c)
723 unicode_val_T u = from->highhalf[c - 0x80];
725 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
726 return u;
729 /* Used for converting input from the terminal. */
730 unicode_val_T
731 cp2u(int from, unsigned char c)
733 from &= ~SYSTEM_CHARSET_FLAG;
735 /* UTF-8 is a multibyte codepage and cannot be handled with
736 * this function. */
737 assert(!is_cp_ptr_utf8(&codepages[from]));
738 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
740 if (c < 0x80) return c;
741 else return cp2u_shared(&codepages[from], c);
744 /* This slow and ugly code is used by the terminal utf_8_io */
745 const unsigned char *
746 cp2utf8(int from, int c)
748 from &= ~SYSTEM_CHARSET_FLAG;
750 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
751 return strings[c];
753 return encode_utf8(cp2u_shared(&codepages[from], c));
#ifdef CONFIG_UTF8
/* Read one character of @codepage from *@string, stopping before @end, and
 * return it as Unicode.  For UTF-8 this defers to utf8_to_unicode();
 * otherwise one byte is consumed, or UCS_NO_CHAR is returned when *@string
 * has already reached @end. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unicode_val_T value;

	if (is_cp_utf8(codepage))
		return utf8_to_unicode(string, end);

	if (*string >= end)
		return UCS_NO_CHAR;

	value = cp2u(codepage, **string);
	(*string)++;
	return value;
}
#endif /* CONFIG_UTF8 */
775 static void
776 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
778 unsigned char *p = encode_utf8(u);
780 while (p[1]) {
781 if (ct[*p].t) ct = ct[*p].u.tbl;
782 else {
783 struct conv_table *nct;
785 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
786 if_assert_failed return;
788 nct = mem_calloc(256, sizeof(*nct));
789 if (!nct) return;
790 new_translation_table(nct);
791 ct[*p].t = 1;
792 ct[*p].u.tbl = nct;
793 ct = nct;
795 p++;
798 assertm(!ct[*p].t, "bad utf encoding #2");
799 if_assert_failed return;
801 if (ct[*p].u.str == no_str)
802 ct[*p].u.str = str;
805 /* A conversion table from some charset to UTF-8.
806 * If it is from UTF-8 to UTF-8, it converts each byte separately.
807 * Unlike in other translation tables, the strings in elements 0x80 to
808 * 0xFF are allocated dynamically. */
809 struct conv_table utf_table[256];
810 int utf_table_init = 1;
812 static void
813 free_utf_table(void)
815 int i;
817 /* Cast away const. */
818 for (i = 128; i < 256; i++)
819 mem_free((unsigned char *) utf_table[i].u.str);
822 static struct conv_table *
823 get_translation_table_to_utf8(int from)
825 int i;
826 static int lfr = -1;
828 if (from == -1) return NULL;
829 from &= ~SYSTEM_CHARSET_FLAG;
830 if (from == lfr) return utf_table;
831 lfr = from;
832 if (utf_table_init)
833 memset(utf_table, 0, sizeof(utf_table)),
834 utf_table_init = 0;
835 else
836 free_utf_table();
838 for (i = 0; i < 128; i++)
839 utf_table[i].u.str = strings[i];
841 if (is_cp_ptr_utf8(&codepages[from])) {
842 for (i = 128; i < 256; i++)
843 utf_table[i].u.str = stracpy(strings[i]);
844 return utf_table;
847 for (i = 128; i < 256; i++) {
848 unicode_val_T u = codepages[from].highhalf[i - 0x80];
850 if (u == 0xFFFF)
851 utf_table[i].u.str = NULL;
852 else
853 utf_table[i].u.str = stracpy(encode_utf8(u));
856 for (i = 0; codepages[from].table[i].c; i++) {
857 unicode_val_T u = codepages[from].table[i].u;
859 if (!utf_table[codepages[from].table[i].c].u.str)
860 utf_table[codepages[from].table[i].c].u.str =
861 stracpy(encode_utf8(u));
864 for (i = 128; i < 256; i++)
865 if (!utf_table[i].u.str)
866 utf_table[i].u.str = stracpy(no_str);
868 return utf_table;
871 /* A conversion table between two charsets, where the target is not UTF-8. */
872 static struct conv_table table[256];
873 static int first = 1;
875 void
876 free_conv_table(void)
878 if (!utf_table_init) free_utf_table();
879 if (first) {
880 memset(table, 0, sizeof(table));
881 first = 0;
883 new_translation_table(table);
887 struct conv_table *
888 get_translation_table(int from, int to)
890 static int lfr = -1;
891 static int lto = -1;
893 from &= ~SYSTEM_CHARSET_FLAG;
894 to &= ~SYSTEM_CHARSET_FLAG;
895 if (first) {
896 memset(table, 0, sizeof(table));
897 first = 0;
899 if (/*from == to ||*/ from == -1 || to == -1)
900 return NULL;
901 if (is_cp_ptr_utf8(&codepages[to]))
902 return get_translation_table_to_utf8(from);
903 if (from == lfr && to == lto)
904 return table;
905 lfr = from;
906 lto = to;
907 new_translation_table(table);
909 if (is_cp_ptr_utf8(&codepages[from])) {
910 int i;
912 for (i = 0x80; i <= 0xFF; i++)
913 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
914 add_utf8(table,
915 codepages[to].highhalf[i - 0x80],
916 strings[i]);
918 for (i = 0; codepages[to].table[i].c; i++)
919 add_utf8(table, codepages[to].table[i].u,
920 strings[codepages[to].table[i].c]);
922 for (i = 0; unicode_7b[i].x != -1; i++)
923 if (unicode_7b[i].x >= 0x80)
924 add_utf8(table, unicode_7b[i].x,
925 unicode_7b[i].s);
927 } else {
928 int i;
930 for (i = 128; i < 256; i++) {
931 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
932 const unsigned char *u;
934 u = u2cp(codepages[from].highhalf[i - 0x80], to);
935 if (u) table[i].u.str = u;
940 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string @s2.
 * Returns 1 or -1 at the first differing byte, depending on which byte is
 * greater; if all @l2 bytes match, returns 0 when @s2 ends there as well
 * and -1 when @s2 is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
957 /* Entity cache debugging purpose. */
958 #if 0
959 #define DEBUG_ENTITY_CACHE
960 #else
961 #undef DEBUG_ENTITY_CACHE
962 #endif
/* One entry of the entity cache kept by get_entity_string(). */
struct entity_cache {
	/* How often this entry has been used; entries are sorted by it. */
	unsigned int hits;
	/* Length of the entity name in @str. */
	int strlen;
	/* The codepage that @result was computed for. */
	int encoding;
	/* The cached conversion result; may be NULL. */
	const unsigned char *result;
	/* The entity name, NUL-terminated. */
	unsigned char str[20]; /* Suffice in any case. */
};
972 static int
973 hits_cmp(struct entity_cache *a, struct entity_cache *b)
975 if (a->hits == b->hits) return 0;
976 if (a->hits > b->hits) return -1;
977 else return 1;
980 static int
981 compare_entities(const void *key_, const void *element_)
983 struct string *key = (struct string *) key_;
984 struct entity *element = (struct entity *) element_;
985 int length = key->length;
986 unsigned char *first = key->source;
987 unsigned char *second = element->s;
989 return xxstrcmp(first, second, length);
992 const unsigned char *
993 get_entity_string(const unsigned char *str, const int strlen, int encoding)
995 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
996 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
997 will go in [0] table */
998 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
999 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1000 static int first_time = 1;
1001 unsigned int slen = 0;
1002 const unsigned char *result = NULL;
1004 if (strlen <= 0) return NULL;
1006 #ifdef CONFIG_UTF8
1007 /* TODO: caching UTF-8 */
1008 encoding &= ~SYSTEM_CHARSET_FLAG;
1009 if (is_cp_ptr_utf8(&codepages[encoding]))
1010 goto skip;
1011 #endif /* CONFIG_UTF8 */
1013 if (first_time) {
1014 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1015 first_time = 0;
1018 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1019 * + google + slashdot + websites that result from a search for test on google,
1020 * + various ones) show a quite impressive improvment:
1021 * Top ten is:
1022 * 0: hits=2459 l=4 st='nbsp'
1023 * 1: hits=2152 l=6 st='eacute'
1024 * 2: hits=235 l=6 st='egrave'
1025 * 3: hits=136 l=6 st='agrave'
1026 * 4: hits=100 l=3 st='amp'
1027 * 5: hits=40 l=5 st='laquo'
1028 * 6: hits=8 l=4 st='copy'
1029 * 7: hits=5 l=2 st='gt'
1030 * 8: hits=2 l=2 st='lt'
1031 * 9: hits=1 l=6 st='middot'
1033 * Most of the time cache hit ratio is near 95%.
1035 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1036 * count is kept < 2 (worst case 1.58). Not so bad ;)
1038 * --Zas */
1040 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1041 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1043 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1044 int i;
1046 for (i = 0; i < nb_entity_cache[slen]; i++) {
1047 if (entity_cache[slen][i].encoding == encoding
1048 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1049 #ifdef DEBUG_ENTITY_CACHE
1050 static double total_iter = 0;
1051 static unsigned long hit_count = 0;
1053 total_iter += i + 1;
1054 hit_count++;
1055 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1056 #endif
1057 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1058 entity_cache[slen][i].hits++;
1059 return entity_cache[slen][i].result;
1062 #ifdef DEBUG_ENTITY_CACHE
1063 fprintf(stderr, "miss\n");
1064 #endif
1066 #ifdef CONFIG_UTF8
1067 skip:
1068 #endif /* CONFIG_UTF8 */
1069 if (*str == '#') { /* Numeric entity. */
1070 int l = (int) strlen;
1071 unsigned char *st = (unsigned char *) str;
1072 unicode_val_T n = 0;
1074 if (l == 1) goto end; /* &#; ? */
1075 st++, l--;
1076 if ((*st | 32) == 'x') { /* Hexadecimal */
1078 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1079 st++, l--;
1080 do {
1081 unsigned char c = (*(st++) | 32);
1083 if (isdigit(c))
1084 n = (n << 4) | (c - '0');
1085 else if (isxdigit(c))
1086 n = (n << 4) | (c - 'a' + 10);
1087 else
1088 goto end; /* Bad char. */
1089 } while (--l);
1090 } else { /* Decimal */
1091 if (l > 10) goto end; /* 4294967295 max. */
1092 do {
1093 unsigned char c = *(st++);
1095 if (isdigit(c))
1096 n = n * 10 + c - '0';
1097 else
1098 goto end; /* Bad char. */
1099 /* Limit to 0xFFFFFFFF. */
1100 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1101 goto end;
1102 } while (--l);
1105 result = u2cp(n, encoding);
1107 #ifdef DEBUG_ENTITY_CACHE
1108 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1109 #endif
1110 } else { /* Text entity. */
1111 struct string key = INIT_STRING((unsigned char *) str, strlen);
1112 struct entity *element = bsearch((void *) &key, entities,
1113 N_ENTITIES,
1114 sizeof(*element),
1115 compare_entities);
1117 if (element) result = u2cp(element->c, encoding);
1120 #ifdef CONFIG_UTF8
1121 if (is_cp_ptr_utf8(&codepages[encoding])) {
1122 return result;
1124 #endif /* CONFIG_UTF8 */
1125 end:
1126 /* Take care of potential buffer overflow. */
1127 if (strlen < sizeof(entity_cache[slen][0].str)) {
1128 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1130 /* Copy new entry to cache. */
1131 ece->hits = 1;
1132 ece->strlen = strlen;
1133 ece->encoding = encoding;
1134 ece->result = result;
1135 memcpy(ece->str, str, strlen);
1136 ece->str[strlen] = '\0';
1138 /* Increment number of cache entries if possible. */
1139 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1141 #ifdef DEBUG_ENTITY_CACHE
1142 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1143 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1145 #endif
1147 /* Sort entries by hit order. */
1148 if (nb_entity_cache[slen] > 1)
1149 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1150 sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1152 #ifdef DEBUG_ENTITY_CACHE
1154 unsigned int i;
1156 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1157 for (i = 0; i < nb_entity_cache[slen] ; i++)
1158 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1159 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1160 entity_cache[slen][i].str);
1161 fprintf(stderr, "-----------------\n");
1163 #endif
1165 return result;
1168 unsigned char *
1169 convert_string(struct conv_table *convert_table,
1170 unsigned char *chars, int charslen, int cp,
1171 enum convert_string_mode mode, int *length,
1172 void (*callback)(void *data, unsigned char *buf, int buflen),
1173 void *callback_data)
1175 unsigned char *buffer;
1176 int bufferpos = 0;
1177 int charspos = 0;
1179 if (!convert_table && !memchr(chars, '&', charslen)) {
1180 if (callback) {
1181 if (charslen) callback(callback_data, chars, charslen);
1182 return NULL;
1183 } else {
1184 return memacpy(chars, charslen);
1188 /* Buffer allocation */
1190 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1191 if (!buffer) return NULL;
1193 /* Iterate ;-) */
1195 while (charspos < charslen) {
1196 const unsigned char *translit;
1198 #define PUTC do { \
1199 buffer[bufferpos++] = chars[charspos++]; \
1200 translit = ""; \
1201 goto flush; \
1202 } while (0)
1204 if (chars[charspos] != '&') {
1205 struct conv_table *t;
1206 int i;
1208 if (chars[charspos] < 128 || !convert_table) PUTC;
1210 t = convert_table;
1211 i = charspos;
1213 while (t[chars[i]].t) {
1214 t = t[chars[i++]].u.tbl;
1215 if (i >= charslen) PUTC;
1218 translit = t[chars[i]].u.str;
1219 charspos = i + 1;
1221 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1222 PUTC;
1224 } else {
1225 int start = charspos + 1;
1226 int i = start;
1228 while (i < charslen
1229 && (isasciialpha(chars[i])
1230 || isdigit(chars[i])
1231 || (chars[i] == '#')))
1232 i++;
1234 /* This prevents bug 213: we were expanding "entities"
1235 * in URL query strings. */
1236 /* XXX: But this disables &nbsp&nbsp usage, which
1237 * appears to be relatively common! --pasky */
1238 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1239 && i > start
1240 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1241 translit = get_entity_string(&chars[start], i - start,
1242 cp);
1243 if (chars[i] != ';') {
1244 /* Eat &nbsp &nbsp<foo> happily, but
1245 * pull back from the character after
1246 * entity string if it is not the valid
1247 * terminator. */
1248 i--;
1251 if (!translit) PUTC;
1252 charspos = i + (i < charslen);
1253 } else PUTC;
1256 if (!translit[0]) continue;
1258 if (!translit[1]) {
1259 buffer[bufferpos++] = translit[0];
1260 translit = "";
1261 goto flush;
1264 while (*translit) {
1265 unsigned char *new;
1267 buffer[bufferpos++] = *(translit++);
1268 flush:
1269 if (bufferpos & (ALLOC_GR - 1)) continue;
1271 if (callback) {
1272 buffer[bufferpos] = 0;
1273 callback(callback_data, buffer, bufferpos);
1274 bufferpos = 0;
1275 } else {
1276 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1277 if (!new) {
1278 mem_free(buffer);
1279 return NULL;
1281 buffer = new;
1284 #undef PUTC
1287 /* Say bye */
1289 buffer[bufferpos] = 0;
1290 if (length) *length = bufferpos;
1292 if (callback) {
1293 if (bufferpos) callback(callback_data, buffer, bufferpos);
1294 mem_free(buffer);
1295 return NULL;
1296 } else {
1297 return buffer;
1302 #ifndef USE_FASTFIND
1304 get_cp_index(unsigned char *name)
1306 int i, a;
1307 int syscp = 0;
1309 if (!strcasecmp(name, "System")) {
1310 #if HAVE_LANGINFO_CODESET
1311 name = nl_langinfo(CODESET);
1312 syscp = SYSTEM_CHARSET_FLAG;
1313 #else
1314 name = "us-ascii";
1315 #endif
1318 for (i = 0; codepages[i].name; i++) {
1319 for (a = 0; codepages[i].aliases[a]; a++) {
1320 /* In the past, we looked for the longest substring
1321 * in all the names; it is way too expensive, though:
1323 * % cumulative self self total
1324 * time seconds seconds calls us/call us/call name
1325 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1327 * Anything called from redraw_screen() is in fact
1328 * relatively expensive, even if it's called just
1329 * once. So we will do a simple strcasecmp() here.
1332 if (!strcasecmp(name, codepages[i].aliases[a]))
1333 return i | syscp;
1337 if (syscp) {
1338 return get_cp_index("us-ascii") | syscp;
1339 } else {
1340 return -1;
1344 #else
/* Cursor used by charsets_list_reset() and charsets_list_next():
 * index of the current entry in codepages[] ... */
static unsigned int i_name;
/* ... and index of the current alias within that entry.  Both start
 * at zero by virtue of static storage duration. */
static unsigned int i_alias;
1349 /* Reset internal list pointer */
1350 void
1351 charsets_list_reset(void)
1353 i_name = 0;
1354 i_alias = 0;
1357 /* Returns a pointer to a struct that contains current key and data pointers
1358 * and increment internal pointer. It returns NULL when key is NULL. */
1359 struct fastfind_key_value *
1360 charsets_list_next(void)
1362 static struct fastfind_key_value kv;
1364 if (!codepages[i_name].name) return NULL;
1366 kv.key = codepages[i_name].aliases[i_alias];
1367 kv.data = (void *) &codepages[i_name]; /* cast away const */
1369 if (codepages[i_name].aliases[i_alias + 1])
1370 i_alias++;
1371 else {
1372 i_name++;
1373 i_alias = 0;
1376 return &kv;
1379 static struct fastfind_index ff_charsets_index
1380 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1382 /* It searchs for a charset named @name or one of its aliases and
1383 * returns index for it or -1 if not found. */
1385 get_cp_index(unsigned char *name)
1387 const struct codepage_desc *codepage;
1388 int syscp = 0;
1390 if (!strcasecmp(name, "System")) {
1391 #if HAVE_LANGINFO_CODESET
1392 name = nl_langinfo(CODESET);
1393 syscp = SYSTEM_CHARSET_FLAG;
1394 #else
1395 name = "us-ascii";
1396 #endif
1399 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1400 if (codepage) {
1401 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1402 return (codepage - codepages) | syscp;
1404 } else if (syscp) {
1405 return get_cp_index("us-ascii") | syscp;
1407 } else {
1408 return -1;
1412 #endif /* USE_FASTFIND */
/* Prepare charset name lookup.  When built with USE_FASTFIND this
 * builds the (compressed) fastfind index over all codepage aliases;
 * otherwise there is nothing to do. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Counterpart of init_charsets_lookup().  When built with USE_FASTFIND
 * this calls fastfind_done() on the charset index; otherwise there is
 * nothing to do. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif
}
1430 unsigned char *
1431 get_cp_name(int cp_index)
1433 if (cp_index < 0) return "none";
1434 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1436 return codepages[cp_index].name;
1439 unsigned char *
1440 get_cp_mime_name(int cp_index)
1442 if (cp_index < 0) return "none";
1443 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1444 if (!codepages[cp_index].aliases) return NULL;
1446 return codepages[cp_index].aliases[0];
1450 is_cp_utf8(int cp_index)
1452 cp_index &= ~SYSTEM_CHARSET_FLAG;
1453 return is_cp_ptr_utf8(&codepages[cp_index]);