1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
37 /* This should in principle be unicode_val_T, but because all
38 * the values currently in codepage.inc fit in 16 bits, we can
39 * as well use uint16_t and halve sizeof(struct table_entry)
40 * from 8 bytes to 4. Should other characters ever be needed,
41 * unicode_val_T u : 24 might be a possibility, although it
42 * seems a little unportable as bitfields are in principle
43 * restricted to int, which may be 16-bit. */
47 struct codepage_desc
{
49 unsigned char *const *aliases
;
51 /* The Unicode mappings of codepage bytes 0x80...0xFF.
52 * (0x00...0x7F are assumed to be ASCII in all codepages.)
53 * Because all current values fit in 16 bits, we store them as
54 * uint16_t rather than unicode_val_T. If the codepage does
55 * not use some byte, then @highhalf maps that byte to 0xFFFF,
56 * which C code converts to UCS_REPLACEMENT_CHARACTER where
57 * appropriate. (U+FFFF is reserved and will never be
58 * assigned as a character.) */
59 const uint16_t *highhalf
;
61 /* If some byte in the codepage corresponds to multiple Unicode
62 * characters, then the preferred character is in @highhalf
63 * above, and the rest are listed here in @table. This table
64 * is not used for translating from the codepage to Unicode. */
65 const struct table_entry
*table
;
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
73 static const char strings
[256][2] = {
74 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
75 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
76 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
77 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
78 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
79 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
80 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
81 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
82 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
83 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
84 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
85 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
86 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
87 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
88 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
89 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
90 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
91 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
92 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
93 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
94 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
95 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
96 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
97 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
98 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
99 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
100 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
101 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
102 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
103 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
104 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
105 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
109 free_translation_table(struct conv_table
*p
)
113 for (i
= 0; i
< 256; i
++)
115 free_translation_table(p
[i
].u
.tbl
);
120 /* A string used in conversion tables when there is no correct
121 * conversion. This is compared by address and therefore should be a
122 * named array rather than a pointer so that it won't share storage
123 * with any other string literal that happens to have the same
125 static const unsigned char no_str
[] = "*";
128 new_translation_table(struct conv_table
*p
)
132 for (i
= 0; i
< 256; i
++)
134 free_translation_table(p
[i
].u
.tbl
);
135 for (i
= 0; i
< 128; i
++) {
137 p
[i
].u
.str
= strings
[i
];
139 for (; i
< 256; i
++) {
145 #define BIN_SEARCH(table, entry, entries, key, result) \
147 long _s = 0, _e = (entries) - 1; \
149 while (_s <= _e || !((result) = -1)) { \
150 long _m = (_s + _e) / 2; \
152 if ((table)[_m].entry == (key)) { \
156 if ((table)[_m].entry > (key)) _e = _m - 1; \
157 if ((table)[_m].entry < (key)) _s = _m + 1; \
161 static const unicode_val_T strange_chars[32] = {
162 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
163 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
164 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
165 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that get_cp_index() ORs into the returned codepage index when the
 * charset was requested under the name "System". Functions in this file
 * clear it with "&= ~SYSTEM_CHARSET_FLAG" before using the value as an
 * index into codepages[]. */
168 #define SYSTEM_CHARSET_FLAG 128
/* True if the codepage descriptor pointed to by @cp_ptr is the UTF-8 one.
 * The test compares the descriptor's alias list by address with
 * aliases_utf8 (not defined in this part of the file). */
169 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
171 const unsigned char *
172 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
177 if (u
< 128) return strings
[u
];
179 to
&= ~SYSTEM_CHARSET_FLAG
;
182 if (is_cp_ptr_utf8(&codepages
[to
]))
183 return encode_utf8(u
);
184 #endif /* CONFIG_UTF8 */
186 /* To mark non breaking spaces in non-UTF-8 strings, we use a
187 * special char NBSP_CHAR. */
189 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
190 else /* NBSP_MODE_ASCII */ return " ";
192 if (u
== 0xad) return "";
195 unicode_val_T strange
= strange_chars
[u
- 0x80];
197 if (!strange
) return NULL
;
198 return u2cp_(strange
, to
, nbsp_mode
);
202 for (j
= 0; j
< 0x80; j
++)
203 if (codepages
[to
].highhalf
[j
] == u
)
204 return strings
[0x80 + j
];
205 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
206 if (codepages
[to
].table
[j
].u
== u
)
207 return strings
[codepages
[to
].table
[j
].c
];
209 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
210 if (s
!= -1) return unicode_7b
[s
].s
;
215 static unsigned char utf_buffer
[7];
218 inline unsigned char *
219 encode_utf8(unicode_val_T u
)
221 static unsigned char *
222 encode_utf8(unicode_val_T u
)
223 #endif /* CONFIG_UTF8 */
225 memset(utf_buffer
, 0, 7);
230 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
231 utf_buffer
[1] = 0x80 | (u
& 0x3f);
232 else if (u
< 0x10000)
233 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
234 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
235 utf_buffer
[2] = 0x80 | (u
& 0x3f);
236 else if (u
< 0x200000)
237 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
238 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
239 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
240 utf_buffer
[3] = 0x80 | (u
& 0x3f);
241 else if (u
< 0x4000000)
242 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
243 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
244 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
245 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
246 utf_buffer
[4] = 0x80 | (u
& 0x3f);
247 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
248 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
249 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
250 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
251 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
252 utf_buffer
[5] = 0x80 | (u
& 0x3f);
258 /* Number of bytes in a UTF-8 character, indexed by its first byte. Illegal
259 * first bytes are given length one and are handled differently. */
260 static const char utf8char_len_tab
[256] = {
261 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
262 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
263 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
264 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
265 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
266 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
267 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
268 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
271 inline int utf8charlen(const unsigned char *p
)
273 return p
? utf8char_len_tab
[*p
] : 0;
277 strlen_utf8(unsigned char **str
)
279 unsigned char *s
= *str
;
280 unsigned char *end
= strchr(s
, '\0');
284 for (x
= 0;; x
++, s
+= len
) {
285 len
= utf8charlen(s
);
286 if (s
+ len
> end
) break;
/* True if byte @p has its high bit clear, i.e. it is a one-byte (ASCII)
 * character. */
292 #define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can start a character: either a single-byte character or
 * a byte whose two top bits are both set. Bytes of the form 10xxxxxx
 * (continuation bytes) yield false. */
293 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
295 /* Start from @current and move back @pos characters. The resulting pointer
296 * is returned. The leftmost pointer allowed is @start. */
297 inline unsigned char *
298 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
300 if (current
== NULL
|| start
== NULL
|| pos
< 0)
302 while (pos
> 0 && current
!= start
) {
304 if (utf8_islead(*current
))
310 /* Count number of standard terminal cells needed for displaying UTF-8
313 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
318 end
= strchr(utf8_char
, '\0');
320 if(!utf8_char
|| !end
)
323 u
= utf8_to_unicode(&utf8_char
, end
);
325 return unicode_to_cell(u
);
328 /* Count number of standard terminal cells needed for displaying string
329 * with UTF-8 characters. */
331 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
333 int charlen
, cell
, cells
= 0;
336 end
= strchr(string
, '\0');
342 charlen
= utf8charlen(string
);
343 if (string
+ charlen
> end
)
346 cell
= utf8_char2cells(string
, end
);
357 /* Count number of characters in string. */
359 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
361 int charlen
, chars
= 0;
364 end
= strchr(string
, '\0');
370 charlen
= utf8charlen(string
);
371 if (string
+ charlen
> end
)
382 * Count number of bytes from the beginning of the string needed for displaying
383 * specified number of cells.
386 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
388 unsigned int bytes
= 0, cells
= 0;
390 assert(max_cells
>=0);
393 end
= strchr(string
, '\0');
399 int cell
= utf8_char2cells(&string
[bytes
], end
);
404 if (cells
> max_cells
)
407 bytes
+= utf8charlen(&string
[bytes
]);
409 if (string
+ bytes
> end
) {
410 bytes
= end
- string
;
418 /* Take @max steps forward from @string in the specified @way, but
419 * not going past @end. Return the resulting address. Store the
420 * number of steps taken to *@count, unless @count is NULL.
422 * This assumes the text is valid UTF-8, and @string and @end point to
423 * character boundaries. If not, it doesn't crash but the results may be inconsistent.
426 * This function can do some of the same jobs as utf8charlen(),
427 * utf8_cells2bytes(), and strlen_utf8(). */
429 utf8_step_forward(unsigned char *string
, unsigned char *end
,
430 int max
, enum utf8_step way
, int *count
)
433 unsigned char *current
= string
;
437 if_assert_failed
goto invalid_arg
;
439 end
= strchr(string
, '\0');
442 case UTF8_STEP_CHARACTERS
:
443 while (steps
< max
&& current
< end
) {
445 if (utf8_islead(*current
))
450 case UTF8_STEP_CELLS_FEWER
:
451 case UTF8_STEP_CELLS_MORE
:
452 while (steps
< max
) {
454 unsigned char *prev
= current
;
457 u
= utf8_to_unicode(¤t
, end
);
458 if (u
== UCS_NO_CHAR
) {
459 /* Assume the incomplete sequence
466 width
= unicode_to_cell(u
);
467 if (way
== UTF8_STEP_CELLS_FEWER
468 && steps
+ width
> max
) {
478 INTERNAL("impossible enum utf8_step");
487 /* Take @max steps backward from @string in the specified @way, but
488 * not going past @start. Return the resulting address. Store the
489 * number of steps taken to *@count, unless @count is NULL.
491 * This assumes the text is valid UTF-8, and @string and @start point
492 * to character boundaries. If not, it doesn't crash but the results
493 * may be inconsistent.
495 * This function can do some of the same jobs as utf8_prevchar(). */
497 utf8_step_backward(unsigned char *string
, unsigned char *start
,
498 int max
, enum utf8_step way
, int *count
)
501 unsigned char *current
= string
;
506 if_assert_failed
goto invalid_arg
;
509 case UTF8_STEP_CHARACTERS
:
510 while (steps
< max
&& current
> start
) {
512 if (utf8_islead(*current
))
517 case UTF8_STEP_CELLS_FEWER
:
518 case UTF8_STEP_CELLS_MORE
:
519 while (steps
< max
) {
520 unsigned char *prev
= current
;
525 if (current
<= start
)
529 } while (current
> start
&& !utf8_islead(*current
));
532 u
= utf8_to_unicode(&look
, prev
);
533 if (u
== UCS_NO_CHAR
) {
534 /* Assume the incomplete sequence
538 width
= unicode_to_cell(u
);
540 if (way
== UTF8_STEP_CELLS_FEWER
541 && steps
+ width
> max
) {
551 INTERNAL("impossible enum utf8_step");
561 * Find out number of standard terminal columns needed for displaying symbol
562 * (glyph) which represents Unicode character c.
564 * TODO: Use wcwidth when it is available. This seems to require:
565 * - Make the configure script check whether <wchar.h> and wcwidth exist.
566 * - Define _XOPEN_SOURCE and include <wchar.h>.
567 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
568 * matches ISO 10646 in all locales.)
569 * However, these do not suffice, because wcwidth depends on LC_CTYPE
570 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
571 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
572 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
573 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
574 * character is apparently not supported in all locales. Why is that?
575 * - Perhaps there is standardese that requires supported characters
576 * to be convertible to multibyte form. Then ELinks could just pick
577 * some UTF-8 locale for its wcwidth purposes.
578 * - Perhaps wcwidth can even return different nonnegative values for
579 * the same ISO 10646 character in different locales. Then ELinks
580 * would have to set LC_CTYPE to match at least the terminal's
581 * charset (which may differ from the LC_CTYPE environment variable,
582 * especially when the master process is serving a slave terminal).
583 * But there is no guarantee that the libc supports all the same
584 * charsets as ELinks does.
585 * For now, it seems safest to avoid the potentially locale-dependent
586 * libc version of wcwidth, and instead use a hardcoded mapping.
588 * @return 2 for double-width glyph, 1 for others.
589 * TODO: May be extended to return 0 for zero-width glyphs
590 * (like composing, maybe unprintable too).
593 unicode_to_cell(unicode_val_T c
)
596 && (c
<= 0x115f /* Hangul Jamo */
599 || (c
>= 0x2e80 && c
<= 0xa4cf
600 && c
!= 0x303f) /* CJK ... Yi */
601 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
602 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
604 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
605 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
606 || (c
>= 0xffe0 && c
<= 0xffe6)
607 || (c
>= 0x20000 && c
<= 0x2fffd)
608 || (c
>= 0x30000 && c
<= 0x3fffd)))
614 /* Fold the case of a Unicode character, so that hotkeys in labels can
615 * be compared case-insensitively. It is unspecified whether the
616 * result will be in upper or lower case. */
618 unicode_fold_label_case(unicode_val_T c
)
620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
622 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
623 /* For now, this supports only ASCII. It would be possible to
624 * use code generated from CaseFolding.txt of Unicode if the
625 * acknowledgements required by http://www.unicode.org/copyright.html
626 * were added to associated documentation of ELinks. */
627 if (c
>= 0x41 && c
<= 0x5A)
631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
635 utf8_to_unicode(unsigned char **string
, unsigned char *end
)
637 unsigned char *str
= *string
;
641 length
= utf8char_len_tab
[str
[0]];
643 if (str
+ length
> end
) {
648 case 1: /* U+0000 to U+007F */
649 if (str
[0] >= 0x80) {
652 return UCS_REPLACEMENT_CHARACTER
;
656 case 2: /* U+0080 to U+07FF */
657 if ((str
[1] & 0xc0) != 0x80)
659 u
= (str
[0] & 0x1f) << 6;
660 u
+= (str
[1] & 0x3f);
664 case 3: /* U+0800 to U+FFFF, except surrogates */
665 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80)
667 u
= (str
[0] & 0x0f) << 12;
668 u
+= ((str
[1] & 0x3f) << 6);
669 u
+= (str
[2] & 0x3f);
670 if (u
< 0x800 || is_utf16_surrogate(u
))
673 case 4: /* U+10000 to U+1FFFFF */
674 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
675 || (str
[3] & 0xc0) != 0x80)
677 u
= (str
[0] & 0x0f) << 18;
678 u
+= ((str
[1] & 0x3f) << 12);
679 u
+= ((str
[2] & 0x3f) << 6);
680 u
+= (str
[3] & 0x3f);
684 case 5: /* U+200000 to U+3FFFFFF */
685 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
686 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80)
688 u
= (str
[0] & 0x0f) << 24;
689 u
+= ((str
[1] & 0x3f) << 18);
690 u
+= ((str
[2] & 0x3f) << 12);
691 u
+= ((str
[3] & 0x3f) << 6);
692 u
+= (str
[4] & 0x3f);
696 case 6: /* U+4000000 to U+7FFFFFFF */
697 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
698 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80
699 || (str
[5] & 0xc0) != 0x80)
701 u
= (str
[0] & 0x01) << 30;
702 u
+= ((str
[1] & 0x3f) << 24);
703 u
+= ((str
[2] & 0x3f) << 18);
704 u
+= ((str
[3] & 0x3f) << 12);
705 u
+= ((str
[4] & 0x3f) << 6);
706 u
+= (str
[5] & 0x3f);
711 INTERNAL("utf8char_len_tab out of range");
714 *string
= str
+ length
;
717 #endif /* CONFIG_UTF8 */
719 /* The common part of cp2u and cp2utf_8. */
721 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
723 unicode_val_T u
= from
->highhalf
[c
- 0x80];
725 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
729 /* Used for converting input from the terminal. */
731 cp2u(int from
, unsigned char c
)
733 from
&= ~SYSTEM_CHARSET_FLAG
;
735 /* UTF-8 is a multibyte codepage and cannot be handled with
737 assert(!is_cp_ptr_utf8(&codepages
[from
]));
738 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
740 if (c
< 0x80) return c
;
741 else return cp2u_shared(&codepages
[from
], c
);
744 /* This slow and ugly code is used by the terminal utf_8_io */
745 const unsigned char *
746 cp2utf8(int from
, int c
)
748 from
&= ~SYSTEM_CHARSET_FLAG
;
750 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
753 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
758 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
762 if (is_cp_utf8(codepage
))
763 return utf8_to_unicode(string
, end
);
768 ret
= cp2u(codepage
, **string
);
772 #endif /* CONFIG_UTF8 */
776 add_utf8(struct conv_table
*ct
, unicode_val_T u
, const unsigned char *str
)
778 unsigned char *p
= encode_utf8(u
);
781 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
783 struct conv_table
*nct
;
785 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
786 if_assert_failed
return;
788 nct
= mem_calloc(256, sizeof(*nct
));
790 new_translation_table(nct
);
798 assertm(!ct
[*p
].t
, "bad utf encoding #2");
799 if_assert_failed
return;
801 if (ct
[*p
].u
.str
== no_str
)
805 /* A conversion table from some charset to UTF-8.
806 * If it is from UTF-8 to UTF-8, it converts each byte separately.
807 * Unlike in other translation tables, the strings in elements 0x80 to
808 * 0xFF are allocated dynamically. */
809 struct conv_table utf_table
[256];
810 int utf_table_init
= 1;
817 /* Cast away const. */
818 for (i
= 128; i
< 256; i
++)
819 mem_free((unsigned char *) utf_table
[i
].u
.str
);
822 static struct conv_table
*
823 get_translation_table_to_utf8(int from
)
828 if (from
== -1) return NULL
;
829 from
&= ~SYSTEM_CHARSET_FLAG
;
830 if (from
== lfr
) return utf_table
;
833 memset(utf_table
, 0, sizeof(utf_table
)),
838 for (i
= 0; i
< 128; i
++)
839 utf_table
[i
].u
.str
= strings
[i
];
841 if (is_cp_ptr_utf8(&codepages
[from
])) {
842 for (i
= 128; i
< 256; i
++)
843 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
847 for (i
= 128; i
< 256; i
++) {
848 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
851 utf_table
[i
].u
.str
= NULL
;
853 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
856 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
857 unicode_val_T u
= codepages
[from
].table
[i
].u
;
859 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
860 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
861 stracpy(encode_utf8(u
));
864 for (i
= 128; i
< 256; i
++)
865 if (!utf_table
[i
].u
.str
)
866 utf_table
[i
].u
.str
= stracpy(no_str
);
871 /* A conversion table between two charsets, where the target is not UTF-8. */
872 static struct conv_table table
[256];
873 static int first
= 1;
876 free_conv_table(void)
878 if (!utf_table_init
) free_utf_table();
880 memset(table
, 0, sizeof(table
));
883 new_translation_table(table
);
888 get_translation_table(int from
, int to
)
893 from
&= ~SYSTEM_CHARSET_FLAG
;
894 to
&= ~SYSTEM_CHARSET_FLAG
;
896 memset(table
, 0, sizeof(table
));
899 if (/*from == to ||*/ from
== -1 || to
== -1)
901 if (is_cp_ptr_utf8(&codepages
[to
]))
902 return get_translation_table_to_utf8(from
);
903 if (from
== lfr
&& to
== lto
)
907 new_translation_table(table
);
909 if (is_cp_ptr_utf8(&codepages
[from
])) {
912 for (i
= 0x80; i
<= 0xFF; i
++)
913 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
915 codepages
[to
].highhalf
[i
- 0x80],
918 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
919 add_utf8(table
, codepages
[to
].table
[i
].u
,
920 strings
[codepages
[to
].table
[i
].c
]);
922 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
923 if (unicode_7b
[i
].x
>= 0x80)
924 add_utf8(table
, unicode_7b
[i
].x
,
930 for (i
= 128; i
< 256; i
++) {
931 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
932 const unsigned char *u
;
934 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
935 if (u
) table
[i
].u
.str
= u
;
944 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
947 if (*s1
> *s2
) return 1;
948 if (*s1
< *s2
) return -1;
957 /* Entity cache debugging purpose. */
959 #define DEBUG_ENTITY_CACHE
961 #undef DEBUG_ENTITY_CACHE
964 struct entity_cache
{
968 const unsigned char *result
;
969 unsigned char str
[20]; /* Suffice in any case. */
973 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
975 if (a
->hits
== b
->hits
) return 0;
976 if (a
->hits
> b
->hits
) return -1;
981 compare_entities(const void *key_
, const void *element_
)
983 struct string
*key
= (struct string
*) key_
;
984 struct entity
*element
= (struct entity
*) element_
;
985 int length
= key
->length
;
986 unsigned char *first
= key
->source
;
987 unsigned char *second
= element
->s
;
989 return xxstrcmp(first
, second
, length
);
992 const unsigned char *
993 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
995 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
996 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
997 will go in [0] table */
998 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
999 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
1000 static int first_time
= 1;
1001 unsigned int slen
= 0;
1002 const unsigned char *result
= NULL
;
1004 if (strlen
<= 0) return NULL
;
1007 /* TODO: caching UTF-8 */
1008 encoding
&= ~SYSTEM_CHARSET_FLAG
;
1009 if (is_cp_ptr_utf8(&codepages
[encoding
]))
1011 #endif /* CONFIG_UTF8 */
1014 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
1018 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1019 * + google + slashdot + websites that result from a search for test on google,
1020 * + various ones) shows a quite impressive improvement:
1022 * 0: hits=2459 l=4 st='nbsp'
1023 * 1: hits=2152 l=6 st='eacute'
1024 * 2: hits=235 l=6 st='egrave'
1025 * 3: hits=136 l=6 st='agrave'
1026 * 4: hits=100 l=3 st='amp'
1027 * 5: hits=40 l=5 st='laquo'
1028 * 6: hits=8 l=4 st='copy'
1029 * 7: hits=5 l=2 st='gt'
1030 * 8: hits=2 l=2 st='lt'
1031 * 9: hits=1 l=6 st='middot'
1033 * Most of the time cache hit ratio is near 95%.
1035 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1036 * count is kept < 2 (worst case 1.58). Not so bad ;)
1040 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1041 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1043 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1046 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1047 if (entity_cache
[slen
][i
].encoding
== encoding
1048 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1049 #ifdef DEBUG_ENTITY_CACHE
1050 static double total_iter
= 0;
1051 static unsigned long hit_count
= 0;
1053 total_iter
+= i
+ 1;
1055 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1057 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1058 entity_cache
[slen
][i
].hits
++;
1059 return entity_cache
[slen
][i
].result
;
1062 #ifdef DEBUG_ENTITY_CACHE
1063 fprintf(stderr
, "miss\n");
1068 #endif /* CONFIG_UTF8 */
1069 if (*str
== '#') { /* Numeric entity. */
1070 int l
= (int) strlen
;
1071 unsigned char *st
= (unsigned char *) str
;
1072 unicode_val_T n
= 0;
1074 if (l
== 1) goto end
; /* &#; ? */
1076 if ((*st
| 32) == 'x') { /* Hexadecimal */
1078 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1081 unsigned char c
= (*(st
++) | 32);
1084 n
= (n
<< 4) | (c
- '0');
1085 else if (isxdigit(c
))
1086 n
= (n
<< 4) | (c
- 'a' + 10);
1088 goto end
; /* Bad char. */
1090 } else { /* Decimal */
1091 if (l
> 10) goto end
; /* 4294967295 max. */
1093 unsigned char c
= *(st
++);
1096 n
= n
* 10 + c
- '0';
1098 goto end
; /* Bad char. */
1099 /* Limit to 0xFFFFFFFF. */
1100 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1105 result
= u2cp(n
, encoding
);
1107 #ifdef DEBUG_ENTITY_CACHE
1108 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1110 } else { /* Text entity. */
1111 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1112 struct entity
*element
= bsearch((void *) &key
, entities
,
1117 if (element
) result
= u2cp(element
->c
, encoding
);
1121 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1124 #endif /* CONFIG_UTF8 */
1126 /* Take care of potential buffer overflow. */
1127 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1128 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
1130 /* Copy new entry to cache. */
1132 ece
->strlen
= strlen
;
1133 ece
->encoding
= encoding
;
1134 ece
->result
= result
;
1135 memcpy(ece
->str
, str
, strlen
);
1136 ece
->str
[strlen
] = '\0';
1138 /* Increment number of cache entries if possible. */
1139 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1141 #ifdef DEBUG_ENTITY_CACHE
1142 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1143 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1147 /* Sort entries by hit order. */
1148 if (nb_entity_cache
[slen
] > 1)
1149 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1150 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
1152 #ifdef DEBUG_ENTITY_CACHE
1156 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1157 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1158 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1159 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1160 entity_cache
[slen
][i
].str
);
1161 fprintf(stderr
, "-----------------\n");
1169 convert_string(struct conv_table
*convert_table
,
1170 unsigned char *chars
, int charslen
, int cp
,
1171 enum convert_string_mode mode
, int *length
,
1172 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1173 void *callback_data
)
1175 unsigned char *buffer
;
1179 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1181 if (charslen
) callback(callback_data
, chars
, charslen
);
1184 return memacpy(chars
, charslen
);
1188 /* Buffer allocation */
1190 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1191 if (!buffer
) return NULL
;
1195 while (charspos
< charslen
) {
1196 const unsigned char *translit
;
1199 buffer[bufferpos++] = chars[charspos++]; \
1204 if (chars
[charspos
] != '&') {
1205 struct conv_table
*t
;
1208 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1213 while (t
[chars
[i
]].t
) {
1214 t
= t
[chars
[i
++]].u
.tbl
;
1215 if (i
>= charslen
) PUTC
;
1218 translit
= t
[chars
[i
]].u
.str
;
1221 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1225 int start
= charspos
+ 1;
1229 && (isasciialpha(chars
[i
])
1230 || isdigit(chars
[i
])
1231 || (chars
[i
] == '#')))
1234 /* This prevents bug 213: we were expanding "entities"
1235 * in URL query strings. */
1236 /* XXX: But this disables &nbsp&nbsp usage, which
1237 * appears to be relatively common! --pasky */
1238 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1240 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1241 translit
= get_entity_string(&chars
[start
], i
- start
,
1243 if (chars
[i
] != ';') {
1244 /* Eat &nbsp &nbsp<foo> happily, but
1245 * pull back from the character after
1246 * entity string if it is not the valid
1251 if (!translit
) PUTC
;
1252 charspos
= i
+ (i
< charslen
);
1256 if (!translit
[0]) continue;
1259 buffer
[bufferpos
++] = translit
[0];
1267 buffer
[bufferpos
++] = *(translit
++);
1269 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1272 buffer
[bufferpos
] = 0;
1273 callback(callback_data
, buffer
, bufferpos
);
1276 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1289 buffer
[bufferpos
] = 0;
1290 if (length
) *length
= bufferpos
;
1293 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1302 #ifndef USE_FASTFIND
1304 get_cp_index(unsigned char *name
)
1309 if (!strcasecmp(name
, "System")) {
1310 #if HAVE_LANGINFO_CODESET
1311 name
= nl_langinfo(CODESET
);
1312 syscp
= SYSTEM_CHARSET_FLAG
;
1318 for (i
= 0; codepages
[i
].name
; i
++) {
1319 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1320 /* In the past, we looked for the longest substring
1321 * in all the names; it is way too expensive, though:
1323 * % cumulative self self total
1324 * time seconds seconds calls us/call us/call name
1325 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1327 * Anything called from redraw_screen() is in fact
1328 * relatively expensive, even if it's called just
1329 * once. So we will do a simple strcasecmp() here.
1332 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1338 return get_cp_index("us-ascii") | syscp
;
1346 static unsigned int i_name
= 0;
1347 static unsigned int i_alias
= 0;
1349 /* Reset internal list pointer */
1351 charsets_list_reset(void)
1357 /* Returns a pointer to a struct that contains current key and data pointers
1358 * and increments the internal pointer. It returns NULL when key is NULL. */
1359 struct fastfind_key_value
*
1360 charsets_list_next(void)
1362 static struct fastfind_key_value kv
;
1364 if (!codepages
[i_name
].name
) return NULL
;
1366 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1367 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
1369 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1379 static struct fastfind_index ff_charsets_index
1380 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1382 /* It searches for a charset named @name or one of its aliases and
1383 * returns index for it or -1 if not found. */
1385 get_cp_index(unsigned char *name
)
1387 const struct codepage_desc
*codepage
;
1390 if (!strcasecmp(name
, "System")) {
1391 #if HAVE_LANGINFO_CODESET
1392 name
= nl_langinfo(CODESET
);
1393 syscp
= SYSTEM_CHARSET_FLAG
;
1399 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1401 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1402 return (codepage
- codepages
) | syscp
;
1405 return get_cp_index("us-ascii") | syscp
;
1412 #endif /* USE_FASTFIND */
1415 init_charsets_lookup(void)
1418 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1423 free_charsets_lookup(void)
1426 fastfind_done(&ff_charsets_index
);
1431 get_cp_name(int cp_index
)
1433 if (cp_index
< 0) return "none";
1434 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1436 return codepages
[cp_index
].name
;
1440 get_cp_mime_name(int cp_index
)
1442 if (cp_index
< 0) return "none";
1443 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1444 if (!codepages
[cp_index
].aliases
) return NULL
;
1446 return codepages
[cp_index
].aliases
[0];
1450 is_cp_utf8(int cp_index
)
1452 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1453 return is_cp_ptr_utf8(&codepages
[cp_index
]);