1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
43 /* This should in principle be unicode_val_T, but because all
44 * the values currently in codepage.inc fit in 16 bits, we can
45 * as well use uint16_t and halve sizeof(struct table_entry)
46 * from 8 bytes to 4. Should other characters ever be needed,
47 * unicode_val_T u : 24 might be a possibility, although it
48 * seems a little unportable as bitfields are in principle
49 * restricted to int, which may be 16-bit. */
53 struct codepage_desc
{
55 unsigned char *const *aliases
;
57 /* The Unicode mappings of codepage bytes 0x80...0xFF.
58 * (0x00...0x7F are assumed to be ASCII in all codepages.)
59 * Because all current values fit in 16 bits, we store them as
60 * uint16_t rather than unicode_val_T. If the codepage does
61 * not use some byte, then @highhalf maps that byte to 0xFFFF,
62 * which C code converts to UCS_REPLACEMENT_CHARACTER where
63 * appropriate. (U+FFFF is reserved and will never be
64 * assigned as a character.) */
65 const uint16_t *highhalf
;
67 /* If some byte in the codepage corresponds to multiple Unicode
68 * characters, then the preferred character is in @highhalf
69 * above, and the rest are listed here in @table. This table
70 * is not used for translating from the codepage to Unicode. */
71 const struct table_entry
*table
;
73 /* Whether to use iconv for translation */
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE
unsigned char *encode_utf8(unicode_val_T u
);
90 NONSTATIC_INLINE
int utf8charlen(const unsigned char *p
);
91 NONSTATIC_INLINE
int unicode_to_cell(unicode_val_T c
);
92 NONSTATIC_INLINE unicode_val_T
utf8_to_unicode(unsigned char **string
,
93 const unsigned char *end
);
95 static const char strings
[256][2] = {
96 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
97 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
98 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
99 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
100 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
101 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
102 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
103 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
104 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
105 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
106 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
107 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
108 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
109 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
110 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
111 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
112 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
113 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
114 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
115 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
116 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
117 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
118 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
119 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
120 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
121 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
122 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
123 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
124 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
125 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
126 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
127 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
131 static iconv_t iconv_cd
= (iconv_t
)-1;
135 free_translation_table(struct conv_table
*p
)
139 for (i
= 0; i
< 256; i
++)
141 free_translation_table(p
[i
].u
.tbl
);
146 /* A string used in conversion tables when there is no correct
147 * conversion. This is compared by address and therefore should be a
148 * named array rather than a pointer so that it won't share storage
149 * with any other string literal that happens to have the same
151 static const unsigned char no_str
[] = "*";
154 new_translation_table(struct conv_table
*p
)
158 for (i
= 0; i
< 256; i
++)
160 free_translation_table(p
[i
].u
.tbl
);
161 for (i
= 0; i
< 128; i
++) {
163 p
[i
].u
.str
= strings
[i
];
165 for (; i
< 256; i
++) {
172 #define BIN_SEARCH(table, entry, entries, key, result) \
174 long _s = 0, _e = (entries) - 1; \
176 while (_s <= _e || !((result) = -1)) { \
177 long _m = (_s + _e) / 2; \
179 if ((table)[_m].entry == (key)) { \
183 if ((table)[_m].entry > (key)) _e = _m - 1; \
184 if ((table)[_m].entry < (key)) _s = _m + 1; \
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that is cleared from a codepage index before the index is used
 * to look up an entry in codepages[]. */
#define SYSTEM_CHARSET_FLAG 0x80

/* True if the codepage descriptor @cp_ptr is the UTF-8 one, which is
 * recognised by its alias list being aliases_utf8. */
#define is_cp_ptr_utf8(cp_ptr) (aliases_utf8 == (cp_ptr)->aliases)
198 const unsigned char *
199 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
204 if (u
< 128) return strings
[u
];
207 u
= strange_chars
[u
- 0x80];
211 to
&= ~SYSTEM_CHARSET_FLAG
;
213 if (is_cp_ptr_utf8(&codepages
[to
]))
214 return encode_utf8(u
);
216 /* To mark non breaking spaces in non-UTF-8 strings, we use a
217 * special char NBSP_CHAR. */
218 if (u
== UCS_NO_BREAK_SPACE
) {
219 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
220 else /* NBSP_MODE_ASCII */ return " ";
222 if (u
== UCS_SOFT_HYPHEN
) return "";
225 for (j
= 0; j
< 0x80; j
++)
226 if (codepages
[to
].highhalf
[j
] == u
)
227 return strings
[0x80 + j
];
228 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
229 if (codepages
[to
].table
[j
].u
== u
)
230 return strings
[codepages
[to
].table
[j
].c
];
232 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
233 if (s
!= -1) return unicode_7b
[s
].s
;
238 static unsigned char utf_buffer
[7];
240 NONSTATIC_INLINE
unsigned char *
241 encode_utf8(unicode_val_T u
)
243 memset(utf_buffer
, 0, 7);
248 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
249 utf_buffer
[1] = 0x80 | (u
& 0x3f);
250 else if (u
< 0x10000)
251 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
252 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
253 utf_buffer
[2] = 0x80 | (u
& 0x3f);
254 else if (u
< 0x200000)
255 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
256 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
257 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
258 utf_buffer
[3] = 0x80 | (u
& 0x3f);
259 else if (u
< 0x4000000)
260 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
261 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
262 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
263 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
264 utf_buffer
[4] = 0x80 | (u
& 0x3f);
265 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
266 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
267 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
268 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
269 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
270 utf_buffer
[5] = 0x80 | (u
& 0x3f);
275 /* Length in bytes of a UTF-8 character, indexed by its first byte. Illegal
276 * first bytes have length 1 here and are handled separately. */
277 static const char utf8char_len_tab
[256] = {
278 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
279 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
280 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
281 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
282 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
283 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
284 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
285 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
290 utf8charlen(const unsigned char *p
)
292 return p
? utf8char_len_tab
[*p
] : 0;
296 strlen_utf8(unsigned char **str
)
298 unsigned char *s
= *str
;
299 unsigned char *end
= strchr((const char *)s
, '\0');
303 for (x
= 0;; x
++, s
+= len
) {
304 len
= utf8charlen(s
);
305 if (s
+ len
> end
) break;
/* Classify one byte of UTF-8 text.  Both macros evaluate @p exactly
 * once, so an argument with side effects (e.g. *s++) is safe.
 *
 * utf8_issingle(p): true if @p has its top bit clear, i.e. it is a
 * complete one-byte character.
 * utf8_islead(p): true if @p can start a character: either a
 * single-byte character or a byte whose top two bits are both set.
 * Equivalently, anything that is not a 10xxxxxx continuation byte. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
314 /* Starting from @current, move back by @pos characters and return the
315 * resulting pointer. The leftmost pointer allowed is @start. */
317 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
319 if (current
== NULL
|| start
== NULL
|| pos
< 0)
321 while (pos
> 0 && current
!= start
) {
323 if (utf8_islead(*current
))
329 /* Count number of standard terminal cells needed for displaying UTF-8
332 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
337 end
= strchr((const char *)utf8_char
, '\0');
339 if(!utf8_char
|| !end
)
342 u
= utf8_to_unicode(&utf8_char
, end
);
344 return unicode_to_cell(u
);
347 /* Count number of standard terminal cells needed for displaying string
348 * with UTF-8 characters. */
350 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
352 int charlen
, cell
, cells
= 0;
355 end
= strchr((const char *)string
, '\0');
361 charlen
= utf8charlen(string
);
362 if (string
+ charlen
> end
)
365 cell
= utf8_char2cells(string
, end
);
376 /* Count number of characters in string. */
378 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
380 int charlen
, chars
= 0;
383 end
= strchr((const char *)string
, '\0');
389 charlen
= utf8charlen(string
);
390 if (string
+ charlen
> end
)
401 * Count number of bytes from beginning of the string needed for displaying
402 * specified number of cells.
405 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
407 unsigned int bytes
= 0, cells
= 0;
409 assert(max_cells
>=0);
412 end
= strchr((const char *)string
, '\0');
418 int cell
= utf8_char2cells(&string
[bytes
], end
);
423 if (cells
> max_cells
)
426 bytes
+= utf8charlen(&string
[bytes
]);
428 if (string
+ bytes
> end
) {
429 bytes
= end
- string
;
437 /* Take @max steps forward from @string in the specified @way, but
438 * not going past @end. Return the resulting address. Store the
439 * number of steps taken to *@count, unless @count is NULL.
441 * This assumes the text is valid UTF-8, and @string and @end point to
442 * character boundaries. If not, it doesn't crash but the results may
445 * This function can do some of the same jobs as utf8charlen(),
446 * utf8_cells2bytes(), and strlen_utf8(). */
448 utf8_step_forward(unsigned char *string
, unsigned char *end
,
449 int max
, enum utf8_step way
, int *count
)
452 unsigned char *current
= string
;
456 if_assert_failed
goto invalid_arg
;
458 end
= strchr((const char *)string
, '\0');
461 case UTF8_STEP_CHARACTERS
:
462 while (steps
< max
&& current
< end
) {
464 if (utf8_islead(*current
))
469 case UTF8_STEP_CELLS_FEWER
:
470 case UTF8_STEP_CELLS_MORE
:
471 while (steps
< max
&& current
< end
) {
473 unsigned char *prev
= current
;
476 u
= utf8_to_unicode(¤t
, end
);
477 if (u
== UCS_NO_CHAR
) {
478 /* Assume the incomplete sequence
485 width
= unicode_to_cell(u
);
486 if (way
== UTF8_STEP_CELLS_FEWER
487 && steps
+ width
> max
) {
497 INTERNAL("impossible enum utf8_step");
506 /* Take @max steps backward from @string in the specified @way, but
507 * not going past @start. Return the resulting address. Store the
508 * number of steps taken to *@count, unless @count is NULL.
510 * This assumes the text is valid UTF-8, and @string and @start point
511 * to character boundaries. If not, it doesn't crash but the results
512 * may be inconsistent.
514 * This function can do some of the same jobs as utf8_prevchar(). */
516 utf8_step_backward(unsigned char *string
, unsigned char *start
,
517 int max
, enum utf8_step way
, int *count
)
520 unsigned char *current
= string
;
525 if_assert_failed
goto invalid_arg
;
528 case UTF8_STEP_CHARACTERS
:
529 while (steps
< max
&& current
> start
) {
531 if (utf8_islead(*current
))
536 case UTF8_STEP_CELLS_FEWER
:
537 case UTF8_STEP_CELLS_MORE
:
538 while (steps
< max
) {
539 unsigned char *prev
= current
;
544 if (current
<= start
)
548 } while (current
> start
&& !utf8_islead(*current
));
551 u
= utf8_to_unicode(&look
, prev
);
552 if (u
== UCS_NO_CHAR
) {
553 /* Assume the incomplete sequence
557 width
= unicode_to_cell(u
);
559 if (way
== UTF8_STEP_CELLS_FEWER
560 && steps
+ width
> max
) {
570 INTERNAL("impossible enum utf8_step");
580 * Find out number of standard terminal columns needed for displaying symbol
581 * (glyph) which represents Unicode character c.
583 * TODO: Use wcwidth when it is available. This seems to require:
584 * - Make the configure script check whether <wchar.h> and wcwidth exist.
585 * - Define _XOPEN_SOURCE and include <wchar.h>.
586 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
587 * matches ISO 10646 in all locales.)
588 * However, these do not suffice, because wcwidth depends on LC_CTYPE
589 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
590 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
591 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
592 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
593 * character is apparently not supported in all locales. Why is that?
594 * - Perhaps there is standardese that requires supported characters
595 * to be convertible to multibyte form. Then ELinks could just pick
596 * some UTF-8 locale for its wcwidth purposes.
597 * - Perhaps wcwidth can even return different nonnegative values for
598 * the same ISO 10646 character in different locales. Then ELinks
599 * would have to set LC_CTYPE to match at least the terminal's
600 * charset (which may differ from the LC_CTYPE environment variable,
601 * especially when the master process is serving a slave terminal).
602 * But there is no guarantee that the libc supports all the same
603 * charsets as ELinks does.
604 * For now, it seems safest to avoid the potentially locale-dependent
605 * libc version of wcwidth, and instead use a hardcoded mapping.
607 * @return 2 for double-width glyph, 1 for others.
608 * 0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
611 unicode_to_cell(unicode_val_T c
)
613 if (c
== 0x200e || c
== 0x200f)
616 && (c
<= 0x115f /* Hangul Jamo */
619 || (c
>= 0x2e80 && c
<= 0xa4cf
620 && c
!= 0x303f) /* CJK ... Yi */
621 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
622 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
624 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
625 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
626 || (c
>= 0xffe0 && c
<= 0xffe6)
627 || (c
>= 0x20000 && c
<= 0x2fffd)
628 || (c
>= 0x30000 && c
<= 0x3fffd)))
634 /* Fold the case of a Unicode character, so that hotkeys in labels can
635 * be compared case-insensitively. It is unspecified whether the
636 * result will be in upper or lower case. */
638 unicode_fold_label_case(unicode_val_T c
)
640 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
642 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
643 /* For now, this supports only ASCII. It would be possible to
644 * use code generated from CaseFolding.txt of Unicode if the
645 * acknowledgements required by http://www.unicode.org/copyright.html
646 * were added to associated documentation of ELinks. */
647 if (c
>= 0x41 && c
<= 0x5A)
651 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
653 #endif /* CONFIG_UTF8 */
655 NONSTATIC_INLINE unicode_val_T
656 utf8_to_unicode(unsigned char **string
, const unsigned char *end
)
658 unsigned char *str
= *string
;
662 length
= utf8char_len_tab
[str
[0]];
664 if (str
+ length
> end
) {
669 case 1: /* U+0000 to U+007F */
670 if (str
[0] >= 0x80) {
673 return UCS_REPLACEMENT_CHARACTER
;
677 case 2: /* U+0080 to U+07FF */
678 if ((str
[1] & 0xc0) != 0x80)
680 u
= (str
[0] & 0x1f) << 6;
681 u
+= (str
[1] & 0x3f);
685 case 3: /* U+0800 to U+FFFF, except surrogates */
686 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80)
688 u
= (str
[0] & 0x0f) << 12;
689 u
+= ((str
[1] & 0x3f) << 6);
690 u
+= (str
[2] & 0x3f);
691 if (u
< 0x800 || is_utf16_surrogate(u
))
694 case 4: /* U+10000 to U+1FFFFF */
695 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
696 || (str
[3] & 0xc0) != 0x80)
698 u
= (str
[0] & 0x0f) << 18;
699 u
+= ((str
[1] & 0x3f) << 12);
700 u
+= ((str
[2] & 0x3f) << 6);
701 u
+= (str
[3] & 0x3f);
705 case 5: /* U+200000 to U+3FFFFFF */
706 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
707 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80)
709 u
= (str
[0] & 0x0f) << 24;
710 u
+= ((str
[1] & 0x3f) << 18);
711 u
+= ((str
[2] & 0x3f) << 12);
712 u
+= ((str
[3] & 0x3f) << 6);
713 u
+= (str
[4] & 0x3f);
717 case 6: /* U+4000000 to U+7FFFFFFF */
718 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
719 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80
720 || (str
[5] & 0xc0) != 0x80)
722 u
= (str
[0] & 0x01) << 30;
723 u
+= ((str
[1] & 0x3f) << 24);
724 u
+= ((str
[2] & 0x3f) << 18);
725 u
+= ((str
[3] & 0x3f) << 12);
726 u
+= ((str
[4] & 0x3f) << 6);
727 u
+= (str
[5] & 0x3f);
732 INTERNAL("utf8char_len_tab out of range");
735 *string
= str
+ length
;
739 /* The common part of cp2u and cp2utf_8. */
741 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
743 unicode_val_T u
= from
->highhalf
[c
- 0x80];
745 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
749 /* Used for converting input from the terminal. */
751 cp2u(int from
, unsigned char c
)
753 from
&= ~SYSTEM_CHARSET_FLAG
;
755 /* UTF-8 is a multibyte codepage and cannot be handled with
757 assert(!is_cp_ptr_utf8(&codepages
[from
]));
758 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
760 if (c
< 0x80) return c
;
761 else return cp2u_shared(&codepages
[from
], c
);
764 /* This slow and ugly code is used by the terminal utf_8_io */
765 const unsigned char *
766 cp2utf8(int from
, int c
)
768 from
&= ~SYSTEM_CHARSET_FLAG
;
770 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
773 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
777 cp_to_unicode(int codepage
, unsigned char **string
, const unsigned char *end
)
781 if (is_cp_utf8(codepage
))
782 return utf8_to_unicode(string
, end
);
787 ret
= cp2u(codepage
, **string
);
793 #ifdef CONFIG_COMBINE
794 unicode_val_T last_combined
= UCS_BEGIN_COMBINED
- 1;
795 unicode_val_T
**combined
;
796 struct hash
*combined_hash
;
799 get_combined(unicode_val_T
*data
, int length
)
801 struct hash_item
*item
;
805 assert(length
>= 1 && length
<= UCS_MAX_LENGTH_COMBINED
);
806 if_assert_failed
return UCS_NO_CHAR
;
808 if (!combined_hash
) combined_hash
= init_hash8();
809 if (!combined_hash
) return UCS_NO_CHAR
;
810 item
= get_hash_item(combined_hash
, (unsigned char *)data
, length
* sizeof(*data
));
812 if (item
) return (unicode_val_T
)(long)item
->value
;
813 if (last_combined
>= UCS_END_COMBINED
) return UCS_NO_CHAR
;
815 key
= mem_alloc((length
+ 1) * sizeof(*key
));
816 if (!key
) return UCS_NO_CHAR
;
817 for (i
= 0; i
< length
; i
++)
819 key
[i
] = UCS_END_COMBINED
;
822 indeks
= last_combined
- UCS_BEGIN_COMBINED
;
824 combined
= mem_realloc(combined
, sizeof(*combined
) * (indeks
+ 1));
830 combined
[indeks
] = key
;
831 item
= add_hash_item(combined_hash
, (unsigned char *)key
,
832 length
* sizeof(*data
), (void *)(long)(last_combined
));
838 return last_combined
;
844 int i
, end
= last_combined
- UCS_BEGIN_COMBINED
+ 1;
847 free_hash(&combined_hash
);
848 for (i
= 0; i
< end
; i
++)
849 mem_free(combined
[i
]);
850 mem_free_if(combined
);
852 #endif /* CONFIG_COMBINE */
856 add_utf8(struct conv_table
*ct
, unicode_val_T u
, const unsigned char *str
)
858 unsigned char *p
= encode_utf8(u
);
861 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
863 struct conv_table
*nct
;
865 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
866 if_assert_failed
return;
868 nct
= mem_calloc(256, sizeof(*nct
));
870 new_translation_table(nct
);
878 assertm(!ct
[*p
].t
, "bad utf encoding #2");
879 if_assert_failed
return;
881 if (ct
[*p
].u
.str
== no_str
)
885 /* A conversion table from some charset to UTF-8.
886 * If it is from UTF-8 to UTF-8, it converts each byte separately.
887 * Unlike in other translation tables, the strings in elements 0x80 to
888 * 0xFF are allocated dynamically. */
889 struct conv_table utf_table
[256];
/* Nonzero while utf_table still needs to be initialised:
 * get_translation_table_to_utf8() zero-fills the table when it sees the
 * flag set, and free_conv_table() calls free_utf_table() only when it
 * is clear. */
890 int utf_table_init
= 1;
897 /* Cast away const. */
898 for (i
= 128; i
< 256; i
++)
899 mem_free((unsigned char *) utf_table
[i
].u
.str
);
902 static struct conv_table
*
903 get_translation_table_to_utf8(int from
)
908 if (from
== -1) return NULL
;
909 from
&= ~SYSTEM_CHARSET_FLAG
;
910 if (from
== lfr
) return utf_table
;
912 if (utf_table_init
) {
913 memset(utf_table
, 0, sizeof(utf_table
));
918 for (i
= 0; i
< 128; i
++)
919 utf_table
[i
].u
.str
= strings
[i
];
921 if (is_cp_ptr_utf8(&codepages
[from
])) {
922 for (i
= 128; i
< 256; i
++)
923 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
927 for (i
= 128; i
< 256; i
++) {
928 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
931 utf_table
[i
].u
.str
= NULL
;
933 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
936 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
937 unicode_val_T u
= codepages
[from
].table
[i
].u
;
939 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
940 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
941 stracpy(encode_utf8(u
));
944 for (i
= 128; i
< 256; i
++)
945 if (!utf_table
[i
].u
.str
)
946 utf_table
[i
].u
.str
= stracpy(no_str
);
951 /* A conversion table between two charsets, where the target is not UTF-8. */
952 static struct conv_table table
[256];
953 static int first
= 1;
956 free_conv_table(void)
958 if (!utf_table_init
) free_utf_table();
960 memset(table
, 0, sizeof(table
));
963 new_translation_table(table
);
965 if (iconv_cd
!= (iconv_t
)-1) {
966 iconv_close(iconv_cd
);
967 iconv_cd
= (iconv_t
)-1;
974 get_translation_table(int from
, int to
)
979 from
&= ~SYSTEM_CHARSET_FLAG
;
980 to
&= ~SYSTEM_CHARSET_FLAG
;
982 memset(table
, 0, sizeof(table
));
986 if (codepages
[from
].iconv
) {
987 struct conv_table
*table2
= get_translation_table_to_utf8(34);
989 if (table2
) table2
->iconv_cp
= from
;
993 if (/*from == to ||*/ from
== -1 || to
== -1)
995 if (is_cp_ptr_utf8(&codepages
[to
])) {
996 struct conv_table
*table2
= get_translation_table_to_utf8(from
);
998 if (table2
) table2
->iconv_cp
= -1;
1001 if (from
== lfr
&& to
== lto
)
1005 new_translation_table(table
);
1007 if (is_cp_ptr_utf8(&codepages
[from
])) {
1010 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1011 add_utf8(table
, UCS_NO_BREAK_SPACE
, strings
[NBSP_CHAR
]);
1012 add_utf8(table
, UCS_SOFT_HYPHEN
, "");
1014 for (i
= 0x80; i
<= 0xFF; i
++)
1015 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
1017 codepages
[to
].highhalf
[i
- 0x80],
1020 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
1021 add_utf8(table
, codepages
[to
].table
[i
].u
,
1022 strings
[codepages
[to
].table
[i
].c
]);
1024 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
1025 if (unicode_7b
[i
].x
>= 0x80)
1026 add_utf8(table
, unicode_7b
[i
].x
,
1032 for (i
= 128; i
< 256; i
++) {
1033 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
1034 const unsigned char *u
;
1036 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
1037 if (u
) table
[i
].u
.str
= u
;
1046 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
1049 if (*s1
> *s2
) return 1;
1050 if (*s1
< *s2
) return -1;
1056 return *s2
? -1 : 0;
1059 /* For entity cache debugging purposes. */
1061 #define DEBUG_ENTITY_CACHE
1063 #undef DEBUG_ENTITY_CACHE
1066 struct entity_cache
{
1070 const unsigned char *result
;
1071 unsigned char str
[20]; /* Suffice in any case. */
1074 /* comparison function for qsort() */
1076 hits_cmp(const void *v1
, const void *v2
)
1078 const struct entity_cache
*a
= v1
, *b
= v2
;
1080 if (a
->hits
== b
->hits
) return 0;
1081 if (a
->hits
> b
->hits
) return -1;
1086 compare_entities(const void *key_
, const void *element_
)
1088 struct string
*key
= (struct string
*) key_
;
1089 struct entity
*element
= (struct entity
*) element_
;
1090 int length
= key
->length
;
1091 unsigned char *first
= key
->source
;
1092 unsigned char *second
= element
->s
;
1094 return xxstrcmp(first
, second
, length
);
1097 const unsigned char *
1098 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
1100 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1101 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1102 will go in [0] table */
1103 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
1104 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
1105 unsigned int slen
= 0;
1106 const unsigned char *result
= NULL
;
1108 /* Note that an object of static storage duration is automatically
1109 * initialised to zero in C. */
1111 if (strlen
<= 0) return NULL
;
1114 /* TODO: caching UTF-8 */
1115 encoding
&= ~SYSTEM_CHARSET_FLAG
;
1116 if (is_cp_ptr_utf8(&codepages
[encoding
]))
1118 #endif /* CONFIG_UTF8 */
1120 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1121 * + google + slashdot + websites that result from a search for test on google,
1122 * + various ones) shows quite impressive improvement:
1124 * 0: hits=2459 l=4 st='nbsp'
1125 * 1: hits=2152 l=6 st='eacute'
1126 * 2: hits=235 l=6 st='egrave'
1127 * 3: hits=136 l=6 st='agrave'
1128 * 4: hits=100 l=3 st='amp'
1129 * 5: hits=40 l=5 st='laquo'
1130 * 6: hits=8 l=4 st='copy'
1131 * 7: hits=5 l=2 st='gt'
1132 * 8: hits=2 l=2 st='lt'
1133 * 9: hits=1 l=6 st='middot'
1135 * Most of the time cache hit ratio is near 95%.
1137 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1138 * count is kept < 2 (worst case 1.58). Not so bad ;)
1142 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1143 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1145 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1148 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1149 if (entity_cache
[slen
][i
].encoding
== encoding
1150 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1151 #ifdef DEBUG_ENTITY_CACHE
1152 static double total_iter
= 0;
1153 static unsigned long hit_count
= 0;
1155 total_iter
+= i
+ 1;
1157 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1159 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1160 entity_cache
[slen
][i
].hits
++;
1161 return entity_cache
[slen
][i
].result
;
1164 #ifdef DEBUG_ENTITY_CACHE
1165 fprintf(stderr
, "miss\n");
1170 #endif /* CONFIG_UTF8 */
1171 if (*str
== '#') { /* Numeric entity. */
1172 int l
= (int) strlen
;
1173 unsigned char *st
= (unsigned char *) str
;
1174 unicode_val_T n
= 0;
1176 if (l
== 1) goto end
; /* &#; ? */
1178 if ((*st
| 32) == 'x') { /* Hexadecimal */
1180 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1183 unsigned char c
= (*(st
++) | 32);
1186 n
= (n
<< 4) | (c
- '0');
1187 else if (isxdigit(c
))
1188 n
= (n
<< 4) | (c
- 'a' + 10);
1190 goto end
; /* Bad char. */
1192 } else { /* Decimal */
1193 if (l
> 10) goto end
; /* 4294967295 max. */
1195 unsigned char c
= *(st
++);
1198 n
= n
* 10 + c
- '0';
1200 goto end
; /* Bad char. */
1201 /* Limit to 0xFFFFFFFF. */
1202 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1207 result
= u2cp(n
, encoding
);
1209 #ifdef DEBUG_ENTITY_CACHE
1210 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1212 } else { /* Text entity. */
1213 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1214 struct entity
*element
= bsearch((void *) &key
, entities
,
1219 if (element
) result
= u2cp(element
->c
, encoding
);
1223 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1226 #endif /* CONFIG_UTF8 */
1228 /* Take care of potential buffer overflow. */
1229 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1230 struct entity_cache
*ece
;
1232 /* Sort entries by hit order. */
1233 if (nb_entity_cache
[slen
] > 1)
1234 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1235 sizeof(entity_cache
[slen
][0]), hits_cmp
);
1237 /* Increment number of cache entries if possible.
1238 * Else, just replace the least used entry. */
1239 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1240 ece
= &entity_cache
[slen
][nb_entity_cache
[slen
] - 1];
1242 /* Copy new entry to cache. */
1244 ece
->strlen
= strlen
;
1245 ece
->encoding
= encoding
;
1246 ece
->result
= result
;
1247 memcpy(ece
->str
, str
, strlen
);
1248 ece
->str
[strlen
] = '\0';
1251 #ifdef DEBUG_ENTITY_CACHE
1252 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1253 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1258 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1259 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1260 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1261 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1262 entity_cache
[slen
][i
].str
);
1263 fprintf(stderr
, "-----------------\n");
1265 #endif /* DEBUG_ENTITY_CACHE */
1271 convert_string(struct conv_table
*convert_table
,
1272 unsigned char *chars2
, int charslen2
, int cp
,
1273 enum convert_string_mode mode
, int *length
,
1274 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1275 void *callback_data
)
1277 unsigned char *buffer
;
1280 unsigned char *chars
= chars2
;
1281 int charslen
= charslen2
;
1284 static char iconv_input
[256];
1285 static char iconv_output
[256 * 8];
1286 static size_t iconv_offset
;
1287 static int iconv_cp
;
1288 static size_t iconv_inleft
;
1289 size_t iconv_outleft
= 256 * 8;
1292 int chars_offset
= 0;
1294 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1296 if (charslen
) callback(callback_data
, chars
, charslen
);
1299 return memacpy(chars
, charslen
);
1304 if (convert_table
&& convert_table
->iconv_cp
> 0) {
1306 cp
= convert_table
->iconv_cp
;
1308 is_iconv
= codepages
[cp
& ~SYSTEM_CHARSET_FLAG
].iconv
;
1313 /* Buffer allocation */
1315 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1316 if (!buffer
) return NULL
;
1321 size_t before
, to_copy
;
1324 if (iconv_cd
>= 0) {
1325 if (cp
!= iconv_cp
) {
1326 iconv_close(iconv_cd
);
1327 iconv_cd
= (iconv_t
)-1;
1330 if (iconv_cd
== (iconv_t
)-1) {
1332 iconv_cd
= iconv_open("utf-8", get_cp_mime_name(cp
));
1333 if (iconv_cd
== (iconv_t
)-1) {
1340 to_copy
= charslen2
- chars_offset
;
1341 if (to_copy
> 256 - iconv_offset
) to_copy
= 256 - iconv_offset
;
1342 memcpy(iconv_input
+ iconv_offset
, chars2
+ chars_offset
, to_copy
);
1343 iconv_outleft
= 256 * 8;
1344 iconv_inleft
= iconv_offset
+ to_copy
;
1346 outp
= iconv_output
;
1347 before
= iconv_inleft
;
1349 v
= iconv(iconv_cd
, &inp
, &iconv_inleft
, &outp
, &iconv_outleft
);
1350 chars_offset
+= before
- iconv_inleft
;
1351 charslen
= 256 * 8 - iconv_outleft
;
1353 chars
= (unsigned char *)iconv_output
;
1359 memcpy(iconv_input
, inp
, iconv_inleft
);
1360 iconv_offset
= iconv_inleft
;
1373 loop
= chars_offset
< charslen2
;
1379 while (charspos
< charslen
) {
1380 const unsigned char *translit
;
1383 buffer[bufferpos++] = chars[charspos++]; \
1388 if (chars
[charspos
] != '&') {
1389 struct conv_table
*t
;
1392 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1397 while (t
[chars
[i
]].t
) {
1398 t
= t
[chars
[i
++]].u
.tbl
;
1399 if (i
>= charslen
) PUTC
;
1402 translit
= t
[chars
[i
]].u
.str
;
1405 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1409 int start
= charspos
+ 1;
1413 && (isasciialpha(chars
[i
])
1414 || isdigit(chars
[i
])
1415 || (chars
[i
] == '#')))
1418 /* This prevents bug 213: we were expanding "entities"
1419 * in URL query strings. */
1420 /* XXX: But this disables &nbsp&nbsp usage, which
1421 * appears to be relatively common! --pasky */
1422 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1424 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1425 translit
= get_entity_string(&chars
[start
], i
- start
,
1427 if (chars
[i
] != ';') {
1428 /* Eat &nbsp &nbsp<foo> happily, but
1429 * pull back from the character after
1430 * entity string if it is not the valid terminator. */
1435 if (!translit
) PUTC
;
1436 charspos
= i
+ (i
< charslen
);
1440 if (!translit
[0]) continue;
1443 buffer
[bufferpos
++] = translit
[0];
1449 unsigned char *new_
;
1451 buffer
[bufferpos
++] = *(translit
++);
1453 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1456 buffer
[bufferpos
] = 0;
1457 callback(callback_data
, buffer
, bufferpos
);
1460 new_
= mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1472 if (loop
) goto repeat
;
1476 buffer
[bufferpos
] = 0;
1477 if (length
) *length
= bufferpos
;
1480 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1489 #ifndef USE_FASTFIND
1491 get_cp_index(const unsigned char *name
)
1496 if (!c_strcasecmp(name
, "System")) {
1497 #if HAVE_LANGINFO_CODESET
1498 name
= nl_langinfo(CODESET
);
1499 syscp
= SYSTEM_CHARSET_FLAG
;
1505 for (i
= 0; codepages
[i
].name
; i
++) {
1506 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1507 /* In the past, we looked for the longest substring
1508 * in all the names; it is way too expensive, though:
1510 * % cumulative self self total
1511 * time seconds seconds calls us/call us/call name
1512 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1514 * Anything called from redraw_screen() is in fact
1515 * relatively expensive, even if it's called just
1516 * once. So we will do a simple c_strcasecmp() here. */
1519 if (!c_strcasecmp(name
, codepages
[i
].aliases
[a
]))
1525 return get_cp_index("us-ascii") | syscp
;
1533 static unsigned int i_name
= 0;
1534 static unsigned int i_alias
= 0;
1536 /* Reset internal list pointer */
1538 charsets_list_reset(void)
1544 /* Returns a pointer to a struct that contains the current key and data pointers
1545 * and increments the internal pointer. It returns NULL when the key is NULL. */
1546 struct fastfind_key_value
*
1547 charsets_list_next(void)
1549 static struct fastfind_key_value kv
;
1551 if (!codepages
[i_name
].name
) return NULL
;
1553 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1554 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
1556 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1566 static struct fastfind_index ff_charsets_index
1567 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1569 /* Searches for a charset named @name, or having @name as one of its aliases,
1570 * and returns its index, or -1 if not found. */
1572 get_cp_index(const unsigned char *name
)
1574 const struct codepage_desc
*codepage
;
1577 if (!c_strcasecmp(name
, "System")) {
1578 #if HAVE_LANGINFO_CODESET
1579 name
= nl_langinfo(CODESET
);
1580 syscp
= SYSTEM_CHARSET_FLAG
;
1586 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1588 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1589 return (codepage
- codepages
) | syscp
;
1592 return get_cp_index("us-ascii") | syscp
;
1599 #endif /* USE_FASTFIND */
1602 init_charsets_lookup(void)
1605 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1610 free_charsets_lookup(void)
1613 fastfind_done(&ff_charsets_index
);
1617 /* Get the codepage's name for displaying to the user, or NULL if
1618 * @cp_index is one past the end. In the future, we might want to
1619 * localize these with gettext. So it may be best not to use this
1620 * function if the name will have to be converted back to an
1623 get_cp_name(int cp_index
)
1625 if (cp_index
< 0) return "none";
1626 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1628 return codepages
[cp_index
].name
;
1631 /* Get the codepage's name for saving to a configuration file. These
1632 * names can be converted back to indexes, even in future versions of
1635 get_cp_config_name(int cp_index
)
1637 if (cp_index
< 0) return "none";
1638 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1639 if (!codepages
[cp_index
].aliases
) return NULL
;
1641 return codepages
[cp_index
].aliases
[0];
1644 /* Get the codepage's name for sending to a library or server that
1645 * understands MIME charset names. This function irreversibly maps
1646 * the "System" codepage to the underlying charset. */
1648 get_cp_mime_name(int cp_index
)
1650 if (cp_index
< 0) return "none";
1651 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1652 if (!codepages
[cp_index
].aliases
) return NULL
;
1654 return codepages
[cp_index
].aliases
[0];
1658 is_cp_utf8(int cp_index
)
1660 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1661 return is_cp_ptr_utf8(&codepages
[cp_index
]);
1664 /* This function will be used by the xhtml parser. */
1666 get_cp_highhalf(const unsigned char *name
)
1668 int cp
= get_cp_index(name
);
1670 if (cp
< 0) return NULL
;
1671 cp
&= ~SYSTEM_CHARSET_FLAG
;
1672 return codepages
[cp
].highhalf
;