1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
43 /* This should in principle be unicode_val_T, but because all
44 * the values currently in codepage.inc fit in 16 bits, we can
45 * as well use uint16_t and halve sizeof(struct table_entry)
46 * from 8 bytes to 4. Should other characters ever be needed,
47 * unicode_val_T u : 24 might be a possibility, although it
48 * seems a little unportable as bitfields are in principle
49 * restricted to int, which may be 16-bit. */
53 struct codepage_desc
{
55 unsigned char *const *aliases
;
57 /* The Unicode mappings of codepage bytes 0x80...0xFF.
58 * (0x00...0x7F are assumed to be ASCII in all codepages.)
59 * Because all current values fit in 16 bits, we store them as
60 * uint16_t rather than unicode_val_T. If the codepage does
61 * not use some byte, then @highhalf maps that byte to 0xFFFF,
62 * which C code converts to UCS_REPLACEMENT_CHARACTER where
63 * appropriate. (U+FFFF is reserved and will never be
64 * assigned as a character.) */
65 const uint16_t *highhalf
;
67 /* If some byte in the codepage corresponds to multiple Unicode
68 * characters, then the preferred character is in @highhalf
69 * above, and the rest are listed here in @table. This table
70 * is not used for translating from the codepage to Unicode. */
71 const struct table_entry
*table
;
73 /* Whether to use iconv for translation */
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE
unsigned char *encode_utf8(unicode_val_T u
);
90 NONSTATIC_INLINE
int utf8charlen(const unsigned char *p
);
91 NONSTATIC_INLINE
int unicode_to_cell(unicode_val_T c
);
92 NONSTATIC_INLINE unicode_val_T
utf8_to_unicode(unsigned char **string
,
93 const unsigned char *end
);
/* Table of one-character strings indexed by byte value: strings[b] holds
 * the byte b followed by a terminating NUL. Entries of this table are
 * handed out as conversion results (see u2cp_() and
 * new_translation_table()).
 * NOTE(review): the entries at indexes 0x17 and 0x1F contain "\033" rather
 * than "\027" and "\037" - confirm whether this is intentional. */
95 static const char strings
[256][2] = {
96 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
97 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
98 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
99 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
100 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
101 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
102 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
103 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
104 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
105 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
106 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
107 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
108 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
109 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
110 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
111 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
112 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
113 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
114 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
115 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
116 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
117 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
118 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
119 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
120 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
121 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
122 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
123 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
124 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
125 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
126 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
127 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
/* File-wide iconv conversion descriptor. The value (iconv_t)-1 means that
 * no descriptor is currently open: convert_string() opens one when it sees
 * that value, and free_conv_table() closes any open descriptor and stores
 * (iconv_t)-1 again. */
131 static iconv_t iconv_cd
= (iconv_t
)-1;
135 free_translation_table(struct conv_table
*p
)
139 for (i
= 0; i
< 256; i
++)
141 free_translation_table(p
[i
].u
.tbl
);
146 /* A string used in conversion tables when there is no correct
147 * conversion. This is compared by address and therefore should be a
148 * named array rather than a pointer so that it won't share storage
149 * with any other string literal that happens to have the same
151 static const unsigned char no_str
[] = "*";
154 new_translation_table(struct conv_table
*p
)
158 for (i
= 0; i
< 256; i
++)
160 free_translation_table(p
[i
].u
.tbl
);
161 for (i
= 0; i
< 128; i
++) {
163 p
[i
].u
.str
= strings
[i
];
165 for (; i
< 256; i
++) {
172 #define BIN_SEARCH(table, entry, entries, key, result) \
174 long _s = 0, _e = (entries) - 1; \
176 while (_s <= _e || !((result) = -1)) { \
177 long _m = (_s + _e) / 2; \
179 if ((table)[_m].entry == (key)) { \
183 if ((table)[_m].entry > (key)) _e = _m - 1; \
184 if ((table)[_m].entry < (key)) _s = _m + 1; \
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Flag bit that may be set in a codepage index. The functions in this file
 * clear it (index &= ~SYSTEM_CHARSET_FLAG) before indexing codepages[]. */
195 #define SYSTEM_CHARSET_FLAG 128
/* True when the codepage descriptor pointed to by @cp_ptr is the UTF-8
 * codepage, which is recognised by its aliases pointer being aliases_utf8. */
196 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
198 const unsigned char *
199 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
204 if (u
< 128) return strings
[u
];
206 to
&= ~SYSTEM_CHARSET_FLAG
;
208 if (is_cp_ptr_utf8(&codepages
[to
]))
209 return encode_utf8(u
);
211 /* To mark non breaking spaces in non-UTF-8 strings, we use a
212 * special char NBSP_CHAR. */
213 if (u
== UCS_NO_BREAK_SPACE
) {
214 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
215 else /* NBSP_MODE_ASCII */ return " ";
217 if (u
== UCS_SOFT_HYPHEN
) return "";
220 unicode_val_T strange
= strange_chars
[u
- 0x80];
222 if (!strange
) return NULL
;
223 return u2cp_(strange
, to
, nbsp_mode
);
227 for (j
= 0; j
< 0x80; j
++)
228 if (codepages
[to
].highhalf
[j
] == u
)
229 return strings
[0x80 + j
];
230 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
231 if (codepages
[to
].table
[j
].u
== u
)
232 return strings
[codepages
[to
].table
[j
].c
];
234 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
235 if (s
!= -1) return unicode_7b
[s
].s
;
/* Static output buffer of encode_utf8(). That function clears all seven
 * bytes with memset() and then stores at most six bytes, so the contents
 * always end with a NUL. Each call overwrites the previous contents. */
240 static unsigned char utf_buffer
[7];
242 NONSTATIC_INLINE
unsigned char *
243 encode_utf8(unicode_val_T u
)
245 memset(utf_buffer
, 0, 7);
250 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
251 utf_buffer
[1] = 0x80 | (u
& 0x3f);
252 else if (u
< 0x10000)
253 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
254 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
255 utf_buffer
[2] = 0x80 | (u
& 0x3f);
256 else if (u
< 0x200000)
257 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
258 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
259 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
260 utf_buffer
[3] = 0x80 | (u
& 0x3f);
261 else if (u
< 0x4000000)
262 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
263 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
264 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
265 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
266 utf_buffer
[4] = 0x80 | (u
& 0x3f);
267 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
268 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
269 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
270 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
271 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
272 utf_buffer
[5] = 0x80 | (u
& 0x3f);
277 /* Length in bytes of a UTF-8 character, indexed by its first byte. Illegal
278 * first bytes are given length one and are handled separately. */
279 static const char utf8char_len_tab
[256] = {
280 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
281 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
282 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
283 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
284 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
285 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
286 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
287 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
292 utf8charlen(const unsigned char *p
)
294 return p
? utf8char_len_tab
[*p
] : 0;
298 strlen_utf8(unsigned char **str
)
300 unsigned char *s
= *str
;
301 unsigned char *end
= strchr(s
, '\0');
305 for (x
= 0;; x
++, s
+= len
) {
306 len
= utf8charlen(s
);
307 if (s
+ len
> end
) break;
/* True when byte @p has its high bit clear, i.e. it is a complete
 * one-byte character. */
313 #define utf8_issingle(p) (((p) & 0x80) == 0)
/* True when byte @p can begin a character: it is either a single byte or
 * has both of its two top bits set (so it is not a 10xxxxxx byte). */
314 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
316 /* Start from @current and move back @pos characters, returning the resulting
317 * pointer. The leftmost allowed position is @start. */
319 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
321 if (current
== NULL
|| start
== NULL
|| pos
< 0)
323 while (pos
> 0 && current
!= start
) {
325 if (utf8_islead(*current
))
331 /* Count number of standard terminal cells needed for displaying UTF-8
334 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
339 end
= strchr(utf8_char
, '\0');
341 if(!utf8_char
|| !end
)
344 u
= utf8_to_unicode(&utf8_char
, end
);
346 return unicode_to_cell(u
);
349 /* Count number of standard terminal cells needed for displaying string
350 * with UTF-8 characters. */
352 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
354 int charlen
, cell
, cells
= 0;
357 end
= strchr(string
, '\0');
363 charlen
= utf8charlen(string
);
364 if (string
+ charlen
> end
)
367 cell
= utf8_char2cells(string
, end
);
378 /* Count number of characters in string. */
380 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
382 int charlen
, chars
= 0;
385 end
= strchr(string
, '\0');
391 charlen
= utf8charlen(string
);
392 if (string
+ charlen
> end
)
403 * Count number of bytes from beginning of the string needed for displaying
404 * specified number of cells.
407 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
409 unsigned int bytes
= 0, cells
= 0;
411 assert(max_cells
>=0);
414 end
= strchr(string
, '\0');
420 int cell
= utf8_char2cells(&string
[bytes
], end
);
425 if (cells
> max_cells
)
428 bytes
+= utf8charlen(&string
[bytes
]);
430 if (string
+ bytes
> end
) {
431 bytes
= end
- string
;
439 /* Take @max steps forward from @string in the specified @way, but
440 * not going past @end. Return the resulting address. Store the
441 * number of steps taken to *@count, unless @count is NULL.
443 * This assumes the text is valid UTF-8, and @string and @end point to
444 * character boundaries. If not, it doesn't crash but the results may
447 * This function can do some of the same jobs as utf8charlen(),
448 * utf8_cells2bytes(), and strlen_utf8(). */
450 utf8_step_forward(unsigned char *string
, unsigned char *end
,
451 int max
, enum utf8_step way
, int *count
)
454 unsigned char *current
= string
;
458 if_assert_failed
goto invalid_arg
;
460 end
= strchr(string
, '\0');
463 case UTF8_STEP_CHARACTERS
:
464 while (steps
< max
&& current
< end
) {
466 if (utf8_islead(*current
))
471 case UTF8_STEP_CELLS_FEWER
:
472 case UTF8_STEP_CELLS_MORE
:
473 while (steps
< max
&& current
< end
) {
475 unsigned char *prev
= current
;
/* Pass the address of current so that utf8_to_unicode() can advance it. */
478 u
= utf8_to_unicode(&current
, end
);
479 if (u
== UCS_NO_CHAR
) {
480 /* Assume the incomplete sequence
487 width
= unicode_to_cell(u
);
488 if (way
== UTF8_STEP_CELLS_FEWER
489 && steps
+ width
> max
) {
499 INTERNAL("impossible enum utf8_step");
508 /* Take @max steps backward from @string in the specified @way, but
509 * not going past @start. Return the resulting address. Store the
510 * number of steps taken to *@count, unless @count is NULL.
512 * This assumes the text is valid UTF-8, and @string and @start point
513 * to character boundaries. If not, it doesn't crash but the results
514 * may be inconsistent.
516 * This function can do some of the same jobs as utf8_prevchar(). */
518 utf8_step_backward(unsigned char *string
, unsigned char *start
,
519 int max
, enum utf8_step way
, int *count
)
522 unsigned char *current
= string
;
527 if_assert_failed
goto invalid_arg
;
530 case UTF8_STEP_CHARACTERS
:
531 while (steps
< max
&& current
> start
) {
533 if (utf8_islead(*current
))
538 case UTF8_STEP_CELLS_FEWER
:
539 case UTF8_STEP_CELLS_MORE
:
540 while (steps
< max
) {
541 unsigned char *prev
= current
;
546 if (current
<= start
)
550 } while (current
> start
&& !utf8_islead(*current
));
553 u
= utf8_to_unicode(&look
, prev
);
554 if (u
== UCS_NO_CHAR
) {
555 /* Assume the incomplete sequence
559 width
= unicode_to_cell(u
);
561 if (way
== UTF8_STEP_CELLS_FEWER
562 && steps
+ width
> max
) {
572 INTERNAL("impossible enum utf8_step");
582 * Find out number of standard terminal columns needed for displaying symbol
583 * (glyph) which represents Unicode character c.
585 * TODO: Use wcwidth when it is available. This seems to require:
586 * - Make the configure script check whether <wchar.h> and wcwidth exist.
587 * - Define _XOPEN_SOURCE and include <wchar.h>.
588 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
589 * matches ISO 10646 in all locales.)
590 * However, these do not suffice, because wcwidth depends on LC_CTYPE
591 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
592 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
593 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
594 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
595 * character is apparently not supported in all locales. Why is that?
596 * - Perhaps there is standardese that requires supported characters
597 * to be convertible to multibyte form. Then ELinks could just pick
598 * some UTF-8 locale for its wcwidth purposes.
599 * - Perhaps wcwidth can even return different nonnegative values for
600 * the same ISO 10646 character in different locales. Then ELinks
601 * would have to set LC_CTYPE to match at least the terminal's
602 * charset (which may differ from the LC_CTYPE environment variable,
603 * especially when the master process is serving a slave terminal).
604 * But there is no guarantee that the libc supports all the same
605 * charsets as ELinks does.
606 * For now, it seems safest to avoid the potentially locale-dependent
607 * libc version of wcwidth, and instead use a hardcoded mapping.
609 * @return 2 for double-width glyph, 1 for others.
610 * TODO: May be extended to return 0 for zero-width glyphs
611 * (like composing, maybe unprintable too).
614 unicode_to_cell(unicode_val_T c
)
617 && (c
<= 0x115f /* Hangul Jamo */
620 || (c
>= 0x2e80 && c
<= 0xa4cf
621 && c
!= 0x303f) /* CJK ... Yi */
622 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
623 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
625 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
626 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
627 || (c
>= 0xffe0 && c
<= 0xffe6)
628 || (c
>= 0x20000 && c
<= 0x2fffd)
629 || (c
>= 0x30000 && c
<= 0x3fffd)))
635 /* Fold the case of a Unicode character, so that hotkeys in labels can
636 * be compared case-insensitively. It is unspecified whether the
637 * result will be in upper or lower case. */
639 unicode_fold_label_case(unicode_val_T c
)
641 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
643 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
644 /* For now, this supports only ASCII. It would be possible to
645 * use code generated from CaseFolding.txt of Unicode if the
646 * acknowledgements required by http://www.unicode.org/copyright.html
647 * were added to associated documentation of ELinks. */
648 if (c
>= 0x41 && c
<= 0x5A)
652 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
654 #endif /* CONFIG_UTF8 */
656 NONSTATIC_INLINE unicode_val_T
657 utf8_to_unicode(unsigned char **string
, const unsigned char *end
)
659 unsigned char *str
= *string
;
663 length
= utf8char_len_tab
[str
[0]];
665 if (str
+ length
> end
) {
670 case 1: /* U+0000 to U+007F */
671 if (str
[0] >= 0x80) {
674 return UCS_REPLACEMENT_CHARACTER
;
678 case 2: /* U+0080 to U+07FF */
679 if ((str
[1] & 0xc0) != 0x80)
681 u
= (str
[0] & 0x1f) << 6;
682 u
+= (str
[1] & 0x3f);
686 case 3: /* U+0800 to U+FFFF, except surrogates */
687 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80)
689 u
= (str
[0] & 0x0f) << 12;
690 u
+= ((str
[1] & 0x3f) << 6);
691 u
+= (str
[2] & 0x3f);
692 if (u
< 0x800 || is_utf16_surrogate(u
))
695 case 4: /* U+10000 to U+1FFFFF */
696 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
697 || (str
[3] & 0xc0) != 0x80)
699 u
= (str
[0] & 0x0f) << 18;
700 u
+= ((str
[1] & 0x3f) << 12);
701 u
+= ((str
[2] & 0x3f) << 6);
702 u
+= (str
[3] & 0x3f);
706 case 5: /* U+200000 to U+3FFFFFF */
707 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
708 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80)
710 u
= (str
[0] & 0x0f) << 24;
711 u
+= ((str
[1] & 0x3f) << 18);
712 u
+= ((str
[2] & 0x3f) << 12);
713 u
+= ((str
[3] & 0x3f) << 6);
714 u
+= (str
[4] & 0x3f);
718 case 6: /* U+4000000 to U+7FFFFFFF */
719 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
720 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80
721 || (str
[5] & 0xc0) != 0x80)
723 u
= (str
[0] & 0x01) << 30;
724 u
+= ((str
[1] & 0x3f) << 24);
725 u
+= ((str
[2] & 0x3f) << 18);
726 u
+= ((str
[3] & 0x3f) << 12);
727 u
+= ((str
[4] & 0x3f) << 6);
728 u
+= (str
[5] & 0x3f);
733 INTERNAL("utf8char_len_tab out of range");
736 *string
= str
+ length
;
740 /* The common part of cp2u and cp2utf_8. */
742 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
744 unicode_val_T u
= from
->highhalf
[c
- 0x80];
746 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
750 /* Used for converting input from the terminal. */
752 cp2u(int from
, unsigned char c
)
754 from
&= ~SYSTEM_CHARSET_FLAG
;
756 /* UTF-8 is a multibyte codepage and cannot be handled with
758 assert(!is_cp_ptr_utf8(&codepages
[from
]));
759 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
761 if (c
< 0x80) return c
;
762 else return cp2u_shared(&codepages
[from
], c
);
765 /* This slow and ugly code is used by the terminal utf_8_io */
766 const unsigned char *
767 cp2utf8(int from
, int c
)
769 from
&= ~SYSTEM_CHARSET_FLAG
;
771 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
774 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
778 cp_to_unicode(int codepage
, unsigned char **string
, const unsigned char *end
)
782 if (is_cp_utf8(codepage
))
783 return utf8_to_unicode(string
, end
);
788 ret
= cp2u(codepage
, **string
);
794 #ifdef CONFIG_COMBINE
795 unicode_val_T last_combined
= UCS_BEGIN_COMBINED
- 1;
796 unicode_val_T
**combined
;
797 struct hash
*combined_hash
;
800 get_combined(unicode_val_T
*data
, int length
)
802 struct hash_item
*item
;
806 assert(length
>= 1 && length
<= UCS_MAX_LENGTH_COMBINED
);
807 if_assert_failed
return UCS_NO_CHAR
;
809 if (!combined_hash
) combined_hash
= init_hash8();
810 if (!combined_hash
) return UCS_NO_CHAR
;
811 item
= get_hash_item(combined_hash
, (unsigned char *)data
, length
* sizeof(*data
));
813 if (item
) return (unicode_val_T
)(long)item
->value
;
814 if (last_combined
>= UCS_END_COMBINED
) return UCS_NO_CHAR
;
816 key
= mem_alloc((length
+ 1) * sizeof(*key
));
817 if (!key
) return UCS_NO_CHAR
;
818 for (i
= 0; i
< length
; i
++)
820 key
[i
] = UCS_END_COMBINED
;
823 indeks
= last_combined
- UCS_BEGIN_COMBINED
;
825 combined
= mem_realloc(combined
, sizeof(*combined
) * (indeks
+ 1));
831 combined
[indeks
] = key
;
832 item
= add_hash_item(combined_hash
, (unsigned char *)key
,
833 length
* sizeof(*data
), (void *)(long)(last_combined
));
839 return last_combined
;
845 int i
, end
= last_combined
- UCS_BEGIN_COMBINED
+ 1;
848 free_hash(&combined_hash
);
849 for (i
= 0; i
< end
; i
++)
850 mem_free(combined
[i
]);
851 mem_free_if(combined
);
853 #endif /* CONFIG_COMBINE */
857 add_utf8(struct conv_table
*ct
, unicode_val_T u
, const unsigned char *str
)
859 unsigned char *p
= encode_utf8(u
);
862 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
864 struct conv_table
*nct
;
866 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
867 if_assert_failed
return;
869 nct
= mem_calloc(256, sizeof(*nct
));
871 new_translation_table(nct
);
879 assertm(!ct
[*p
].t
, "bad utf encoding #2");
880 if_assert_failed
return;
882 if (ct
[*p
].u
.str
== no_str
)
886 /* A conversion table from some charset to UTF-8.
887 * If it is from UTF-8 to UTF-8, it converts each byte separately.
888 * Unlike in other translation tables, the strings in elements 0x80 to
889 * 0xFF are allocated dynamically. */
890 struct conv_table utf_table
[256];
/* State flag for utf_table. While it is nonzero,
 * get_translation_table_to_utf8() clears utf_table with memset() before
 * filling it; free_conv_table() calls free_utf_table() only when it is zero.
 * NOTE(review): the statements that change this flag are not visible in
 * this listing - confirm where it is cleared and set again. */
891 int utf_table_init
= 1;
898 /* Cast away const. */
899 for (i
= 128; i
< 256; i
++)
900 mem_free((unsigned char *) utf_table
[i
].u
.str
);
903 static struct conv_table
*
904 get_translation_table_to_utf8(int from
)
909 if (from
== -1) return NULL
;
910 from
&= ~SYSTEM_CHARSET_FLAG
;
911 if (from
== lfr
) return utf_table
;
913 if (utf_table_init
) {
914 memset(utf_table
, 0, sizeof(utf_table
));
919 for (i
= 0; i
< 128; i
++)
920 utf_table
[i
].u
.str
= strings
[i
];
922 if (is_cp_ptr_utf8(&codepages
[from
])) {
923 for (i
= 128; i
< 256; i
++)
924 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
928 for (i
= 128; i
< 256; i
++) {
929 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
932 utf_table
[i
].u
.str
= NULL
;
934 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
937 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
938 unicode_val_T u
= codepages
[from
].table
[i
].u
;
940 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
941 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
942 stracpy(encode_utf8(u
));
945 for (i
= 128; i
< 256; i
++)
946 if (!utf_table
[i
].u
.str
)
947 utf_table
[i
].u
.str
= stracpy(no_str
);
952 /* A conversion table between two charsets, where the target is not UTF-8. */
953 static struct conv_table table
[256];
954 static int first
= 1;
957 free_conv_table(void)
959 if (!utf_table_init
) free_utf_table();
961 memset(table
, 0, sizeof(table
));
964 new_translation_table(table
);
966 if (iconv_cd
!= (iconv_t
)-1) {
967 iconv_close(iconv_cd
);
968 iconv_cd
= (iconv_t
)-1;
975 get_translation_table(int from
, int to
)
980 from
&= ~SYSTEM_CHARSET_FLAG
;
981 to
&= ~SYSTEM_CHARSET_FLAG
;
983 memset(table
, 0, sizeof(table
));
987 if (codepages
[from
].iconv
) {
988 struct conv_table
*table2
= get_translation_table_to_utf8(34);
990 if (table2
) table2
->iconv_cp
= from
;
994 if (/*from == to ||*/ from
== -1 || to
== -1)
996 if (is_cp_ptr_utf8(&codepages
[to
])) {
997 struct conv_table
*table2
= get_translation_table_to_utf8(from
);
999 if (table2
) table2
->iconv_cp
= -1;
1002 if (from
== lfr
&& to
== lto
)
1006 new_translation_table(table
);
1008 if (is_cp_ptr_utf8(&codepages
[from
])) {
1011 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1012 add_utf8(table
, UCS_NO_BREAK_SPACE
, strings
[NBSP_CHAR
]);
1013 add_utf8(table
, UCS_SOFT_HYPHEN
, "");
1015 for (i
= 0x80; i
<= 0xFF; i
++)
1016 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
1018 codepages
[to
].highhalf
[i
- 0x80],
1021 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
1022 add_utf8(table
, codepages
[to
].table
[i
].u
,
1023 strings
[codepages
[to
].table
[i
].c
]);
1025 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
1026 if (unicode_7b
[i
].x
>= 0x80)
1027 add_utf8(table
, unicode_7b
[i
].x
,
1033 for (i
= 128; i
< 256; i
++) {
1034 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
1035 const unsigned char *u
;
1037 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
1038 if (u
) table
[i
].u
.str
= u
;
1047 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
1050 if (*s1
> *s2
) return 1;
1051 if (*s1
< *s2
) return -1;
1057 return *s2
? -1 : 0;
1060 /* Entity cache debugging purpose. */
1062 #define DEBUG_ENTITY_CACHE
1064 #undef DEBUG_ENTITY_CACHE
1067 struct entity_cache
{
1071 const unsigned char *result
;
1072 unsigned char str
[20]; /* Suffice in any case. */
1075 /* comparison function for qsort() */
1077 hits_cmp(const void *v1
, const void *v2
)
1079 const struct entity_cache
*a
= v1
, *b
= v2
;
1081 if (a
->hits
== b
->hits
) return 0;
1082 if (a
->hits
> b
->hits
) return -1;
1087 compare_entities(const void *key_
, const void *element_
)
1089 struct string
*key
= (struct string
*) key_
;
1090 struct entity
*element
= (struct entity
*) element_
;
1091 int length
= key
->length
;
1092 unsigned char *first
= key
->source
;
1093 unsigned char *second
= element
->s
;
1095 return xxstrcmp(first
, second
, length
);
1098 const unsigned char *
1099 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
1101 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1102 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1103 will go in [0] table */
1104 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
1105 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
1106 unsigned int slen
= 0;
1107 const unsigned char *result
= NULL
;
1109 /* Note that an object of static storage duration is automatically
1110 * initialised to zero in C. */
1112 if (strlen
<= 0) return NULL
;
1115 /* TODO: caching UTF-8 */
1116 encoding
&= ~SYSTEM_CHARSET_FLAG
;
1117 if (is_cp_ptr_utf8(&codepages
[encoding
]))
1119 #endif /* CONFIG_UTF8 */
1121 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1122 * + google + slashdot + websites that result from a search for test on google,
1123 * + various ones) shows quite impressive improvement:
1125 * 0: hits=2459 l=4 st='nbsp'
1126 * 1: hits=2152 l=6 st='eacute'
1127 * 2: hits=235 l=6 st='egrave'
1128 * 3: hits=136 l=6 st='agrave'
1129 * 4: hits=100 l=3 st='amp'
1130 * 5: hits=40 l=5 st='laquo'
1131 * 6: hits=8 l=4 st='copy'
1132 * 7: hits=5 l=2 st='gt'
1133 * 8: hits=2 l=2 st='lt'
1134 * 9: hits=1 l=6 st='middot'
1136 * Most of the time cache hit ratio is near 95%.
1138 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1139 * count is kept < 2 (worst case 1.58). Not so bad ;)
1143 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1144 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1146 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1149 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1150 if (entity_cache
[slen
][i
].encoding
== encoding
1151 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1152 #ifdef DEBUG_ENTITY_CACHE
1153 static double total_iter
= 0;
1154 static unsigned long hit_count
= 0;
1156 total_iter
+= i
+ 1;
1158 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1160 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1161 entity_cache
[slen
][i
].hits
++;
1162 return entity_cache
[slen
][i
].result
;
1165 #ifdef DEBUG_ENTITY_CACHE
1166 fprintf(stderr
, "miss\n");
1171 #endif /* CONFIG_UTF8 */
1172 if (*str
== '#') { /* Numeric entity. */
1173 int l
= (int) strlen
;
1174 unsigned char *st
= (unsigned char *) str
;
1175 unicode_val_T n
= 0;
1177 if (l
== 1) goto end
; /* &#; ? */
1179 if ((*st
| 32) == 'x') { /* Hexadecimal */
1181 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1184 unsigned char c
= (*(st
++) | 32);
1187 n
= (n
<< 4) | (c
- '0');
1188 else if (isxdigit(c
))
1189 n
= (n
<< 4) | (c
- 'a' + 10);
1191 goto end
; /* Bad char. */
1193 } else { /* Decimal */
1194 if (l
> 10) goto end
; /* 4294967295 max. */
1196 unsigned char c
= *(st
++);
1199 n
= n
* 10 + c
- '0';
1201 goto end
; /* Bad char. */
1202 /* Limit to 0xFFFFFFFF. */
1203 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1208 result
= u2cp(n
, encoding
);
1210 #ifdef DEBUG_ENTITY_CACHE
1211 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1213 } else { /* Text entity. */
1214 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1215 struct entity
*element
= bsearch((void *) &key
, entities
,
1220 if (element
) result
= u2cp(element
->c
, encoding
);
1224 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1227 #endif /* CONFIG_UTF8 */
1229 /* Take care of potential buffer overflow. */
1230 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1231 struct entity_cache
*ece
;
1233 /* Sort entries by hit order. */
1234 if (nb_entity_cache
[slen
] > 1)
1235 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1236 sizeof(entity_cache
[slen
][0]), hits_cmp
);
1238 /* Increment number of cache entries if possible.
1239 * Else, just replace the least used entry. */
1240 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1241 ece
= &entity_cache
[slen
][nb_entity_cache
[slen
] - 1];
1243 /* Copy new entry to cache. */
1245 ece
->strlen
= strlen
;
1246 ece
->encoding
= encoding
;
1247 ece
->result
= result
;
1248 memcpy(ece
->str
, str
, strlen
);
1249 ece
->str
[strlen
] = '\0';
1252 #ifdef DEBUG_ENTITY_CACHE
1253 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1254 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1259 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1260 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1261 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1262 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1263 entity_cache
[slen
][i
].str
);
1264 fprintf(stderr
, "-----------------\n");
1266 #endif /* DEBUG_ENTITY_CACHE */
/* Convert a string from codepage cp with the help of convert_table,
 * expanding character entities on the way.
 *
 * chars2, charslen2  input bytes and their count
 * cp                 source codepage index; replaced by
 *                    convert_table->iconv_cp when that is positive
 * mode               CSM_DEFAULT, CSM_FORM or CSM_NONE; decides how
 *                    ampersand sequences are treated
 * length             if non-NULL, receives the length of the result
 * callback           receives converted output in pieces, together
 *                    with callback_data
 *
 * The result is collected in a buffer obtained from mem_alloc() and
 * grown with mem_realloc(); the buffer is NUL-terminated before it is
 * handed out. NULL is returned when the first allocation fails. */
1272 convert_string(struct conv_table
*convert_table
,
1273 unsigned char *chars2
, int charslen2
, int cp
,
1274 enum convert_string_mode mode
, int *length
,
1275 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1276 void *callback_data
)
1278 unsigned char *buffer
;
1281 unsigned char *chars
= chars2
;
1282 int charslen
= charslen2
;
/* State of the iconv path. The static objects keep their values
 * between calls; presumably this lets an incomplete input sequence
 * saved in iconv_input be continued by a later call.
 * NOTE(review): static state also makes the function non-reentrant -
 * confirm that no caller depends on reentrancy. */
1285 static char iconv_input
[256];
1286 static char iconv_output
[256 * 8];
1287 static size_t iconv_offset
;
1288 static int iconv_cp
;
1289 static size_t iconv_inleft
;
1290 size_t iconv_outleft
= 256 * 8;
1293 int chars_offset
= 0;
/* No table and no ampersand in the input: nothing needs translating,
 * so the input is passed on unchanged. */
1295 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1297 if (charslen
) callback(callback_data
, chars
, charslen
);
1300 return memacpy(chars
, charslen
);
1305 if (convert_table
&& convert_table
->iconv_cp
> 0) {
1307 cp
= convert_table
->iconv_cp
;
1309 is_iconv
= codepages
[cp
& ~SYSTEM_CHARSET_FLAG
].iconv
;
1314 /* Buffer allocation */
1316 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1317 if (!buffer
) return NULL
;
1322 size_t before
, to_copy
;
/* NOTE(review): iconv_cd is compared with (iconv_t)-1 below; the
 * >= 0 test on an iconv_t looks inconsistent with that - confirm. */
1325 if (iconv_cd
>= 0) {
1326 if (cp
!= iconv_cp
) {
1327 iconv_close(iconv_cd
);
1328 iconv_cd
= (iconv_t
)-1;
1331 if (iconv_cd
== (iconv_t
)-1) {
1333 iconv_cd
= iconv_open("utf-8", get_cp_mime_name(cp
));
1334 if (iconv_cd
== (iconv_t
)-1) {
/* Feed iconv at most 256 bytes per round: the bytes carried over from
 * the previous round (iconv_offset) plus fresh input. */
1341 to_copy
= charslen2
- chars_offset
;
1342 if (to_copy
> 256 - iconv_offset
) to_copy
= 256 - iconv_offset
;
/* NOTE(review): chars is pointed at iconv_output further down. If the
 * repeat label sits above this copy, every round after the first
 * reads from the output buffer instead of the caller input;
 * chars2 + chars_offset is presumably what was meant - confirm. */
1343 memcpy(iconv_input
+ iconv_offset
, chars
+ chars_offset
, to_copy
);
1344 iconv_outleft
= 256 * 8;
1345 iconv_inleft
= iconv_offset
+ to_copy
;
1347 outp
= iconv_output
;
1348 before
= iconv_inleft
;
1350 v
= iconv(iconv_cd
, &inp
, &iconv_inleft
, &outp
, &iconv_outleft
);
1351 chars_offset
+= before
- iconv_inleft
;
1352 charslen
= 256 * 8 - iconv_outleft
;
1354 chars
= (unsigned char *)iconv_output
;
/* Keep the unconsumed input for the next round.
 * NOTE(review): if inp points into iconv_input the two regions can
 * overlap, which memcpy does not permit; memmove would be the safe
 * call - confirm. */
1360 memcpy(iconv_input
, inp
, iconv_inleft
);
1361 iconv_offset
= iconv_inleft
;
1376 loop
= chars_offset
< charslen2
;
1381 while (charspos
< charslen
) {
1382 const unsigned char *translit
;
1385 buffer[bufferpos++] = chars[charspos++]; \
1390 if (chars
[charspos
] != '&') {
1391 struct conv_table
*t
;
/* Bytes below 128, and every byte when there is no table, are copied
 * as they are. */
1394 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
/* Descend through nested tables for as long as the entry is flagged
 * as a sub-table (.t), consuming one input byte per level. */
1399 while (t
[chars
[i
]].t
) {
1400 t
= t
[chars
[i
++]].u
.tbl
;
1401 if (i
>= charslen
) PUTC
;
1404 translit
= t
[chars
[i
]].u
.str
;
1407 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1411 int start
= charspos
+ 1;
1415 && (isasciialpha(chars
[i
])
1416 || isdigit(chars
[i
])
1417 || (chars
[i
] == '#')))
1420 /* This prevents bug 213: we were expanding "entities"
1421 * in URL query strings. */
1422 /* XXX: But this disables    usage, which
1423 * appears to be relatively common! --pasky */
1424 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1426 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1427 translit
= get_entity_string(&chars
[start
], i
- start
,
1429 if (chars
[i
] != ';') {
1430 /* Eat    <foo> happily, but
1431 * pull back from the character after
1432 * entity string if it is not the valid
1437 if (!translit
) PUTC
;
1438 charspos
= i
+ (i
< charslen
);
1442 if (!translit
[0]) continue;
1445 buffer
[bufferpos
++] = translit
[0];
1453 buffer
[bufferpos
++] = *(translit
++);
1455 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1458 buffer
[bufferpos
] = 0;
1459 callback(callback_data
, buffer
, bufferpos
);
1462 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1474 if (loop
) goto repeat
;
1478 buffer
[bufferpos
] = 0;
1479 if (length
) *length
= bufferpos
;
1482 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1491 #ifndef USE_FASTFIND
/* Linear-search variant of get_cp_index(), placed under the
 * USE_FASTFIND #ifndef that precedes it.
 *
 * The name System is special-cased: with HAVE_LANGINFO_CODESET it is
 * replaced by the value of nl_langinfo(CODESET) and syscp is set to
 * SYSTEM_CHARSET_FLAG. The name is then compared case-insensitively
 * (c_strcasecmp) with every alias of every codepage. The last visible
 * statement retries with us-ascii and ORs syscp into that result;
 * NOTE(review): presumably this is reached only when the System
 * lookup found nothing - confirm. */
1493 get_cp_index(const unsigned char *name
)
1498 if (!c_strcasecmp(name
, "System")) {
1499 #if HAVE_LANGINFO_CODESET
1500 name
= nl_langinfo(CODESET
);
1501 syscp
= SYSTEM_CHARSET_FLAG
;
/* Scan every alias of every codepage; the codepages[] entry with a
 * NULL name ends the table and a NULL alias ends each alias list. */
1507 for (i
= 0; codepages
[i
].name
; i
++) {
1508 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1509 /* In the past, we looked for the longest substring
1510 * in all the names; it is way too expensive, though:
1512 * % cumulative self self total
1513 * time seconds seconds calls us/call us/call name
1514 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1516 * Anything called from redraw_screen() is in fact
1517 * relatively expensive, even if it's called just
1518 * once. So we will do a simple strcasecmp() here.
1521 if (!c_strcasecmp(name
, codepages
[i
].aliases
[a
]))
1527 return get_cp_index("us-ascii") | syscp
;
/* Cursor shared by charsets_list_reset() and charsets_list_next():
 * i_name indexes codepages[] and i_alias indexes the aliases[] list
 * of that entry. */
1535 static unsigned int i_name
= 0;
1536 static unsigned int i_alias
= 0;
1538 /* Reset internal list pointer */
1540 charsets_list_reset(void)
1546 /* Returns a pointer to a struct that contains current key and data pointers
1547 * and increment internal pointer. It returns NULL when key is NULL. */
1548 struct fastfind_key_value
*
1549 charsets_list_next(void)
/* kv has static storage, so its contents are overwritten by every
 * call. NOTE(review): presumably the address of kv is what gets
 * returned, which would make each result valid only until the next
 * call - confirm. */
1551 static struct fastfind_key_value kv
;
/* The codepages[] entry with a NULL name marks the end of the list. */
1553 if (!codepages
[i_name
].name
) return NULL
;
/* Key is the current alias; data is the codepage description that the
 * alias belongs to. */
1555 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1556 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
/* Look ahead for another alias of the same codepage.
 * NOTE(review): presumably the cursor then moves to that alias, or
 * else to the first alias of the next codepage - confirm. */
1558 if (codepages
[i_name
].aliases
[i_alias
+ 1])
/* Fastfind index named charsets_lookup. Its keys are supplied by the
 * charsets_list_reset() / charsets_list_next() iterator pair, i.e. one
 * key per codepage alias. */
1568 static struct fastfind_index ff_charsets_index
1569 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1571 /* Searches for a charset named @name or one of its aliases and
1572 * returns its index, or -1 if not found. */
1574 get_cp_index(const unsigned char *name
)
1576 const struct codepage_desc
*codepage
;
/* The name System is special-cased: with HAVE_LANGINFO_CODESET it is
 * replaced by the value of nl_langinfo(CODESET), and syscp carries
 * SYSTEM_CHARSET_FLAG into the returned index. */
1579 if (!c_strcasecmp(name
, "System")) {
1580 #if HAVE_LANGINFO_CODESET
1581 name
= nl_langinfo(CODESET
);
1582 syscp
= SYSTEM_CHARSET_FLAG
;
/* Look the name up in the fastfind index of charset aliases. */
1588 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
/* A hit must point into codepages[]; its distance from the start of
 * the array is the codepage index. */
1590 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1591 return (codepage
- codepages
) | syscp
;
/* Fallback: retry with us-ascii and keep the system flag.
 * NOTE(review): presumably reached only when the System lookup found
 * nothing - confirm. */
1594 return get_cp_index("us-ascii") | syscp
;
1601 #endif /* USE_FASTFIND */
/* Build the fastfind index of charset aliases (ff_charsets_index),
 * passing the FF_COMPRESS flag to fastfind_index(). */
1604 init_charsets_lookup(void)
1607 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
/* Counterpart of init_charsets_lookup(): hands ff_charsets_index to
 * fastfind_done(), which presumably releases the index storage. */
1612 free_charsets_lookup(void)
1615 fastfind_done(&ff_charsets_index
);
1619 /* Get the codepage's name for displaying to the user, or NULL if
1620 * @cp_index is one past the end. In the future, we might want to
1621 * localize these with gettext. So it may be best not to use this
1622 * function if the name will have to be converted back to an
1625 get_cp_name(int cp_index
)
1627 if (cp_index
< 0) return "none";
1628 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1630 return codepages
[cp_index
].name
;
1633 /* Get the codepage's name for saving to a configuration file. These
1634 * names can be converted back to indexes, even in future versions of
1637 get_cp_config_name(int cp_index
)
1639 if (cp_index
< 0) return "none";
1640 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1641 if (!codepages
[cp_index
].aliases
) return NULL
;
1643 return codepages
[cp_index
].aliases
[0];
1646 /* Get the codepage's name for sending to a library or server that
1647 * understands MIME charset names. This function irreversibly maps
1648 * the "System" codepage to the underlying charset. */
1650 get_cp_mime_name(int cp_index
)
1652 if (cp_index
< 0) return "none";
1653 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1654 if (!codepages
[cp_index
].aliases
) return NULL
;
1656 return codepages
[cp_index
].aliases
[0];
1660 is_cp_utf8(int cp_index
)
1662 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1663 return is_cp_ptr_utf8(&codepages
[cp_index
]);
1666 /* This function will be used by the xhtml parser. */
1668 get_cp_highhalf(const unsigned char *name
)
1670 int cp
= get_cp_index(name
);
1672 if (cp
< 0) return NULL
;
1673 cp
&= ~SYSTEM_CHARSET_FLAG
;
1674 return codepages
[cp
].highhalf
;