1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
37 /* This should in principle be unicode_val_T, but because all
38 * the values currently in codepage.inc fit in 16 bits, we can
39 * as well use uint16_t and halve sizeof(struct table_entry)
40 * from 8 bytes to 4. Should other characters ever be needed,
41 * unicode_val_T u : 24 might be a possibility, although it
42 * seems a little unportable as bitfields are in principle
43 * restricted to int, which may be 16-bit. */
47 struct codepage_desc
{
49 unsigned char *const *aliases
;
51 /* The Unicode mappings of codepage bytes 0x80...0xFF.
52 * (0x00...0x7F are assumed to be ASCII in all codepages.)
53 * Because all current values fit in 16 bits, we store them as
54 * uint16_t rather than unicode_val_T. If the codepage does
55 * not use some byte, then @highhalf maps that byte to 0xFFFF,
56 * which C code converts to UCS_REPLACEMENT_CHARACTER where
57 * appropriate. (U+FFFF is reserved and will never be
58 * assigned as a character.) */
59 const uint16_t *highhalf
;
61 /* If some byte in the codepage corresponds to multiple Unicode
62 * characters, then the preferred character is in @highhalf
63 * above, and the rest are listed here in @table. This table
64 * is not used for translating from the codepage to Unicode. */
65 const struct table_entry
*table
;
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
73 static const char strings
[256][2] = {
74 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
75 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
76 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
77 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
78 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
79 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
80 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
81 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
82 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
83 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
84 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
85 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
86 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
87 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
88 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
89 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
90 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
91 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
92 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
93 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
94 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
95 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
96 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
97 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
98 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
99 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
100 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
101 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
102 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
103 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
104 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
105 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
109 free_translation_table(struct conv_table
*p
)
113 for (i
= 0; i
< 256; i
++)
115 free_translation_table(p
[i
].u
.tbl
);
/* A string used in conversion tables when there is no correct
 * conversion.  This is compared by address and therefore should be a
 * named array rather than a pointer so that it won't share storage
 * with any other string literal that happens to have the same
 * contents.  */
125 static const unsigned char no_str
[] = "*";
128 new_translation_table(struct conv_table
*p
)
132 for (i
= 0; i
< 256; i
++)
134 free_translation_table(p
[i
].u
.tbl
);
135 for (i
= 0; i
< 128; i
++) {
137 p
[i
].u
.str
= strings
[i
];
139 for (; i
< 256; i
++) {
/* Binary search in @array, which holds @entries elements sorted in
 * ascending order of their member @entry.  Stores the index of the
 * element whose @entry equals @key in @result, or -1 if there is none. */
#define BIN_SEARCH(array, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	(result) = -1;							\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((array)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		}							\
		if ((array)[_mid].entry > (key))			\
			_hi = _mid - 1;					\
		else							\
			_lo = _mid + 1;					\
	}								\
}
161 static const unicode_val_T strange_chars[32] = {
162 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
163 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
164 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
165 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
168 #define SYSTEM_CHARSET_FLAG 128
169 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
171 const unsigned char *
172 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
177 if (u
< 128) return strings
[u
];
179 to
&= ~SYSTEM_CHARSET_FLAG
;
182 if (is_cp_ptr_utf8(&codepages
[to
]))
183 return encode_utf8(u
);
184 #endif /* CONFIG_UTF8 */
186 /* To mark non breaking spaces in non-UTF-8 strings, we use a
187 * special char NBSP_CHAR. */
188 if (u
== UCS_NO_BREAK_SPACE
) {
189 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
190 else /* NBSP_MODE_ASCII */ return " ";
192 if (u
== UCS_SOFT_HYPHEN
) return "";
195 unicode_val_T strange
= strange_chars
[u
- 0x80];
197 if (!strange
) return NULL
;
198 return u2cp_(strange
, to
, nbsp_mode
);
202 for (j
= 0; j
< 0x80; j
++)
203 if (codepages
[to
].highhalf
[j
] == u
)
204 return strings
[0x80 + j
];
205 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
206 if (codepages
[to
].table
[j
].u
== u
)
207 return strings
[codepages
[to
].table
[j
].c
];
209 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
210 if (s
!= -1) return unicode_7b
[s
].s
;
215 static unsigned char utf_buffer
[7];
218 inline unsigned char *
219 encode_utf8(unicode_val_T u
)
221 static unsigned char *
222 encode_utf8(unicode_val_T u
)
223 #endif /* CONFIG_UTF8 */
225 memset(utf_buffer
, 0, 7);
230 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
231 utf_buffer
[1] = 0x80 | (u
& 0x3f);
232 else if (u
< 0x10000)
233 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
234 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
235 utf_buffer
[2] = 0x80 | (u
& 0x3f);
236 else if (u
< 0x200000)
237 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
238 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
239 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
240 utf_buffer
[3] = 0x80 | (u
& 0x3f);
241 else if (u
< 0x4000000)
242 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
243 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
244 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
245 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
246 utf_buffer
[4] = 0x80 | (u
& 0x3f);
247 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
248 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
249 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
250 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
251 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
252 utf_buffer
[5] = 0x80 | (u
& 0x3f);
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Illegal first bytes get length one and are handled differently. */
260 static const char utf8char_len_tab
[256] = {
261 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
262 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
263 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
264 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
265 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
266 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
267 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
268 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
271 inline int utf8charlen(const unsigned char *p
)
273 return p
? utf8char_len_tab
[*p
] : 0;
/* Count the characters of the NUL-terminated UTF-8 string *@str.
 * On return *@str points just past the last character that fits
 * completely before the terminating NUL. */
inline int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *terminator = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int step = utf8charlen(pos);

		if (pos + step > terminator)
			break;
		pos += step;
		count++;
	}

	*str = pos;
	return count;
}
292 #define utf8_issingle(p) (((p) & 0x80) == 0)
293 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Starting at @current, move back over @pos characters and return the
 * resulting pointer.  The walk never goes to the left of @start.
 * Returns NULL if @current or @start is NULL or if @pos is negative. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	int remaining = pos;

	if (!current || !start || remaining < 0)
		return NULL;

	for (; remaining > 0 && current != start; ) {
		--current;
		/* Only lead bytes complete a character. */
		if (utf8_islead(*current))
			--remaining;
	}

	return current;
}
310 /* Count number of standard terminal cells needed for displaying UTF-8
313 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
318 end
= strchr(utf8_char
, '\0');
320 if(!utf8_char
|| !end
)
323 u
= utf8_to_unicode(&utf8_char
, end
);
325 return unicode_to_cell(u
);
/* Count number of standard terminal cells needed for displaying string
 * with UTF-8 characters.
 *
 * @end may be NULL if @string is NUL-terminated.  Returns -1 if
 * @string is NULL or if utf8_char2cells() reports an error. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Reject NULL before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count number of characters in string.
 *
 * @end may be NULL if @string is NUL-terminated.  Only characters that
 * fit completely before @end are counted.  Returns -1 if @string is
 * NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Reject NULL before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count number of bytes from beginning of the string needed for displaying
 * specified number of cells.
 *
 * @end may be NULL if @string is NUL-terminated.  The result never
 * exceeds @end - @string.  Returns -1 if @string is NULL or if
 * utf8_char2cells() reports an error.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Reject NULL before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	if (!end)
		return -1;

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		/* Stop before the character that would not fit. */
		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
418 /* Take @max steps forward from @string in the specified @way, but
419 * not going past @end. Return the resulting address. Store the
420 * number of steps taken to *@count, unless @count is NULL.
422 * This assumes the text is valid UTF-8, and @string and @end point to
423 * character boundaries. If not, it doesn't crash but the results may
426 * This function can do some of the same jobs as utf8charlen(),
427 * utf8_cells2bytes(), and strlen_utf8(). */
429 utf8_step_forward(unsigned char *string
, unsigned char *end
,
430 int max
, enum utf8_step way
, int *count
)
433 unsigned char *current
= string
;
437 if_assert_failed
goto invalid_arg
;
439 end
= strchr(string
, '\0');
442 case UTF8_STEP_CHARACTERS
:
443 while (steps
< max
&& current
< end
) {
445 if (utf8_islead(*current
))
450 case UTF8_STEP_CELLS_FEWER
:
451 case UTF8_STEP_CELLS_MORE
:
452 while (steps
< max
) {
454 unsigned char *prev
= current
;
457 u
= utf8_to_unicode(¤t
, end
);
458 if (u
== UCS_NO_CHAR
) {
459 /* Assume the incomplete sequence
466 width
= unicode_to_cell(u
);
467 if (way
== UTF8_STEP_CELLS_FEWER
468 && steps
+ width
> max
) {
478 INTERNAL("impossible enum utf8_step");
487 /* Take @max steps backward from @string in the specified @way, but
488 * not going past @start. Return the resulting address. Store the
489 * number of steps taken to *@count, unless @count is NULL.
491 * This assumes the text is valid UTF-8, and @string and @start point
492 * to character boundaries. If not, it doesn't crash but the results
493 * may be inconsistent.
495 * This function can do some of the same jobs as utf8_prevchar(). */
497 utf8_step_backward(unsigned char *string
, unsigned char *start
,
498 int max
, enum utf8_step way
, int *count
)
501 unsigned char *current
= string
;
506 if_assert_failed
goto invalid_arg
;
509 case UTF8_STEP_CHARACTERS
:
510 while (steps
< max
&& current
> start
) {
512 if (utf8_islead(*current
))
517 case UTF8_STEP_CELLS_FEWER
:
518 case UTF8_STEP_CELLS_MORE
:
519 while (steps
< max
) {
520 unsigned char *prev
= current
;
525 if (current
<= start
)
529 } while (current
> start
&& !utf8_islead(*current
));
532 u
= utf8_to_unicode(&look
, prev
);
533 if (u
== UCS_NO_CHAR
) {
534 /* Assume the incomplete sequence
538 width
= unicode_to_cell(u
);
540 if (way
== UTF8_STEP_CELLS_FEWER
541 && steps
+ width
> max
) {
551 INTERNAL("impossible enum utf8_step");
 * Find out number of standard terminal columns needed for displaying symbol
562 * (glyph) which represents Unicode character c.
564 * TODO: Use wcwidth when it is available. This seems to require:
565 * - Make the configure script check whether <wchar.h> and wcwidth exist.
566 * - Define _XOPEN_SOURCE and include <wchar.h>.
567 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
568 * matches ISO 10646 in all locales.)
569 * However, these do not suffice, because wcwidth depends on LC_CTYPE
570 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
571 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
572 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
573 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
574 * character is apparently not supported in all locales. Why is that?
575 * - Perhaps there is standardese that requires supported characters
 *     to be convertible to multibyte form.  Then ELinks could just pick
577 * some UTF-8 locale for its wcwidth purposes.
578 * - Perhaps wcwidth can even return different nonnegative values for
579 * the same ISO 10646 character in different locales. Then ELinks
580 * would have to set LC_CTYPE to match at least the terminal's
581 * charset (which may differ from the LC_CTYPE environment variable,
582 * especially when the master process is serving a slave terminal).
583 * But there is no guarantee that the libc supports all the same
584 * charsets as ELinks does.
585 * For now, it seems safest to avoid the potentially locale-dependent
586 * libc version of wcwidth, and instead use a hardcoded mapping.
588 * @return 2 for double-width glyph, 1 for others.
589 * TODO: May be extended to return 0 for zero-width glyphs
590 * (like composing, maybe unprintable too).
593 unicode_to_cell(unicode_val_T c
)
596 && (c
<= 0x115f /* Hangul Jamo */
599 || (c
>= 0x2e80 && c
<= 0xa4cf
600 && c
!= 0x303f) /* CJK ... Yi */
601 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
602 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
604 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
605 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
606 || (c
>= 0xffe0 && c
<= 0xffe6)
607 || (c
>= 0x20000 && c
<= 0x2fffd)
608 || (c
>= 0x30000 && c
<= 0x3fffd)))
614 /* Fold the case of a Unicode character, so that hotkeys in labels can
615 * be compared case-insensitively. It is unspecified whether the
616 * result will be in upper or lower case. */
618 unicode_fold_label_case(unicode_val_T c
)
620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
622 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
623 /* For now, this supports only ASCII. It would be possible to
624 * use code generated from CaseFolding.txt of Unicode if the
625 * acknowledgements required by http://www.unicode.org/copyright.html
626 * were added to associated documentation of ELinks. */
627 if (c
>= 0x41 && c
<= 0x5A)
631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
635 utf8_to_unicode(unsigned char **string
, const unsigned char *end
)
637 unsigned char *str
= *string
;
641 length
= utf8char_len_tab
[str
[0]];
643 if (str
+ length
> end
) {
648 case 1: /* U+0000 to U+007F */
649 if (str
[0] >= 0x80) {
652 return UCS_REPLACEMENT_CHARACTER
;
656 case 2: /* U+0080 to U+07FF */
657 if ((str
[1] & 0xc0) != 0x80)
659 u
= (str
[0] & 0x1f) << 6;
660 u
+= (str
[1] & 0x3f);
664 case 3: /* U+0800 to U+FFFF, except surrogates */
665 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80)
667 u
= (str
[0] & 0x0f) << 12;
668 u
+= ((str
[1] & 0x3f) << 6);
669 u
+= (str
[2] & 0x3f);
670 if (u
< 0x800 || is_utf16_surrogate(u
))
673 case 4: /* U+10000 to U+1FFFFF */
674 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
675 || (str
[3] & 0xc0) != 0x80)
677 u
= (str
[0] & 0x0f) << 18;
678 u
+= ((str
[1] & 0x3f) << 12);
679 u
+= ((str
[2] & 0x3f) << 6);
680 u
+= (str
[3] & 0x3f);
684 case 5: /* U+200000 to U+3FFFFFF */
685 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
686 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80)
688 u
= (str
[0] & 0x0f) << 24;
689 u
+= ((str
[1] & 0x3f) << 18);
690 u
+= ((str
[2] & 0x3f) << 12);
691 u
+= ((str
[3] & 0x3f) << 6);
692 u
+= (str
[4] & 0x3f);
696 case 6: /* U+4000000 to U+7FFFFFFF */
697 if ((str
[1] & 0xc0) != 0x80 || (str
[2] & 0xc0) != 0x80
698 || (str
[3] & 0xc0) != 0x80 || (str
[4] & 0xc0) != 0x80
699 || (str
[5] & 0xc0) != 0x80)
701 u
= (str
[0] & 0x01) << 30;
702 u
+= ((str
[1] & 0x3f) << 24);
703 u
+= ((str
[2] & 0x3f) << 18);
704 u
+= ((str
[3] & 0x3f) << 12);
705 u
+= ((str
[4] & 0x3f) << 6);
706 u
+= (str
[5] & 0x3f);
711 INTERNAL("utf8char_len_tab out of range");
714 *string
= str
+ length
;
717 #endif /* CONFIG_UTF8 */
719 /* The common part of cp2u and cp2utf_8. */
721 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
723 unicode_val_T u
= from
->highhalf
[c
- 0x80];
725 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
729 /* Used for converting input from the terminal. */
731 cp2u(int from
, unsigned char c
)
733 from
&= ~SYSTEM_CHARSET_FLAG
;
735 /* UTF-8 is a multibyte codepage and cannot be handled with
737 assert(!is_cp_ptr_utf8(&codepages
[from
]));
738 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
740 if (c
< 0x80) return c
;
741 else return cp2u_shared(&codepages
[from
], c
);
744 /* This slow and ugly code is used by the terminal utf_8_io */
745 const unsigned char *
746 cp2utf8(int from
, int c
)
748 from
&= ~SYSTEM_CHARSET_FLAG
;
750 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
753 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
#ifdef CONFIG_UTF8
/* Read one character of @codepage from *@string, not going past @end,
 * and advance *@string over it.  UTF-8 text is handed to
 * utf8_to_unicode(); for other codepages UCS_NO_CHAR is returned when
 * no byte is left. */
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
	unicode_val_T result;

	if (is_cp_utf8(codepage))
		return utf8_to_unicode(string, end);

	if (*string >= end)
		return UCS_NO_CHAR;

	result = cp2u(codepage, **string);
	++*string;
	return result;
}
#endif	/* CONFIG_UTF8 */
776 add_utf8(struct conv_table
*ct
, unicode_val_T u
, const unsigned char *str
)
778 unsigned char *p
= encode_utf8(u
);
781 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
783 struct conv_table
*nct
;
785 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
786 if_assert_failed
return;
788 nct
= mem_calloc(256, sizeof(*nct
));
790 new_translation_table(nct
);
798 assertm(!ct
[*p
].t
, "bad utf encoding #2");
799 if_assert_failed
return;
801 if (ct
[*p
].u
.str
== no_str
)
805 /* A conversion table from some charset to UTF-8.
806 * If it is from UTF-8 to UTF-8, it converts each byte separately.
807 * Unlike in other translation tables, the strings in elements 0x80 to
808 * 0xFF are allocated dynamically. */
809 struct conv_table utf_table
[256];
810 int utf_table_init
= 1;
817 /* Cast away const. */
818 for (i
= 128; i
< 256; i
++)
819 mem_free((unsigned char *) utf_table
[i
].u
.str
);
822 static struct conv_table
*
823 get_translation_table_to_utf8(int from
)
828 if (from
== -1) return NULL
;
829 from
&= ~SYSTEM_CHARSET_FLAG
;
830 if (from
== lfr
) return utf_table
;
832 if (utf_table_init
) {
833 memset(utf_table
, 0, sizeof(utf_table
));
838 for (i
= 0; i
< 128; i
++)
839 utf_table
[i
].u
.str
= strings
[i
];
841 if (is_cp_ptr_utf8(&codepages
[from
])) {
842 for (i
= 128; i
< 256; i
++)
843 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
847 for (i
= 128; i
< 256; i
++) {
848 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
851 utf_table
[i
].u
.str
= NULL
;
853 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
856 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
857 unicode_val_T u
= codepages
[from
].table
[i
].u
;
859 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
860 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
861 stracpy(encode_utf8(u
));
864 for (i
= 128; i
< 256; i
++)
865 if (!utf_table
[i
].u
.str
)
866 utf_table
[i
].u
.str
= stracpy(no_str
);
871 /* A conversion table between two charsets, where the target is not UTF-8. */
872 static struct conv_table table
[256];
873 static int first
= 1;
876 free_conv_table(void)
878 if (!utf_table_init
) free_utf_table();
880 memset(table
, 0, sizeof(table
));
883 new_translation_table(table
);
888 get_translation_table(int from
, int to
)
893 from
&= ~SYSTEM_CHARSET_FLAG
;
894 to
&= ~SYSTEM_CHARSET_FLAG
;
896 memset(table
, 0, sizeof(table
));
899 if (/*from == to ||*/ from
== -1 || to
== -1)
901 if (is_cp_ptr_utf8(&codepages
[to
]))
902 return get_translation_table_to_utf8(from
);
903 if (from
== lfr
&& to
== lto
)
907 new_translation_table(table
);
909 if (is_cp_ptr_utf8(&codepages
[from
])) {
912 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
913 add_utf8(table
, UCS_NO_BREAK_SPACE
, strings
[NBSP_CHAR
]);
914 add_utf8(table
, UCS_SOFT_HYPHEN
, "");
916 for (i
= 0x80; i
<= 0xFF; i
++)
917 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
919 codepages
[to
].highhalf
[i
- 0x80],
922 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
923 add_utf8(table
, codepages
[to
].table
[i
].u
,
924 strings
[codepages
[to
].table
[i
].c
]);
926 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
927 if (unicode_7b
[i
].x
>= 0x80)
928 add_utf8(table
, unicode_7b
[i
].x
,
934 for (i
= 128; i
< 256; i
++) {
935 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
936 const unsigned char *u
;
938 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
939 if (u
) table
[i
].u
.str
= u
;
/* Compare the @l2 bytes at @s1 with the NUL-terminated string @s2.
 * Returns 1 or -1 at the first differing byte, -1 if @s2 is longer
 * than @l2 bytes, and 0 if both are equal. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; s1++, s2++, l2--) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
961 /* Entity cache debugging purpose. */
963 #define DEBUG_ENTITY_CACHE
965 #undef DEBUG_ENTITY_CACHE
/* One cached result of get_entity_string(). */
struct entity_cache {
	unsigned int hits;
	int strlen;
	int encoding;
	const unsigned char *result;
	unsigned char str[20]; /* Suffice in any case. */
};

/* comparison function for qsort(): entries with more hits sort first */
static int
hits_cmp(const void *v1, const void *v2)
{
	const struct entity_cache *left = v1;
	const struct entity_cache *right = v2;

	if (left->hits > right->hits) return -1;
	if (left->hits < right->hits) return 1;
	return 0;
}
988 compare_entities(const void *key_
, const void *element_
)
990 struct string
*key
= (struct string
*) key_
;
991 struct entity
*element
= (struct entity
*) element_
;
992 int length
= key
->length
;
993 unsigned char *first
= key
->source
;
994 unsigned char *second
= element
->s
;
996 return xxstrcmp(first
, second
, length
);
999 const unsigned char *
1000 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
1002 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1003 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1004 will go in [0] table */
1005 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
1006 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
1007 static int first_time
= 1;
1008 unsigned int slen
= 0;
1009 const unsigned char *result
= NULL
;
1011 if (strlen
<= 0) return NULL
;
1014 /* TODO: caching UTF-8 */
1015 encoding
&= ~SYSTEM_CHARSET_FLAG
;
1016 if (is_cp_ptr_utf8(&codepages
[encoding
]))
1018 #endif /* CONFIG_UTF8 */
1021 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
1025 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1026 * + google + slashdot + websites that result from a search for test on google,
 * + various ones) show quite impressive improvement:
1029 * 0: hits=2459 l=4 st='nbsp'
1030 * 1: hits=2152 l=6 st='eacute'
1031 * 2: hits=235 l=6 st='egrave'
1032 * 3: hits=136 l=6 st='agrave'
1033 * 4: hits=100 l=3 st='amp'
1034 * 5: hits=40 l=5 st='laquo'
1035 * 6: hits=8 l=4 st='copy'
1036 * 7: hits=5 l=2 st='gt'
1037 * 8: hits=2 l=2 st='lt'
1038 * 9: hits=1 l=6 st='middot'
1040 * Most of the time cache hit ratio is near 95%.
1042 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1043 * count is kept < 2 (worst case 1.58). Not so bad ;)
1047 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1048 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1050 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1053 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1054 if (entity_cache
[slen
][i
].encoding
== encoding
1055 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1056 #ifdef DEBUG_ENTITY_CACHE
1057 static double total_iter
= 0;
1058 static unsigned long hit_count
= 0;
1060 total_iter
+= i
+ 1;
1062 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1064 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1065 entity_cache
[slen
][i
].hits
++;
1066 return entity_cache
[slen
][i
].result
;
1069 #ifdef DEBUG_ENTITY_CACHE
1070 fprintf(stderr
, "miss\n");
1075 #endif /* CONFIG_UTF8 */
1076 if (*str
== '#') { /* Numeric entity. */
1077 int l
= (int) strlen
;
1078 unsigned char *st
= (unsigned char *) str
;
1079 unicode_val_T n
= 0;
1081 if (l
== 1) goto end
; /* &#; ? */
1083 if ((*st
| 32) == 'x') { /* Hexadecimal */
1085 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1088 unsigned char c
= (*(st
++) | 32);
1091 n
= (n
<< 4) | (c
- '0');
1092 else if (isxdigit(c
))
1093 n
= (n
<< 4) | (c
- 'a' + 10);
1095 goto end
; /* Bad char. */
1097 } else { /* Decimal */
1098 if (l
> 10) goto end
; /* 4294967295 max. */
1100 unsigned char c
= *(st
++);
1103 n
= n
* 10 + c
- '0';
1105 goto end
; /* Bad char. */
1106 /* Limit to 0xFFFFFFFF. */
1107 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1112 result
= u2cp(n
, encoding
);
1114 #ifdef DEBUG_ENTITY_CACHE
1115 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1117 } else { /* Text entity. */
1118 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1119 struct entity
*element
= bsearch((void *) &key
, entities
,
1124 if (element
) result
= u2cp(element
->c
, encoding
);
1128 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1131 #endif /* CONFIG_UTF8 */
1133 /* Take care of potential buffer overflow. */
1134 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1135 struct entity_cache
*ece
;
1137 /* Sort entries by hit order. */
1138 if (nb_entity_cache
[slen
] > 1)
1139 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1140 sizeof(entity_cache
[slen
][0]), hits_cmp
);
1142 /* Increment number of cache entries if possible.
1143 * Else, just replace the least used entry. */
1144 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1145 ece
= &entity_cache
[slen
][nb_entity_cache
[slen
] - 1];
1147 /* Copy new entry to cache. */
1149 ece
->strlen
= strlen
;
1150 ece
->encoding
= encoding
;
1151 ece
->result
= result
;
1152 memcpy(ece
->str
, str
, strlen
);
1153 ece
->str
[strlen
] = '\0';
1156 #ifdef DEBUG_ENTITY_CACHE
1157 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1158 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1163 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1164 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1165 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1166 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1167 entity_cache
[slen
][i
].str
);
1168 fprintf(stderr
, "-----------------\n");
1170 #endif /* DEBUG_ENTITY_CACHE */
1176 convert_string(struct conv_table
*convert_table
,
1177 unsigned char *chars
, int charslen
, int cp
,
1178 enum convert_string_mode mode
, int *length
,
1179 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1180 void *callback_data
)
1182 unsigned char *buffer
;
1186 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1188 if (charslen
) callback(callback_data
, chars
, charslen
);
1191 return memacpy(chars
, charslen
);
1195 /* Buffer allocation */
1197 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1198 if (!buffer
) return NULL
;
1202 while (charspos
< charslen
) {
1203 const unsigned char *translit
;
1206 buffer[bufferpos++] = chars[charspos++]; \
1211 if (chars
[charspos
] != '&') {
1212 struct conv_table
*t
;
1215 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1220 while (t
[chars
[i
]].t
) {
1221 t
= t
[chars
[i
++]].u
.tbl
;
1222 if (i
>= charslen
) PUTC
;
1225 translit
= t
[chars
[i
]].u
.str
;
1228 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1232 int start
= charspos
+ 1;
1236 && (isasciialpha(chars
[i
])
1237 || isdigit(chars
[i
])
1238 || (chars
[i
] == '#')))
1241 /* This prevents bug 213: we were expanding "entities"
1242 * in URL query strings. */
/* XXX: But this disables &nbsp&nbsp usage, which
 * appears to be relatively common! --pasky */
1245 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1247 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1248 translit
= get_entity_string(&chars
[start
], i
- start
,
1250 if (chars
[i
] != ';') {
/* Eat &nbsp &nbsp<foo> happily, but
1252 * pull back from the character after
1253 * entity string if it is not the valid
1258 if (!translit
) PUTC
;
1259 charspos
= i
+ (i
< charslen
);
1263 if (!translit
[0]) continue;
1266 buffer
[bufferpos
++] = translit
[0];
1274 buffer
[bufferpos
++] = *(translit
++);
1276 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1279 buffer
[bufferpos
] = 0;
1280 callback(callback_data
, buffer
, bufferpos
);
1283 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1296 buffer
[bufferpos
] = 0;
1297 if (length
) *length
= bufferpos
;
1300 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1309 #ifndef USE_FASTFIND
1311 get_cp_index(const unsigned char *name
)
1316 if (!strcasecmp(name
, "System")) {
1317 #if HAVE_LANGINFO_CODESET
1318 name
= nl_langinfo(CODESET
);
1319 syscp
= SYSTEM_CHARSET_FLAG
;
1325 for (i
= 0; codepages
[i
].name
; i
++) {
1326 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1327 /* In the past, we looked for the longest substring
1328 * in all the names; it is way too expensive, though:
1330 * % cumulative self self total
1331 * time seconds seconds calls us/call us/call name
1332 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1334 * Anything called from redraw_screen() is in fact
1335 * relatively expensive, even if it's called just
1336 * once. So we will do a simple strcasecmp() here.
1339 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1345 return get_cp_index("us-ascii") | syscp
;
1353 static unsigned int i_name
= 0;
1354 static unsigned int i_alias
= 0;
1356 /* Reset internal list pointer */
1358 charsets_list_reset(void)
1364 /* Returns a pointer to a struct that contains current key and data pointers
1365 * and increment internal pointer. It returns NULL when key is NULL. */
1366 struct fastfind_key_value
*
1367 charsets_list_next(void)
1369 static struct fastfind_key_value kv
;
1371 if (!codepages
[i_name
].name
) return NULL
;
1373 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1374 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
1376 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1386 static struct fastfind_index ff_charsets_index
1387 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
/* Searches for a charset named @name or one of its aliases and
 * returns its index, or -1 if not found. */
1392 get_cp_index(const unsigned char *name
)
1394 const struct codepage_desc
*codepage
;
1397 if (!strcasecmp(name
, "System")) {
1398 #if HAVE_LANGINFO_CODESET
1399 name
= nl_langinfo(CODESET
);
1400 syscp
= SYSTEM_CHARSET_FLAG
;
1406 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1408 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1409 return (codepage
- codepages
) | syscp
;
1412 return get_cp_index("us-ascii") | syscp
;
1419 #endif /* USE_FASTFIND */
/* Build the fastfind index over charset names and aliases.  Does
 * nothing unless USE_FASTFIND is defined. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Release the fastfind index over charset names and aliases.  Does
 * nothing unless USE_FASTFIND is defined. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_charsets_index);
#endif
}
1437 /* Get the codepage's name for displaying to the user, or NULL if
1438 * @cp_index is one past the end. In the future, we might want to
1439 * localize these with gettext. So it may be best not to use this
1440 * function if the name will have to be converted back to an
1443 get_cp_name(int cp_index
)
1445 if (cp_index
< 0) return "none";
1446 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1448 return codepages
[cp_index
].name
;
1451 /* Get the codepage's name for saving to a configuration file. These
1452 * names can be converted back to indexes, even in future versions of
1455 get_cp_config_name(int cp_index
)
1457 if (cp_index
< 0) return "none";
1458 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1459 if (!codepages
[cp_index
].aliases
) return NULL
;
1461 return codepages
[cp_index
].aliases
[0];
1464 /* Get the codepage's name for sending to a library or server that
1465 * understands MIME charset names. This function irreversibly maps
1466 * the "System" codepage to the underlying charset. */
1468 get_cp_mime_name(int cp_index
)
1470 if (cp_index
< 0) return "none";
1471 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1472 if (!codepages
[cp_index
].aliases
) return NULL
;
1474 return codepages
[cp_index
].aliases
[0];
1478 is_cp_utf8(int cp_index
)
1480 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1481 return is_cp_ptr_utf8(&codepages
[cp_index
]);