1 /* Charsets convertor */
4 #define _GNU_SOURCE /* strcasecmp() */
11 #if HAVE_LANGINFO_CODESET
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memory.h"
29 #include "util/string.h"
32 /* Fix namespace clash on MacOS. */
33 #define table table_elinks
37 /* This should in principle be unicode_val_T, but because all
38 * the values currently in codepage.inc fit in 16 bits, we can
39 * as well use uint16_t and halve sizeof(struct table_entry)
40 * from 8 bytes to 4. Should other characters ever be needed,
41 * unicode_val_T u : 24 might be a possibility, although it
42 * seems a little unportable as bitfields are in principle
43 * restricted to int, which may be 16-bit. */
47 struct codepage_desc
{
49 unsigned char *const *aliases
;
51 /* The Unicode mappings of codepage bytes 0x80...0xFF.
52 * (0x00...0x7F are assumed to be ASCII in all codepages.)
53 * Because all current values fit in 16 bits, we store them as
54 * uint16_t rather than unicode_val_T. If the codepage does
55 * not use some byte, then @highhalf maps that byte to 0xFFFF,
56 * which C code converts to UCS_REPLACEMENT_CHARACTER where
57 * appropriate. (U+FFFF is reserved and will never be
58 * assigned as a character.) */
59 const uint16_t *highhalf
;
61 /* If some byte in the codepage corresponds to multiple Unicode
62 * characters, then the preferred character is in @highhalf
63 * above, and the rest are listed here in @table. This table
64 * is not used for translating from the codepage to Unicode. */
65 const struct table_entry
*table
;
68 #include "intl/codepage.inc"
69 #include "intl/uni_7b.inc"
70 #include "intl/entity.inc"
73 static char strings
[256][2] = {
74 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
75 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
76 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
77 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
78 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
79 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
80 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
81 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
82 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
83 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
84 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
85 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
86 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
87 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
88 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
89 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
90 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
91 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
92 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
93 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
94 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
95 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
96 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
97 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
98 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
99 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
100 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
101 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
102 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
103 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
104 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
105 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
109 free_translation_table(struct conv_table
*p
)
113 for (i
= 0; i
< 256; i
++)
115 free_translation_table(p
[i
].u
.tbl
);
120 static unsigned char *no_str
= "*";
123 new_translation_table(struct conv_table
*p
)
127 for (i
= 0; i
< 256; i
++)
129 free_translation_table(p
[i
].u
.tbl
);
130 for (i
= 0; i
< 128; i
++) {
132 p
[i
].u
.str
= strings
[i
];
134 for (; i
< 256; i
++) {
140 #define BIN_SEARCH(table, entry, entries, key, result) \
142 long _s = 0, _e = (entries) - 1; \
144 while (_s <= _e || !((result) = -1)) { \
145 long _m = (_s + _e) / 2; \
147 if ((table)[_m].entry == (key)) { \
151 if ((table)[_m].entry > (key)) _e = _m - 1; \
152 if ((table)[_m].entry < (key)) _s = _m + 1; \
156 static const unicode_val_T strange_chars[32] = {
157 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
158 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
159 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
160 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Flag bit OR-ed into a codepage index by get_cp_index() when the charset
 * was requested as "System". Callers mask it off with
 * ~SYSTEM_CHARSET_FLAG before using the value to index codepages[]. */
163 #define SYSTEM_CHARSET_FLAG 128
/* Nonzero when @cp_ptr is the UTF-8 codepage descriptor, detected by
 * comparing its alias list pointer with aliases_utf8 (not defined in the
 * visible part of this file -- presumably it comes from codepage.inc). */
164 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
167 u2cp_(unicode_val_T u
, int to
, enum nbsp_mode nbsp_mode
)
172 if (u
< 128) return strings
[u
];
174 to
&= ~SYSTEM_CHARSET_FLAG
;
177 if (is_cp_ptr_utf8(&codepages
[to
]))
178 return encode_utf8(u
);
179 #endif /* CONFIG_UTF8 */
181 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
183 if (nbsp_mode
== NBSP_MODE_HACK
) return NBSP_CHAR_STRING
;
184 else /* NBSP_MODE_ASCII */ return " ";
186 if (u
== 0xad) return "";
189 unicode_val_T strange
= strange_chars
[u
- 0x80];
191 if (!strange
) return NULL
;
192 return u2cp_(strange
, to
, nbsp_mode
);
196 for (j
= 0; j
< 0x80; j
++)
197 if (codepages
[to
].highhalf
[j
] == u
)
198 return strings
[0x80 + j
];
199 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
200 if (codepages
[to
].table
[j
].u
== u
)
201 return strings
[codepages
[to
].table
[j
].c
];
203 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
204 if (s
!= -1) return unicode_7b
[s
].s
;
209 static unsigned char utf_buffer
[7];
212 inline unsigned char *
213 encode_utf8(unicode_val_T u
)
215 static unsigned char *
216 encode_utf8(unicode_val_T u
)
217 #endif /* CONFIG_UTF8 */
219 memset(utf_buffer
, 0, 7);
224 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
225 utf_buffer
[1] = 0x80 | (u
& 0x3f);
226 else if (u
< 0x10000)
227 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
228 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
229 utf_buffer
[2] = 0x80 | (u
& 0x3f);
230 else if (u
< 0x200000)
231 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
232 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
233 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
234 utf_buffer
[3] = 0x80 | (u
& 0x3f);
235 else if (u
< 0x4000000)
236 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
237 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
238 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
239 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
240 utf_buffer
[4] = 0x80 | (u
& 0x3f);
241 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
242 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
243 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
244 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
245 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
246 utf_buffer
[5] = 0x80 | (u
& 0x3f);
252 /* Length in bytes of a UTF-8 character, indexed by its first byte. Illegal
253 * first bytes are given length one and are handled differently. */
254 static char utf8char_len_tab
[256] = {
255 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
256 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
257 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
258 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
259 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
260 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
261 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
262 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
265 inline int utf8charlen(const unsigned char *p
)
267 return p
? utf8char_len_tab
[*p
] : 0;
271 strlen_utf8(unsigned char **str
)
273 unsigned char *s
= *str
;
274 unsigned char *end
= strchr(s
, '\0');
278 for (x
= 0;; x
++, s
+= len
) {
279 len
= utf8charlen(s
);
280 if (s
+ len
> end
) break;
/* True when byte @p has its high bit clear, i.e. it is a complete one-byte
 * (ASCII) UTF-8 character. */
286 #define utf8_issingle(p) (((p) & 0x80) == 0)
/* True when byte @p can begin a character: either a single-byte character
 * or a byte whose top two bits are both set. Bytes of the form 10xxxxxx
 * (continuation bytes) yield false. Note that @p may be evaluated twice. */
287 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
289 /* Start from @current and move back @pos characters. Return the resulting
290 * pointer. The leftmost pointer allowed is @start. */
291 inline unsigned char *
292 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
294 if (current
== NULL
|| start
== NULL
|| pos
< 0)
296 while (pos
> 0 && current
!= start
) {
298 if (utf8_islead(*current
))
304 /* Count number of standard terminal cells needed for displaying UTF-8
307 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
312 end
= strchr(utf8_char
, '\0');
314 if(!utf8_char
|| !end
)
317 u
= utf8_to_unicode(&utf8_char
, end
);
319 return unicode_to_cell(u
);
322 /* Count number of standard terminal cells needed for displaying string
323 * with UTF-8 characters. */
325 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
327 int charlen
, cell
, cells
= 0;
330 end
= strchr(string
, '\0');
336 charlen
= utf8charlen(string
);
337 if (string
+ charlen
> end
)
340 cell
= utf8_char2cells(string
, end
);
351 /* Count number of characters in string. */
353 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
355 int charlen
, chars
= 0;
358 end
= strchr(string
, '\0');
364 charlen
= utf8charlen(string
);
365 if (string
+ charlen
> end
)
376 * Count the number of bytes from the beginning of the string needed for displaying
377 * specified number of cells.
380 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
382 unsigned int bytes
= 0, cells
= 0;
384 assert(max_cells
>=0);
387 end
= strchr(string
, '\0');
393 int cell
= utf8_char2cells(&string
[bytes
], end
);
398 if (cells
> max_cells
)
401 bytes
+= utf8charlen(&string
[bytes
]);
403 if (string
+ bytes
> end
) {
404 bytes
= end
- string
;
412 /* Take @max steps forward from @string in the specified @way, but
413 * not going past @end. Return the resulting address. Store the
414 * number of steps taken to *@count, unless @count is NULL.
416 * This assumes the text is valid UTF-8, and @string and @end point to
417 * character boundaries. If not, it doesn't crash but the results may
420 * This function can do some of the same jobs as utf8charlen(),
421 * utf8_cells2bytes(), and strlen_utf8(). */
423 utf8_step_forward(unsigned char *string
, unsigned char *end
,
424 int max
, enum utf8_step way
, int *count
)
427 unsigned char *current
= string
;
431 if_assert_failed
goto invalid_arg
;
433 end
= strchr(string
, '\0');
436 case utf8_step_characters
:
437 while (steps
< max
&& current
< end
) {
439 if (utf8_islead(*current
))
444 case utf8_step_cells_fewer
:
445 case utf8_step_cells_more
:
446 while (steps
< max
) {
448 unsigned char *prev
= current
;
451 u
= utf8_to_unicode(¤t
, end
);
452 if (u
== UCS_NO_CHAR
) {
453 /* Assume the incomplete sequence
460 width
= unicode_to_cell(u
);
461 if (way
== utf8_step_cells_fewer
462 && steps
+ width
> max
) {
472 INTERNAL("impossible enum utf8_step");
481 /* Take @max steps backward from @string in the specified @way, but
482 * not going past @start. Return the resulting address. Store the
483 * number of steps taken to *@count, unless @count is NULL.
485 * This assumes the text is valid UTF-8, and @string and @start point
486 * to character boundaries. If not, it doesn't crash but the results
487 * may be inconsistent.
489 * This function can do some of the same jobs as utf8_prevchar(). */
491 utf8_step_backward(unsigned char *string
, unsigned char *start
,
492 int max
, enum utf8_step way
, int *count
)
495 unsigned char *current
= string
;
500 if_assert_failed
goto invalid_arg
;
503 case utf8_step_characters
:
504 while (steps
< max
&& current
> start
) {
506 if (utf8_islead(*current
))
511 case utf8_step_cells_fewer
:
512 case utf8_step_cells_more
:
513 while (steps
< max
) {
514 unsigned char *prev
= current
;
519 if (current
<= start
)
523 } while (current
> start
&& !utf8_islead(*current
));
526 u
= utf8_to_unicode(&look
, prev
);
527 if (u
== UCS_NO_CHAR
) {
528 /* Assume the incomplete sequence
532 width
= unicode_to_cell(u
);
534 if (way
== utf8_step_cells_fewer
535 && steps
+ width
> max
) {
545 INTERNAL("impossible enum utf8_step");
555 * Find out the number of standard terminal columns needed for displaying the symbol
556 * (glyph) which represents Unicode character c.
558 * TODO: Use wcwidth when it is available. This seems to require:
559 * - Make the configure script check whether <wchar.h> and wcwidth exist.
560 * - Define _XOPEN_SOURCE and include <wchar.h>.
561 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
562 * matches ISO 10646 in all locales.)
563 * However, these do not suffice, because wcwidth depends on LC_CTYPE
564 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
565 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
566 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
567 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
568 * character is apparently not supported in all locales. Why is that?
569 * - Perhaps there is standardese that requires supported characters
570 * to be convertible to multibyte form. Then ELinks could just pick
571 * some UTF-8 locale for its wcwidth purposes.
572 * - Perhaps wcwidth can even return different nonnegative values for
573 * the same ISO 10646 character in different locales. Then ELinks
574 * would have to set LC_CTYPE to match at least the terminal's
575 * charset (which may differ from the LC_CTYPE environment variable,
576 * especially when the master process is serving a slave terminal).
577 * But there is no guarantee that the libc supports all the same
578 * charsets as ELinks does.
579 * For now, it seems safest to avoid the potentially locale-dependent
580 * libc version of wcwidth, and instead use a hardcoded mapping.
582 * @return 2 for double-width glyph, 1 for others.
583 * TODO: May be extended to return 0 for zero-width glyphs
584 * (like composing, maybe unprintable too).
587 unicode_to_cell(unicode_val_T c
)
590 && (c
<= 0x115f /* Hangul Jamo */
593 || (c
>= 0x2e80 && c
<= 0xa4cf
594 && c
!= 0x303f) /* CJK ... Yi */
595 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
596 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
598 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
599 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
600 || (c
>= 0xffe0 && c
<= 0xffe6)
601 || (c
>= 0x20000 && c
<= 0x2fffd)
602 || (c
>= 0x30000 && c
<= 0x3fffd)))
608 /* Fold the case of a Unicode character, so that hotkeys in labels can
609 * be compared case-insensitively. It is unspecified whether the
610 * result will be in upper or lower case. */
612 unicode_fold_label_case(unicode_val_T c
)
614 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
616 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
617 /* For now, this supports only ASCII. It would be possible to
618 * use code generated from CaseFolding.txt of Unicode if the
619 * acknowledgements required by http://www.unicode.org/copyright.html
620 * were added to associated documentation of ELinks. */
621 if (c
>= 0x41 && c
<= 0x5A)
625 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
629 utf8_to_unicode(unsigned char **string
, unsigned char *end
)
631 unsigned char *str
= *string
;
635 length
= utf8char_len_tab
[str
[0]];
637 if (str
+ length
> end
) {
646 u
= (str
[0] & 0x1f) << 6;
647 u
+= (str
[1] & 0x3f);
650 u
= (str
[0] & 0x0f) << 12;
651 u
+= ((str
[1] & 0x3f) << 6);
652 u
+= (str
[2] & 0x3f);
655 u
= (str
[0] & 0x0f) << 18;
656 u
+= ((str
[1] & 0x3f) << 12);
657 u
+= ((str
[2] & 0x3f) << 6);
658 u
+= (str
[3] & 0x3f);
661 u
= (str
[0] & 0x0f) << 24;
662 u
+= ((str
[1] & 0x3f) << 18);
663 u
+= ((str
[2] & 0x3f) << 12);
664 u
+= ((str
[3] & 0x3f) << 6);
665 u
+= (str
[4] & 0x3f);
669 u
= (str
[0] & 0x01) << 30;
670 u
+= ((str
[1] & 0x3f) << 24);
671 u
+= ((str
[2] & 0x3f) << 18);
672 u
+= ((str
[3] & 0x3f) << 12);
673 u
+= ((str
[4] & 0x3f) << 6);
674 u
+= (str
[5] & 0x3f);
677 *string
= str
+ length
;
680 #endif /* CONFIG_UTF8 */
682 /* The common part of cp2u and cp2utf_8. */
684 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
686 unicode_val_T u
= from
->highhalf
[c
- 0x80];
688 if (u
== 0xFFFF) u
= UCS_REPLACEMENT_CHARACTER
;
692 /* Used for converting input from the terminal. */
694 cp2u(int from
, unsigned char c
)
696 from
&= ~SYSTEM_CHARSET_FLAG
;
698 /* UTF-8 is a multibyte codepage and cannot be handled with
700 assert(!is_cp_ptr_utf8(&codepages
[from
]));
701 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
703 if (c
< 0x80) return c
;
704 else return cp2u_shared(&codepages
[from
], c
);
707 /* This slow and ugly code is used by the terminal utf_8_io */
709 cp2utf8(int from
, int c
)
711 from
&= ~SYSTEM_CHARSET_FLAG
;
713 if (is_cp_ptr_utf8(&codepages
[from
]) || c
< 128)
716 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
721 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
725 if (is_cp_utf8(codepage
))
726 return utf8_to_unicode(string
, end
);
731 ret
= cp2u(codepage
, **string
);
735 #endif /* CONFIG_UTF8 */
739 add_utf8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
741 unsigned char *p
= encode_utf8(u
);
744 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
746 struct conv_table
*nct
;
748 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
749 if_assert_failed
return;
751 nct
= mem_calloc(256, sizeof(*nct
));
753 new_translation_table(nct
);
761 assertm(!ct
[*p
].t
, "bad utf encoding #2");
762 if_assert_failed
return;
764 if (ct
[*p
].u
.str
== no_str
)
768 struct conv_table utf_table
[256];
769 int utf_table_init
= 1;
776 for (i
= 128; i
< 256; i
++)
777 mem_free(utf_table
[i
].u
.str
);
780 static struct conv_table
*
781 get_translation_table_to_utf8(int from
)
786 if (from
== -1) return NULL
;
787 from
&= ~SYSTEM_CHARSET_FLAG
;
788 if (from
== lfr
) return utf_table
;
791 memset(utf_table
, 0, sizeof(utf_table
)),
796 for (i
= 0; i
< 128; i
++)
797 utf_table
[i
].u
.str
= strings
[i
];
799 if (is_cp_ptr_utf8(&codepages
[from
])) {
800 for (i
= 128; i
< 256; i
++)
801 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
805 for (i
= 128; i
< 256; i
++) {
806 unicode_val_T u
= codepages
[from
].highhalf
[i
- 0x80];
809 utf_table
[i
].u
.str
= NULL
;
811 utf_table
[i
].u
.str
= stracpy(encode_utf8(u
));
814 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
815 unicode_val_T u
= codepages
[from
].table
[i
].u
;
817 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
818 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
819 stracpy(encode_utf8(u
));
822 for (i
= 128; i
< 256; i
++)
823 if (!utf_table
[i
].u
.str
)
824 utf_table
[i
].u
.str
= stracpy(no_str
);
829 struct conv_table table
[256];
830 static int first
= 1;
833 free_conv_table(void)
835 if (!utf_table_init
) free_utf_table();
837 memset(table
, 0, sizeof(table
));
840 new_translation_table(table
);
845 get_translation_table(int from
, int to
)
850 from
&= ~SYSTEM_CHARSET_FLAG
;
851 to
&= ~SYSTEM_CHARSET_FLAG
;
853 memset(table
, 0, sizeof(table
));
856 if (/*from == to ||*/ from
== -1 || to
== -1)
858 if (is_cp_ptr_utf8(&codepages
[to
]))
859 return get_translation_table_to_utf8(from
);
860 if (from
== lfr
&& to
== lto
)
864 new_translation_table(table
);
866 if (is_cp_ptr_utf8(&codepages
[from
])) {
869 for (i
= 0x80; i
<= 0xFF; i
++)
870 if (codepages
[to
].highhalf
[i
- 0x80] != 0xFFFF)
872 codepages
[to
].highhalf
[i
- 0x80],
875 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
876 add_utf8(table
, codepages
[to
].table
[i
].u
,
877 strings
[codepages
[to
].table
[i
].c
]);
879 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
880 if (unicode_7b
[i
].x
>= 0x80)
881 add_utf8(table
, unicode_7b
[i
].x
,
887 for (i
= 128; i
< 256; i
++) {
888 if (codepages
[from
].highhalf
[i
- 0x80] != 0xFFFF) {
891 u
= u2cp(codepages
[from
].highhalf
[i
- 0x80], to
);
892 if (u
) table
[i
].u
.str
= u
;
901 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
904 if (*s1
> *s2
) return 1;
905 if (*s1
< *s2
) return -1;
914 /* For entity cache debugging purposes. */
916 #define DEBUG_ENTITY_CACHE
918 #undef DEBUG_ENTITY_CACHE
921 struct entity_cache
{
925 unsigned char *result
;
926 unsigned char str
[20]; /* Suffice in any case. */
930 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
932 if (a
->hits
== b
->hits
) return 0;
933 if (a
->hits
> b
->hits
) return -1;
938 compare_entities(const void *key_
, const void *element_
)
940 struct string
*key
= (struct string
*) key_
;
941 struct entity
*element
= (struct entity
*) element_
;
942 int length
= key
->length
;
943 unsigned char *first
= key
->source
;
944 unsigned char *second
= element
->s
;
946 return xxstrcmp(first
, second
, length
);
950 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
952 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
953 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
954 will go in [0] table */
955 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
956 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
957 static int first_time
= 1;
958 unsigned int slen
= 0;
959 unsigned char *result
= NULL
;
961 if (strlen
<= 0) return NULL
;
964 /* TODO: caching UTF-8 */
965 encoding
&= ~SYSTEM_CHARSET_FLAG
;
966 if (is_cp_ptr_utf8(&codepages
[encoding
]))
968 #endif /* CONFIG_UTF8 */
971 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
975 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
976 * + google + slashdot + websites that result from a search for test on google,
977 * + various ones) shows quite an impressive improvement:
979 * 0: hits=2459 l=4 st='nbsp'
980 * 1: hits=2152 l=6 st='eacute'
981 * 2: hits=235 l=6 st='egrave'
982 * 3: hits=136 l=6 st='agrave'
983 * 4: hits=100 l=3 st='amp'
984 * 5: hits=40 l=5 st='laquo'
985 * 6: hits=8 l=4 st='copy'
986 * 7: hits=5 l=2 st='gt'
987 * 8: hits=2 l=2 st='lt'
988 * 9: hits=1 l=6 st='middot'
990 * Most of the time cache hit ratio is near 95%.
992 * A long test shows: 15186 hits vs. 24 misses and mean iteration
993 * count is kept < 2 (worst case 1.58). Not so bad ;)
997 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
998 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
1000 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
1003 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
1004 if (entity_cache
[slen
][i
].encoding
== encoding
1005 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
1006 #ifdef DEBUG_ENTITY_CACHE
1007 static double total_iter
= 0;
1008 static unsigned long hit_count
= 0;
1010 total_iter
+= i
+ 1;
1012 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
1014 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
1015 entity_cache
[slen
][i
].hits
++;
1016 return entity_cache
[slen
][i
].result
;
1019 #ifdef DEBUG_ENTITY_CACHE
1020 fprintf(stderr
, "miss\n");
1025 #endif /* CONFIG_UTF8 */
1026 if (*str
== '#') { /* Numeric entity. */
1027 int l
= (int) strlen
;
1028 unsigned char *st
= (unsigned char *) str
;
1029 unicode_val_T n
= 0;
1031 if (l
== 1) goto end
; /* &#; ? */
1033 if ((*st
| 32) == 'x') { /* Hexadecimal */
1035 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
1038 unsigned char c
= (*(st
++) | 32);
1041 n
= (n
<< 4) | (c
- '0');
1042 else if (isxdigit(c
))
1043 n
= (n
<< 4) | (c
- 'a' + 10);
1045 goto end
; /* Bad char. */
1047 } else { /* Decimal */
1048 if (l
> 10) goto end
; /* 4294967295 max. */
1050 unsigned char c
= *(st
++);
1053 n
= n
* 10 + c
- '0';
1055 goto end
; /* Bad char. */
1056 /* Limit to 0xFFFFFFFF. */
1057 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1062 result
= u2cp(n
, encoding
);
1064 #ifdef DEBUG_ENTITY_CACHE
1065 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1067 } else { /* Text entity. */
1068 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1069 struct entity
*element
= bsearch((void *) &key
, entities
,
1074 if (element
) result
= u2cp(element
->c
, encoding
);
1078 if (is_cp_ptr_utf8(&codepages
[encoding
])) {
1081 #endif /* CONFIG_UTF8 */
1083 /* Take care of potential buffer overflow. */
1084 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1085 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
1087 /* Copy new entry to cache. */
1089 ece
->strlen
= strlen
;
1090 ece
->encoding
= encoding
;
1091 ece
->result
= result
;
1092 memcpy(ece
->str
, str
, strlen
);
1093 ece
->str
[strlen
] = '\0';
1095 /* Increment number of cache entries if possible. */
1096 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1098 #ifdef DEBUG_ENTITY_CACHE
1099 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1100 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1104 /* Sort entries by hit order. */
1105 if (nb_entity_cache
[slen
] > 1)
1106 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1107 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
1109 #ifdef DEBUG_ENTITY_CACHE
1113 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1114 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1115 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1116 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1117 entity_cache
[slen
][i
].str
);
1118 fprintf(stderr
, "-----------------\n");
1126 convert_string(struct conv_table
*convert_table
,
1127 unsigned char *chars
, int charslen
, int cp
,
1128 enum convert_string_mode mode
, int *length
,
1129 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1130 void *callback_data
)
1132 unsigned char *buffer
;
1136 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1138 if (charslen
) callback(callback_data
, chars
, charslen
);
1141 return memacpy(chars
, charslen
);
1145 /* Buffer allocation */
1147 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1148 if (!buffer
) return NULL
;
1152 while (charspos
< charslen
) {
1153 unsigned char *translit
;
1156 buffer[bufferpos++] = chars[charspos++]; \
1161 if (chars
[charspos
] != '&') {
1162 struct conv_table
*t
;
1165 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1170 while (t
[chars
[i
]].t
) {
1171 t
= t
[chars
[i
++]].u
.tbl
;
1172 if (i
>= charslen
) PUTC
;
1175 translit
= t
[chars
[i
]].u
.str
;
1178 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1182 int start
= charspos
+ 1;
1186 && (isasciialpha(chars
[i
])
1187 || isdigit(chars
[i
])
1188 || (chars
[i
] == '#')))
1191 /* This prevents bug 213: we were expanding "entities"
1192 * in URL query strings. */
1193 /* XXX: But this disables &nbsp&nbsp usage, which
1194 * appears to be relatively common! --pasky */
1195 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1197 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1198 translit
= get_entity_string(&chars
[start
], i
- start
,
1200 if (chars
[i
] != ';') {
1201 /* Eat &nbsp &nbsp<foo> happily, but
1202 * pull back from the character after
1203 * entity string if it is not the valid
1208 if (!translit
) PUTC
;
1209 charspos
= i
+ (i
< charslen
);
1213 if (!translit
[0]) continue;
1216 buffer
[bufferpos
++] = translit
[0];
1224 buffer
[bufferpos
++] = *(translit
++);
1226 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1229 buffer
[bufferpos
] = 0;
1230 callback(callback_data
, buffer
, bufferpos
);
1233 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1246 buffer
[bufferpos
] = 0;
1247 if (length
) *length
= bufferpos
;
1250 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1259 #ifndef USE_FASTFIND
1261 get_cp_index(unsigned char *name
)
1266 if (!strcasecmp(name
, "System")) {
1267 #if HAVE_LANGINFO_CODESET
1268 name
= nl_langinfo(CODESET
);
1269 syscp
= SYSTEM_CHARSET_FLAG
;
1275 for (i
= 0; codepages
[i
].name
; i
++) {
1276 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1277 /* In the past, we looked for the longest substring
1278 * in all the names; it is way too expensive, though:
1280 * % cumulative self self total
1281 * time seconds seconds calls us/call us/call name
1282 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1284 * Anything called from redraw_screen() is in fact
1285 * relatively expensive, even if it's called just
1286 * once. So we will do a simple strcasecmp() here.
1289 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1295 return get_cp_index("us-ascii") | syscp
;
1303 static unsigned int i_name
= 0;
1304 static unsigned int i_alias
= 0;
1306 /* Reset internal list pointer */
1308 charsets_list_reset(void)
1314 /* Returns a pointer to a struct that contains current key and data pointers
1315 * and increments the internal pointer. It returns NULL when key is NULL. */
1316 struct fastfind_key_value
*
1317 charsets_list_next(void)
1319 static struct fastfind_key_value kv
;
1321 if (!codepages
[i_name
].name
) return NULL
;
1323 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1324 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
1326 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1336 static struct fastfind_index ff_charsets_index
1337 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1339 /* Searches for a charset named @name or one of its aliases and
1340 * returns its index, or -1 if not found. */
1342 get_cp_index(unsigned char *name
)
1344 const struct codepage_desc
*codepage
;
1347 if (!strcasecmp(name
, "System")) {
1348 #if HAVE_LANGINFO_CODESET
1349 name
= nl_langinfo(CODESET
);
1350 syscp
= SYSTEM_CHARSET_FLAG
;
1356 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1358 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1359 return (codepage
- codepages
) | syscp
;
1362 return get_cp_index("us-ascii") | syscp
;
1369 #endif /* USE_FASTFIND */
1372 init_charsets_lookup(void)
1375 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1380 free_charsets_lookup(void)
1383 fastfind_done(&ff_charsets_index
);
1388 get_cp_name(int cp_index
)
1390 if (cp_index
< 0) return "none";
1391 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1393 return codepages
[cp_index
].name
;
1397 get_cp_mime_name(int cp_index
)
1399 if (cp_index
< 0) return "none";
1400 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1401 if (!codepages
[cp_index
].aliases
) return NULL
;
1403 return codepages
[cp_index
].aliases
[0];
1407 is_cp_utf8(int cp_index
)
1409 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1410 return is_cp_ptr_utf8(&codepages
[cp_index
]);