1 /* Charsets convertor */
7 #if HAVE_LANGINFO_CODESET
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
36 struct codepage_desc
{
38 unsigned char **aliases
;
39 struct table_entry
*table
;
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
47 static char strings
[256][2] = {
48 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
49 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
50 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
51 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
52 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
53 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
54 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
55 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
56 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
57 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
58 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
59 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
60 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
61 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
62 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
63 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
64 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
65 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
66 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
67 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
68 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
69 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
70 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
71 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
72 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
73 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
74 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
75 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
76 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
77 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
78 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
79 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
83 free_translation_table(struct conv_table
*p
)
87 for (i
= 0; i
< 256; i
++)
89 free_translation_table(p
[i
].u
.tbl
);
/* Marker string for conversion-table slots that have no real mapping.
 * add_utf_8() compares slot pointers against it, and the UTF-8 table builder
 * stores copies of it in slots left unmapped by the source codepage. */
94 static unsigned char *no_str
= "*";
97 new_translation_table(struct conv_table
*p
)
101 for (i
= 0; i
< 256; i
++)
103 free_translation_table(p
[i
].u
.tbl
);
104 for (i
= 0; i
< 128; i
++) {
106 p
[i
].u
.str
= strings
[i
];
108 for (; i
< 256; i
++) {
114 #define BIN_SEARCH(table, entry, entries, key, result) \
116 long _s = 0, _e = (entries) - 1; \
118 while (_s <= _e || !((result) = -1)) { \
119 long _m = (_s + _e) / 2; \
121 if ((table)[_m].entry == (key)) { \
125 if ((table)[_m].entry > (key)) _e = _m - 1; \
126 if ((table)[_m].entry < (key)) _s = _m + 1; \
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Flag bit that get_cp_index() may OR into a codepage index when the charset
 * was requested as "System". Users of the index mask it off before
 * subscripting codepages[]; get_cp_name() reports a flagged index as
 * "System". */
137 #define SYSTEM_CHARSET_FLAG 128
140 u2cp_(unicode_val_T u
, int to
, int no_nbsp_hack
)
145 if (u
< 128) return strings
[u
];
147 to
&= ~SYSTEM_CHARSET_FLAG
;
150 if (codepages
[to
].table
== table_utf_8
)
151 return encode_utf_8(u
);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u
== 0xa0) return no_nbsp_hack
? " " : NBSP_CHAR_STRING
;
156 if (u
== 0xad) return "";
159 unicode_val_T strange
= strange_chars
[u
- 0x80];
161 if (!strange
) return NULL
;
162 return u2cp_(strange
, to
, no_nbsp_hack
);
166 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
167 if (codepages
[to
].table
[j
].u
== u
)
168 return strings
[codepages
[to
].table
[j
].c
];
170 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
171 if (s
!= -1) return unicode_7b
[s
].s
;
/* Static output buffer for encode_utf_8(). Each call zeroes all 7 bytes and
 * then writes at most bytes [0]..[5], so the contents stay NUL-terminated. */
176 static unsigned char utf_buffer
[7];
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u
)
182 static unsigned char *
183 encode_utf_8(unicode_val_T u
)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer
, 0, 7);
191 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
192 utf_buffer
[1] = 0x80 | (u
& 0x3f);
193 else if (u
< 0x10000)
194 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
195 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
196 utf_buffer
[2] = 0x80 | (u
& 0x3f);
197 else if (u
< 0x200000)
198 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
199 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
200 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
201 utf_buffer
[3] = 0x80 | (u
& 0x3f);
202 else if (u
< 0x4000000)
203 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
204 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
205 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
206 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
207 utf_buffer
[4] = 0x80 | (u
& 0x3f);
208 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
209 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
210 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
211 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
212 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
213 utf_buffer
[5] = 0x80 | (u
& 0x3f);
219 /* Number of bytes in a UTF-8 character, indexed by its first byte. Illegal
220 * first bytes are given length one and are handled differently. */
221 static char utf8char_len_tab
[256] = {
222 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
223 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
224 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
225 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
226 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
227 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
228 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
229 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
232 inline int utf8charlen(const unsigned char *p
)
234 return p
? utf8char_len_tab
[*p
] : 0;
238 strlen_utf8(unsigned char **str
)
240 unsigned char *s
= *str
;
241 unsigned char *end
= strchr(s
, '\0');
245 for (x
= 0;; x
++, s
+= len
) {
246 len
= utf8charlen(s
);
247 if (s
+ len
> end
) break;
/* Nonzero when byte @p has its high bit clear (value below 0x80), i.e. it is
 * a single-byte UTF-8 character. */
253 #define utf8_issingle(p) (((p) & 0x80) == 0)
/* Nonzero when byte @p can begin a character: it is either a single-byte
 * character (see utf8_issingle) or has both of its top two bits set, which
 * excludes bytes of the form 10xxxxxx. Note that @p is evaluated twice. */
254 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
256 /* Start from @current and move back @pos characters. The resulting pointer
257 * is returned. The leftmost allowed pointer is @start. */
258 inline unsigned char *
259 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
261 if (current
== NULL
|| start
== NULL
|| pos
< 0)
263 while (pos
> 0 && current
!= start
) {
265 if (utf8_islead(*current
))
271 /* Count number of standard terminal cells needed for displaying UTF-8
274 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
279 end
= strchr(utf8_char
, '\0');
281 if(!utf8_char
|| !end
)
284 u
= utf_8_to_unicode(&utf8_char
, end
);
286 return unicode_to_cell(u
);
289 /* Count number of standard terminal cells needed for displaying string
290 * with UTF-8 characters. */
292 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
294 int charlen
, cell
, cells
= 0;
297 end
= strchr(string
, '\0');
303 charlen
= utf8charlen(string
);
304 if (string
+ charlen
> end
)
307 cell
= utf8_char2cells(string
, end
);
318 /* Count number of characters in string. */
320 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
322 int charlen
, chars
= 0;
325 end
= strchr(string
, '\0');
331 charlen
= utf8charlen(string
);
332 if (string
+ charlen
> end
)
343 * Count number of bytes from beginning of the string needed for displaying
344 * specified number of cells.
347 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
349 unsigned int bytes
= 0, cells
= 0;
351 assert(max_cells
>=0);
354 end
= strchr(string
, '\0');
360 int cell
= utf8_char2cells(&string
[bytes
], end
);
365 if (cells
> max_cells
)
368 bytes
+= utf8charlen(&string
[bytes
]);
370 if (string
+ bytes
> end
) {
371 bytes
= end
- string
;
379 /* Take @max steps forward from @string in the specified @way, but
380 * not going past @end. Return the resulting address. Store the
381 * number of steps taken to *@count, unless @count is NULL.
383 * This assumes the text is valid UTF-8, and @string and @end point to
384 * character boundaries. If not, it doesn't crash but the results may
387 * This function can do some of the same jobs as utf8charlen(),
388 * utf8_cells2bytes(), and strlen_utf8(). */
390 utf8_step_forward(unsigned char *string
, unsigned char *end
,
391 int max
, enum utf8_step way
, int *count
)
394 unsigned char *current
= string
;
398 if_assert_failed
goto invalid_arg
;
400 end
= strchr(string
, '\0');
403 case utf8_step_characters
:
404 while (steps
< max
&& current
< end
) {
406 if (utf8_islead(*current
))
411 case utf8_step_cells_fewer
:
412 case utf8_step_cells_more
:
413 while (steps
< max
) {
415 unsigned char *prev
= current
;
418 u
= utf_8_to_unicode(¤t
, end
);
419 if (u
== UCS_NO_CHAR
) {
420 /* Assume the incomplete sequence
427 width
= unicode_to_cell(u
);
428 if (way
== utf8_step_cells_fewer
429 && steps
+ width
> max
) {
439 INTERNAL("impossible enum utf8_step");
448 /* Take @max steps backward from @string in the specified @way, but
449 * not going past @start. Return the resulting address. Store the
450 * number of steps taken to *@count, unless @count is NULL.
452 * This assumes the text is valid UTF-8, and @string and @start point
453 * to character boundaries. If not, it doesn't crash but the results
454 * may be inconsistent.
456 * This function can do some of the same jobs as utf8_prevchar(). */
458 utf8_step_backward(unsigned char *string
, unsigned char *start
,
459 int max
, enum utf8_step way
, int *count
)
462 unsigned char *current
= string
;
467 if_assert_failed
goto invalid_arg
;
470 case utf8_step_characters
:
471 while (steps
< max
&& current
> start
) {
473 if (utf8_islead(*current
))
478 case utf8_step_cells_fewer
:
479 case utf8_step_cells_more
:
480 while (steps
< max
) {
481 unsigned char *prev
= current
;
486 if (current
<= start
)
490 } while (current
> start
&& !utf8_islead(*current
));
493 u
= utf_8_to_unicode(&look
, prev
);
494 if (u
== UCS_NO_CHAR
) {
495 /* Assume the incomplete sequence
499 width
= unicode_to_cell(u
);
501 if (way
== utf8_step_cells_fewer
502 && steps
+ width
> max
) {
512 INTERNAL("impossible enum utf8_step");
522 * Find out number of standard terminal columns needed for displaying symbol
523 * (glyph) which represents Unicode character c.
524 * TODO: Use wcwidth when it is available.
526 * @return 2 for double-width glyph, 1 for others.
527 * TODO: May be extended to return 0 for zero-width glyphs
528 * (like composing, maybe unprintable too).
531 unicode_to_cell(unicode_val_T c
)
534 && (c
<= 0x115f /* Hangul Jamo */
537 || (c
>= 0x2e80 && c
<= 0xa4cf
538 && c
!= 0x303f) /* CJK ... Yi */
539 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
540 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
542 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
543 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
544 || (c
>= 0xffe0 && c
<= 0xffe6)
545 || (c
>= 0x20000 && c
<= 0x2fffd)
546 || (c
>= 0x30000 && c
<= 0x3fffd)))
552 /* Fold the case of a Unicode character, so that hotkeys in labels can
553 * be compared case-insensitively. It is unspecified whether the
554 * result will be in upper or lower case. */
556 unicode_fold_label_case(unicode_val_T c
)
558 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
560 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
561 /* For now, this supports only ASCII. It would be possible to
562 * use code generated from CaseFolding.txt of Unicode if the
563 * acknowledgements required by http://www.unicode.org/copyright.html
564 * were added to associated documentation of ELinks. */
565 if (c
>= 0x41 && c
<= 0x5A)
569 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
573 utf_8_to_unicode(unsigned char **string
, unsigned char *end
)
575 unsigned char *str
= *string
;
579 length
= utf8char_len_tab
[str
[0]];
581 if (str
+ length
> end
) {
590 u
= (str
[0] & 0x1f) << 6;
591 u
+= (str
[1] & 0x3f);
594 u
= (str
[0] & 0x0f) << 12;
595 u
+= ((str
[1] & 0x3f) << 6);
596 u
+= (str
[2] & 0x3f);
599 u
= (str
[0] & 0x0f) << 18;
600 u
+= ((str
[1] & 0x3f) << 12);
601 u
+= ((str
[2] & 0x3f) << 6);
602 u
+= (str
[3] & 0x3f);
605 u
= (str
[0] & 0x0f) << 24;
606 u
+= ((str
[1] & 0x3f) << 18);
607 u
+= ((str
[2] & 0x3f) << 12);
608 u
+= ((str
[3] & 0x3f) << 6);
609 u
+= (str
[4] & 0x3f);
613 u
= (str
[0] & 0x01) << 30;
614 u
+= ((str
[1] & 0x3f) << 24);
615 u
+= ((str
[2] & 0x3f) << 18);
616 u
+= ((str
[3] & 0x3f) << 12);
617 u
+= ((str
[4] & 0x3f) << 6);
618 u
+= (str
[5] & 0x3f);
621 *string
= str
+ length
;
624 #endif /* CONFIG_UTF_8 */
626 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
628 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
632 for (j
= 0; from
->table
[j
].c
; j
++)
633 if (from
->table
[j
].c
== c
)
634 return from
->table
[j
].u
;
636 return UCS_REPLACEMENT_CHARACTER
;
639 /* Slow algorithm, used for converting input from the terminal. */
641 cp2u(int from
, unsigned char c
)
643 from
&= ~SYSTEM_CHARSET_FLAG
;
645 /* UTF-8 is a multibyte codepage and cannot be handled with
647 assert(codepages
[from
].table
!= table_utf_8
);
648 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
650 if (c
< 0x80) return c
;
651 else return cp2u_shared(&codepages
[from
], c
);
654 /* This slow and ugly code is used by the terminal utf_8_io */
656 cp2utf_8(int from
, int c
)
658 from
&= ~SYSTEM_CHARSET_FLAG
;
660 if (codepages
[from
].table
== table_utf_8
|| c
< 128)
663 return encode_utf_8(cp2u_shared(&codepages
[from
], c
));
668 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
672 if (is_cp_utf8(codepage
))
673 return utf_8_to_unicode(string
, end
);
678 ret
= cp2u(codepage
, **string
);
682 #endif /* CONFIG_UTF_8 */
686 add_utf_8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
688 unsigned char *p
= encode_utf_8(u
);
691 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
693 struct conv_table
*nct
;
695 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
696 if_assert_failed
return;
698 nct
= mem_calloc(256, sizeof(*nct
));
700 new_translation_table(nct
);
708 assertm(!ct
[*p
].t
, "bad utf encoding #2");
709 if_assert_failed
return;
711 if (ct
[*p
].u
.str
== no_str
)
715 struct conv_table utf_table
[256];
716 int utf_table_init
= 1;
723 for (i
= 128; i
< 256; i
++)
724 mem_free(utf_table
[i
].u
.str
);
727 static struct conv_table
*
728 get_translation_table_to_utf_8(int from
)
733 if (from
== -1) return NULL
;
734 from
&= ~SYSTEM_CHARSET_FLAG
;
735 if (from
== lfr
) return utf_table
;
738 memset(utf_table
, 0, sizeof(utf_table
)),
743 for (i
= 0; i
< 128; i
++)
744 utf_table
[i
].u
.str
= strings
[i
];
746 if (codepages
[from
].table
== table_utf_8
) {
747 for (i
= 128; i
< 256; i
++)
748 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
752 for (i
= 128; i
< 256; i
++)
753 utf_table
[i
].u
.str
= NULL
;
755 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
756 unicode_val_T u
= codepages
[from
].table
[i
].u
;
758 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
759 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
760 stracpy(encode_utf_8(u
));
763 for (i
= 128; i
< 256; i
++)
764 if (!utf_table
[i
].u
.str
)
765 utf_table
[i
].u
.str
= stracpy(no_str
);
770 struct conv_table table
[256];
771 static int first
= 1;
774 free_conv_table(void)
776 if (!utf_table_init
) free_utf_table();
778 memset(table
, 0, sizeof(table
));
781 new_translation_table(table
);
786 get_translation_table(int from
, int to
)
791 from
&= ~SYSTEM_CHARSET_FLAG
;
792 to
&= ~SYSTEM_CHARSET_FLAG
;
794 memset(table
, 0, sizeof(table
));
797 if (/*from == to ||*/ from
== -1 || to
== -1)
799 if (codepages
[to
].table
== table_utf_8
)
800 return get_translation_table_to_utf_8(from
);
801 if (from
== lfr
&& to
== lto
)
805 new_translation_table(table
);
807 if (codepages
[from
].table
== table_utf_8
) {
810 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
811 add_utf_8(table
, codepages
[to
].table
[i
].u
,
812 strings
[codepages
[to
].table
[i
].c
]);
814 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
815 if (unicode_7b
[i
].x
>= 0x80)
816 add_utf_8(table
, unicode_7b
[i
].x
,
822 for (i
= 128; i
< 256; i
++) {
825 for (j
= 0; codepages
[from
].table
[j
].c
; j
++) {
826 if (codepages
[from
].table
[j
].c
== i
) {
829 u
= u2cp(codepages
[from
].table
[j
].u
, to
);
830 if (u
) table
[i
].u
.str
= u
;
841 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
844 if (*s1
> *s2
) return 1;
845 if (*s1
< *s2
) return -1;
854 /* Entity cache debugging purpose. */
856 #define DEBUG_ENTITY_CACHE
858 #undef DEBUG_ENTITY_CACHE
861 struct entity_cache
{
865 unsigned char *result
;
866 unsigned char str
[20]; /* Suffice in any case. */
870 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
872 if (a
->hits
== b
->hits
) return 0;
873 if (a
->hits
> b
->hits
) return -1;
878 compare_entities(const void *key_
, const void *element_
)
880 struct string
*key
= (struct string
*) key_
;
881 struct entity
*element
= (struct entity
*) element_
;
882 int length
= key
->length
;
883 unsigned char *first
= key
->source
;
884 unsigned char *second
= element
->s
;
886 return xxstrcmp(first
, second
, length
);
890 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
892 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
893 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
894 will go in [0] table */
895 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
896 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
897 static int first_time
= 1;
898 unsigned int slen
= 0;
899 unsigned char *result
= NULL
;
901 if (strlen
<= 0) return NULL
;
904 /* TODO: caching UTF-8 */
905 encoding
&= ~SYSTEM_CHARSET_FLAG
;
906 if (codepages
[encoding
].table
== table_utf_8
)
908 #endif /* CONFIG_UTF_8 */
911 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
915 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
916 * + google + slashdot + websites that result from a search for test on google,
917 * + various ones) show a quite impressive improvement:
919 * 0: hits=2459 l=4 st='nbsp'
920 * 1: hits=2152 l=6 st='eacute'
921 * 2: hits=235 l=6 st='egrave'
922 * 3: hits=136 l=6 st='agrave'
923 * 4: hits=100 l=3 st='amp'
924 * 5: hits=40 l=5 st='laquo'
925 * 6: hits=8 l=4 st='copy'
926 * 7: hits=5 l=2 st='gt'
927 * 8: hits=2 l=2 st='lt'
928 * 9: hits=1 l=6 st='middot'
930 * Most of the time cache hit ratio is near 95%.
932 * A long test shows: 15186 hits vs. 24 misses and mean iteration
933 * count is kept < 2 (worst case 1.58). Not so bad ;)
937 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
938 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
940 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
943 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
944 if (entity_cache
[slen
][i
].encoding
== encoding
945 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
946 #ifdef DEBUG_ENTITY_CACHE
947 static double total_iter
= 0;
948 static unsigned long hit_count
= 0;
952 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
954 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
955 entity_cache
[slen
][i
].hits
++;
956 return entity_cache
[slen
][i
].result
;
959 #ifdef DEBUG_ENTITY_CACHE
960 fprintf(stderr
, "miss\n");
965 #endif /* CONFIG_UTF_8 */
966 if (*str
== '#') { /* Numeric entity. */
967 int l
= (int) strlen
;
968 unsigned char *st
= (unsigned char *) str
;
971 if (l
== 1) goto end
; /* &#; ? */
973 if ((*st
| 32) == 'x') { /* Hexadecimal */
975 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
978 unsigned char c
= (*(st
++) | 32);
981 n
= (n
<< 4) | (c
- '0');
982 else if (isxdigit(c
))
983 n
= (n
<< 4) | (c
- 'a' + 10);
985 goto end
; /* Bad char. */
987 } else { /* Decimal */
988 if (l
> 10) goto end
; /* 4294967295 max. */
990 unsigned char c
= *(st
++);
993 n
= n
* 10 + c
- '0';
995 goto end
; /* Bad char. */
996 /* Limit to 0xFFFFFFFF. */
997 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1002 result
= u2cp(n
, encoding
);
1004 #ifdef DEBUG_ENTITY_CACHE
1005 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1007 } else { /* Text entity. */
1008 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1009 struct entity
*element
= bsearch((void *) &key
, entities
,
1014 if (element
) result
= u2cp(element
->c
, encoding
);
1018 if (codepages
[encoding
].table
== table_utf_8
) {
1021 #endif /* CONFIG_UTF_8 */
1023 /* Take care of potential buffer overflow. */
1024 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
1025 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
1027 /* Copy new entry to cache. */
1029 ece
->strlen
= strlen
;
1030 ece
->encoding
= encoding
;
1031 ece
->result
= result
;
1032 memcpy(ece
->str
, str
, strlen
);
1033 ece
->str
[strlen
] = '\0';
1035 /* Increment number of cache entries if possible. */
1036 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1038 #ifdef DEBUG_ENTITY_CACHE
1039 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1040 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1044 /* Sort entries by hit order. */
1045 if (nb_entity_cache
[slen
] > 1)
1046 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1047 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
1049 #ifdef DEBUG_ENTITY_CACHE
1053 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1054 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1055 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1056 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1057 entity_cache
[slen
][i
].str
);
1058 fprintf(stderr
, "-----------------\n");
1066 convert_string(struct conv_table
*convert_table
,
1067 unsigned char *chars
, int charslen
, int cp
,
1068 enum convert_string_mode mode
, int *length
,
1069 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1070 void *callback_data
)
1072 unsigned char *buffer
;
1076 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1078 if (charslen
) callback(callback_data
, chars
, charslen
);
1081 return memacpy(chars
, charslen
);
1085 /* Buffer allocation */
1087 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1088 if (!buffer
) return NULL
;
1092 while (charspos
< charslen
) {
1093 unsigned char *translit
;
1096 buffer[bufferpos++] = chars[charspos++]; \
1101 if (chars
[charspos
] != '&') {
1102 struct conv_table
*t
;
1105 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1110 while (t
[chars
[i
]].t
) {
1111 t
= t
[chars
[i
++]].u
.tbl
;
1112 if (i
>= charslen
) PUTC
;
1115 translit
= t
[chars
[i
]].u
.str
;
1118 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1122 int start
= charspos
+ 1;
1126 && (isasciialpha(chars
[i
])
1127 || isdigit(chars
[i
])
1128 || (chars
[i
] == '#')))
1131 /* This prevents bug 213: we were expanding "entities"
1132 * in URL query strings. */
1133 /* XXX: But this disables &nbsp&nbsp usage, which
1134 * appears to be relatively common! --pasky */
1135 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1137 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1138 translit
= get_entity_string(&chars
[start
], i
- start
,
1140 if (chars
[i
] != ';') {
1141 /* Eat    <foo> happily, but
1142 * pull back from the character after
1143 * entity string if it is not the valid
1148 if (!translit
) PUTC
;
1149 charspos
= i
+ (i
< charslen
);
1153 if (!translit
[0]) continue;
1156 buffer
[bufferpos
++] = translit
[0];
1164 buffer
[bufferpos
++] = *(translit
++);
1166 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1169 buffer
[bufferpos
] = 0;
1170 callback(callback_data
, buffer
, bufferpos
);
1173 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1186 buffer
[bufferpos
] = 0;
1187 if (length
) *length
= bufferpos
;
1190 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1199 #ifndef USE_FASTFIND
1201 get_cp_index(unsigned char *name
)
1206 if (!strcasecmp(name
, "System")) {
1207 #if HAVE_LANGINFO_CODESET
1208 name
= nl_langinfo(CODESET
);
1209 syscp
= SYSTEM_CHARSET_FLAG
;
1215 for (i
= 0; codepages
[i
].name
; i
++) {
1216 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1217 /* In the past, we looked for the longest substring
1218 * in all the names; it is way too expensive, though:
1220 * % cumulative self self total
1221 * time seconds seconds calls us/call us/call name
1222 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1224 * Anything called from redraw_screen() is in fact
1225 * relatively expensive, even if it's called just
1226 * once. So we will do a simple strcasecmp() here.
1229 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1235 return get_cp_index("us-ascii") | syscp
;
1243 static unsigned int i_name
= 0;
1244 static unsigned int i_alias
= 0;
1246 /* Reset internal list pointer */
1248 charsets_list_reset(void)
1254 /* Returns a pointer to a struct that contains current key and data pointers
1255 * and increments the internal pointer. It returns NULL when key is NULL. */
1256 struct fastfind_key_value
*
1257 charsets_list_next(void)
1259 static struct fastfind_key_value kv
;
1261 if (!codepages
[i_name
].name
) return NULL
;
1263 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1264 kv
.data
= &codepages
[i_name
];
1266 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1276 static struct fastfind_index ff_charsets_index
1277 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1279 /* It searches for a charset named @name or one of its aliases and
1280 * returns the index for it or -1 if not found. */
1282 get_cp_index(unsigned char *name
)
1284 struct codepage_desc
*codepage
;
1287 if (!strcasecmp(name
, "System")) {
1288 #if HAVE_LANGINFO_CODESET
1289 name
= nl_langinfo(CODESET
);
1290 syscp
= SYSTEM_CHARSET_FLAG
;
1296 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1298 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1299 return (codepage
- codepages
) | syscp
;
1302 return get_cp_index("us-ascii") | syscp
;
1309 #endif /* USE_FASTFIND */
1312 init_charsets_lookup(void)
1315 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1320 free_charsets_lookup(void)
1323 fastfind_done(&ff_charsets_index
);
1328 get_cp_name(int cp_index
)
1330 if (cp_index
< 0) return "none";
1331 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1333 return codepages
[cp_index
].name
;
1337 get_cp_mime_name(int cp_index
)
1339 if (cp_index
< 0) return "none";
1340 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1341 if (!codepages
[cp_index
].aliases
) return NULL
;
1343 return codepages
[cp_index
].aliases
[0];
1347 is_cp_utf8(int cp_index
)
1349 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1350 return codepages
[cp_index
].table
== table_utf_8
;