1 /* Charsets convertor */
7 #if HAVE_LANGINFO_CODESET
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
33 /* This should in principle be unicode_val_T, but because all
34 * the values currently in codepage.inc fit in 16 bits, we can
35 * as well use uint16_t and halve sizeof(struct table_entry)
36 * from 8 bytes to 4. Should other characters ever be needed,
37 * unicode_val_T u : 24 might be a possibility, although it
38 * seems a little unportable as bitfields are in principle
39 * restricted to int, which may be 16-bit. */
43 struct codepage_desc
{
45 unsigned char *const *aliases
;
46 const struct table_entry
*table
;
49 #include "intl/codepage.inc"
50 #include "intl/uni_7b.inc"
51 #include "intl/entity.inc"
54 static char strings
[256][2] = {
55 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
56 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
57 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
58 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
59 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
60 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
61 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
62 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
63 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
64 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
65 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
66 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
67 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
68 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
69 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
70 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
71 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
72 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
73 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
74 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
75 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
76 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
77 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
78 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
79 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
80 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
81 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
82 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
83 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
84 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
85 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
86 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
90 free_translation_table(struct conv_table
*p
)
94 for (i
= 0; i
< 256; i
++)
96 free_translation_table(p
[i
].u
.tbl
);
101 static unsigned char *no_str
= "*";
104 new_translation_table(struct conv_table
*p
)
108 for (i
= 0; i
< 256; i
++)
110 free_translation_table(p
[i
].u
.tbl
);
111 for (i
= 0; i
< 128; i
++) {
113 p
[i
].u
.str
= strings
[i
];
115 for (; i
< 256; i
++) {
121 #define BIN_SEARCH(table, entry, entries, key, result) \
123 long _s = 0, _e = (entries) - 1; \
125 while (_s <= _e || !((result) = -1)) { \
126 long _m = (_s + _e) / 2; \
128 if ((table)[_m].entry == (key)) { \
132 if ((table)[_m].entry > (key)) _e = _m - 1; \
133 if ((table)[_m].entry < (key)) _s = _m + 1; \
137 static const unicode_val_T strange_chars[32] = {
138 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
139 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
140 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
141 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
144 #define SYSTEM_CHARSET_FLAG 128
147 u2cp_(unicode_val_T u
, int to
, int no_nbsp_hack
)
152 if (u
< 128) return strings
[u
];
154 to
&= ~SYSTEM_CHARSET_FLAG
;
157 if (codepages
[to
].table
== table_utf8
)
158 return encode_utf8(u
);
159 #endif /* CONFIG_UTF8 */
161 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
162 if (u
== 0xa0) return no_nbsp_hack
? " " : NBSP_CHAR_STRING
;
163 if (u
== 0xad) return "";
166 unicode_val_T strange
= strange_chars
[u
- 0x80];
168 if (!strange
) return NULL
;
169 return u2cp_(strange
, to
, no_nbsp_hack
);
173 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
174 if (codepages
[to
].table
[j
].u
== u
)
175 return strings
[codepages
[to
].table
[j
].c
];
177 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
178 if (s
!= -1) return unicode_7b
[s
].s
;
183 static unsigned char utf_buffer
[7];
186 inline unsigned char *
187 encode_utf8(unicode_val_T u
)
189 static unsigned char *
190 encode_utf8(unicode_val_T u
)
191 #endif /* CONFIG_UTF8 */
193 memset(utf_buffer
, 0, 7);
198 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
199 utf_buffer
[1] = 0x80 | (u
& 0x3f);
200 else if (u
< 0x10000)
201 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
202 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
203 utf_buffer
[2] = 0x80 | (u
& 0x3f);
204 else if (u
< 0x200000)
205 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
206 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
207 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
208 utf_buffer
[3] = 0x80 | (u
& 0x3f);
209 else if (u
< 0x4000000)
210 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
211 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
212 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
213 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
214 utf_buffer
[4] = 0x80 | (u
& 0x3f);
215 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
216 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
217 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
218 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
219 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
220 utf_buffer
[5] = 0x80 | (u
& 0x3f);
226 /* Length in bytes of a UTF-8 character, indexed by its first byte. Bytes that
227 * cannot start a character are given length 1 and are handled separately. */
228 static char utf8char_len_tab
[256] = {
229 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
230 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
231 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
232 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
233 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
234 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
235 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
236 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
239 inline int utf8charlen(const unsigned char *p
)
241 return p
? utf8char_len_tab
[*p
] : 0;
245 strlen_utf8(unsigned char **str
)
247 unsigned char *s
= *str
;
248 unsigned char *end
= strchr(s
, '\0');
252 for (x
= 0;; x
++, s
+= len
) {
253 len
= utf8charlen(s
);
254 if (s
+ len
> end
) break;
260 #define utf8_issingle(p) (((p) & 0x80) == 0)
261 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
263 /* Start from @current and move back @pos characters. Return the resulting
264 * pointer, which is never to the left of @start. */
265 inline unsigned char *
266 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
268 if (current
== NULL
|| start
== NULL
|| pos
< 0)
270 while (pos
> 0 && current
!= start
) {
272 if (utf8_islead(*current
))
278 /* Count number of standard terminal cells needed for displaying UTF-8
281 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
286 end
= strchr(utf8_char
, '\0');
288 if(!utf8_char
|| !end
)
291 u
= utf8_to_unicode(&utf8_char
, end
);
293 return unicode_to_cell(u
);
296 /* Count number of standard terminal cells needed for displaying string
297 * with UTF-8 characters. */
299 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
301 int charlen
, cell
, cells
= 0;
304 end
= strchr(string
, '\0');
310 charlen
= utf8charlen(string
);
311 if (string
+ charlen
> end
)
314 cell
= utf8_char2cells(string
, end
);
325 /* Count number of characters in string. */
327 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
329 int charlen
, chars
= 0;
332 end
= strchr(string
, '\0');
338 charlen
= utf8charlen(string
);
339 if (string
+ charlen
> end
)
350 * Count number of bytes from beginning of the string needed for displaying
351 * specified number of cells.
354 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
356 unsigned int bytes
= 0, cells
= 0;
358 assert(max_cells
>=0);
361 end
= strchr(string
, '\0');
367 int cell
= utf8_char2cells(&string
[bytes
], end
);
372 if (cells
> max_cells
)
375 bytes
+= utf8charlen(&string
[bytes
]);
377 if (string
+ bytes
> end
) {
378 bytes
= end
- string
;
386 /* Take @max steps forward from @string in the specified @way, but
387 * not going past @end. Return the resulting address. Store the
388 * number of steps taken to *@count, unless @count is NULL.
390 * This assumes the text is valid UTF-8, and @string and @end point to
391 * character boundaries. If not, it doesn't crash but the results may
394 * This function can do some of the same jobs as utf8charlen(),
395 * utf8_cells2bytes(), and strlen_utf8(). */
397 utf8_step_forward(unsigned char *string
, unsigned char *end
,
398 int max
, enum utf8_step way
, int *count
)
401 unsigned char *current
= string
;
405 if_assert_failed
goto invalid_arg
;
407 end
= strchr(string
, '\0');
410 case utf8_step_characters
:
411 while (steps
< max
&& current
< end
) {
413 if (utf8_islead(*current
))
418 case utf8_step_cells_fewer
:
419 case utf8_step_cells_more
:
420 while (steps
< max
) {
422 unsigned char *prev
= current
;
/* Decode one character; utf8_to_unicode() advances current past it. */
425 u
= utf8_to_unicode(&current
, end
);
426 if (u
== UCS_NO_CHAR
) {
427 /* Assume the incomplete sequence
434 width
= unicode_to_cell(u
);
435 if (way
== utf8_step_cells_fewer
436 && steps
+ width
> max
) {
446 INTERNAL("impossible enum utf8_step");
455 /* Take @max steps backward from @string in the specified @way, but
456 * not going past @start. Return the resulting address. Store the
457 * number of steps taken to *@count, unless @count is NULL.
459 * This assumes the text is valid UTF-8, and @string and @start point
460 * to character boundaries. If not, it doesn't crash but the results
461 * may be inconsistent.
463 * This function can do some of the same jobs as utf8_prevchar(). */
465 utf8_step_backward(unsigned char *string
, unsigned char *start
,
466 int max
, enum utf8_step way
, int *count
)
469 unsigned char *current
= string
;
474 if_assert_failed
goto invalid_arg
;
477 case utf8_step_characters
:
478 while (steps
< max
&& current
> start
) {
480 if (utf8_islead(*current
))
485 case utf8_step_cells_fewer
:
486 case utf8_step_cells_more
:
487 while (steps
< max
) {
488 unsigned char *prev
= current
;
493 if (current
<= start
)
497 } while (current
> start
&& !utf8_islead(*current
));
500 u
= utf8_to_unicode(&look
, prev
);
501 if (u
== UCS_NO_CHAR
) {
502 /* Assume the incomplete sequence
506 width
= unicode_to_cell(u
);
508 if (way
== utf8_step_cells_fewer
509 && steps
+ width
> max
) {
519 INTERNAL("impossible enum utf8_step");
529 * Find out the number of standard terminal columns needed for displaying the symbol
530 * (glyph) which represents Unicode character c.
531 * TODO: Use wcwidth when it is available.
533 * @return 2 for double-width glyph, 1 for others.
534 * TODO: May be extended to return 0 for zero-width glyphs
535 * (like composing, maybe unprintable too).
538 unicode_to_cell(unicode_val_T c
)
541 && (c
<= 0x115f /* Hangul Jamo */
544 || (c
>= 0x2e80 && c
<= 0xa4cf
545 && c
!= 0x303f) /* CJK ... Yi */
546 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
547 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
549 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
550 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
551 || (c
>= 0xffe0 && c
<= 0xffe6)
552 || (c
>= 0x20000 && c
<= 0x2fffd)
553 || (c
>= 0x30000 && c
<= 0x3fffd)))
559 /* Fold the case of a Unicode character, so that hotkeys in labels can
560 * be compared case-insensitively. It is unspecified whether the
561 * result will be in upper or lower case. */
563 unicode_fold_label_case(unicode_val_T c
)
565 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
567 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
568 /* For now, this supports only ASCII. It would be possible to
569 * use code generated from CaseFolding.txt of Unicode if the
570 * acknowledgements required by http://www.unicode.org/copyright.html
571 * were added to associated documentation of ELinks. */
572 if (c
>= 0x41 && c
<= 0x5A)
576 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
580 utf8_to_unicode(unsigned char **string
, unsigned char *end
)
582 unsigned char *str
= *string
;
586 length
= utf8char_len_tab
[str
[0]];
588 if (str
+ length
> end
) {
597 u
= (str
[0] & 0x1f) << 6;
598 u
+= (str
[1] & 0x3f);
601 u
= (str
[0] & 0x0f) << 12;
602 u
+= ((str
[1] & 0x3f) << 6);
603 u
+= (str
[2] & 0x3f);
606 u
= (str
[0] & 0x0f) << 18;
607 u
+= ((str
[1] & 0x3f) << 12);
608 u
+= ((str
[2] & 0x3f) << 6);
609 u
+= (str
[3] & 0x3f);
612 u
= (str
[0] & 0x0f) << 24;
613 u
+= ((str
[1] & 0x3f) << 18);
614 u
+= ((str
[2] & 0x3f) << 12);
615 u
+= ((str
[3] & 0x3f) << 6);
616 u
+= (str
[4] & 0x3f);
620 u
= (str
[0] & 0x01) << 30;
621 u
+= ((str
[1] & 0x3f) << 24);
622 u
+= ((str
[2] & 0x3f) << 18);
623 u
+= ((str
[3] & 0x3f) << 12);
624 u
+= ((str
[4] & 0x3f) << 6);
625 u
+= (str
[5] & 0x3f);
628 *string
= str
+ length
;
631 #endif /* CONFIG_UTF8 */
633 /* Slow algorithm, the common part of cp2u and cp2utf8. */
635 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
639 for (j
= 0; from
->table
[j
].c
; j
++)
640 if (from
->table
[j
].c
== c
)
641 return from
->table
[j
].u
;
643 return UCS_REPLACEMENT_CHARACTER
;
646 /* Slow algorithm, used for converting input from the terminal. */
648 cp2u(int from
, unsigned char c
)
650 from
&= ~SYSTEM_CHARSET_FLAG
;
652 /* UTF-8 is a multibyte codepage and cannot be handled with
654 assert(codepages
[from
].table
!= table_utf8
);
655 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
657 if (c
< 0x80) return c
;
658 else return cp2u_shared(&codepages
[from
], c
);
661 /* This slow and ugly code is used by the terminal utf_8_io */
663 cp2utf8(int from
, int c
)
665 from
&= ~SYSTEM_CHARSET_FLAG
;
667 if (codepages
[from
].table
== table_utf8
|| c
< 128)
670 return encode_utf8(cp2u_shared(&codepages
[from
], c
));
675 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
679 if (is_cp_utf8(codepage
))
680 return utf8_to_unicode(string
, end
);
685 ret
= cp2u(codepage
, **string
);
689 #endif /* CONFIG_UTF8 */
693 add_utf8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
695 unsigned char *p
= encode_utf8(u
);
698 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
700 struct conv_table
*nct
;
702 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
703 if_assert_failed
return;
705 nct
= mem_calloc(256, sizeof(*nct
));
707 new_translation_table(nct
);
715 assertm(!ct
[*p
].t
, "bad utf encoding #2");
716 if_assert_failed
return;
718 if (ct
[*p
].u
.str
== no_str
)
722 struct conv_table utf_table
[256];
723 int utf_table_init
= 1;
730 for (i
= 128; i
< 256; i
++)
731 mem_free(utf_table
[i
].u
.str
);
734 static struct conv_table
*
735 get_translation_table_to_utf8(int from
)
740 if (from
== -1) return NULL
;
741 from
&= ~SYSTEM_CHARSET_FLAG
;
742 if (from
== lfr
) return utf_table
;
745 memset(utf_table
, 0, sizeof(utf_table
)),
750 for (i
= 0; i
< 128; i
++)
751 utf_table
[i
].u
.str
= strings
[i
];
753 if (codepages
[from
].table
== table_utf8
) {
754 for (i
= 128; i
< 256; i
++)
755 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
759 for (i
= 128; i
< 256; i
++)
760 utf_table
[i
].u
.str
= NULL
;
762 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
763 unicode_val_T u
= codepages
[from
].table
[i
].u
;
765 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
766 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
767 stracpy(encode_utf8(u
));
770 for (i
= 128; i
< 256; i
++)
771 if (!utf_table
[i
].u
.str
)
772 utf_table
[i
].u
.str
= stracpy(no_str
);
777 struct conv_table table
[256];
778 static int first
= 1;
781 free_conv_table(void)
783 if (!utf_table_init
) free_utf_table();
785 memset(table
, 0, sizeof(table
));
788 new_translation_table(table
);
793 get_translation_table(int from
, int to
)
798 from
&= ~SYSTEM_CHARSET_FLAG
;
799 to
&= ~SYSTEM_CHARSET_FLAG
;
801 memset(table
, 0, sizeof(table
));
804 if (/*from == to ||*/ from
== -1 || to
== -1)
806 if (codepages
[to
].table
== table_utf8
)
807 return get_translation_table_to_utf8(from
);
808 if (from
== lfr
&& to
== lto
)
812 new_translation_table(table
);
814 if (codepages
[from
].table
== table_utf8
) {
817 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
818 add_utf8(table
, codepages
[to
].table
[i
].u
,
819 strings
[codepages
[to
].table
[i
].c
]);
821 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
822 if (unicode_7b
[i
].x
>= 0x80)
823 add_utf8(table
, unicode_7b
[i
].x
,
829 for (i
= 128; i
< 256; i
++) {
832 for (j
= 0; codepages
[from
].table
[j
].c
; j
++) {
833 if (codepages
[from
].table
[j
].c
== i
) {
836 u
= u2cp(codepages
[from
].table
[j
].u
, to
);
837 if (u
) table
[i
].u
.str
= u
;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated @s2.
 *
 * Returns 1 or -1 at the first differing byte.  If all @l2 bytes
 * match, returns 0 when @s2 ends there too and -1 when @s2 is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; s1++, s2++, l2--) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
861 /* Entity cache debugging purpose. */
863 #define DEBUG_ENTITY_CACHE
865 #undef DEBUG_ENTITY_CACHE
868 struct entity_cache
{
872 unsigned char *result
;
873 unsigned char str
[20]; /* Suffice in any case. */
877 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
879 if (a
->hits
== b
->hits
) return 0;
880 if (a
->hits
> b
->hits
) return -1;
885 compare_entities(const void *key_
, const void *element_
)
887 struct string
*key
= (struct string
*) key_
;
888 struct entity
*element
= (struct entity
*) element_
;
889 int length
= key
->length
;
890 unsigned char *first
= key
->source
;
891 unsigned char *second
= element
->s
;
893 return xxstrcmp(first
, second
, length
);
897 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
899 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
900 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
901 will go in [0] table */
902 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
903 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
904 static int first_time
= 1;
905 unsigned int slen
= 0;
906 unsigned char *result
= NULL
;
908 if (strlen
<= 0) return NULL
;
911 /* TODO: caching UTF-8 */
912 encoding
&= ~SYSTEM_CHARSET_FLAG
;
913 if (codepages
[encoding
].table
== table_utf8
)
915 #endif /* CONFIG_UTF8 */
918 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
922 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
923 * + google + slashdot + websites that result from a search for test on google,
924 * + various ones) shows quite an impressive improvement:
926 * 0: hits=2459 l=4 st='nbsp'
927 * 1: hits=2152 l=6 st='eacute'
928 * 2: hits=235 l=6 st='egrave'
929 * 3: hits=136 l=6 st='agrave'
930 * 4: hits=100 l=3 st='amp'
931 * 5: hits=40 l=5 st='laquo'
932 * 6: hits=8 l=4 st='copy'
933 * 7: hits=5 l=2 st='gt'
934 * 8: hits=2 l=2 st='lt'
935 * 9: hits=1 l=6 st='middot'
937 * Most of the time cache hit ratio is near 95%.
939 * A long test shows: 15186 hits vs. 24 misses and mean iteration
940 * count is kept < 2 (worst case 1.58). Not so bad ;)
944 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
945 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
947 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
950 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
951 if (entity_cache
[slen
][i
].encoding
== encoding
952 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
953 #ifdef DEBUG_ENTITY_CACHE
954 static double total_iter
= 0;
955 static unsigned long hit_count
= 0;
959 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
961 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
962 entity_cache
[slen
][i
].hits
++;
963 return entity_cache
[slen
][i
].result
;
966 #ifdef DEBUG_ENTITY_CACHE
967 fprintf(stderr
, "miss\n");
972 #endif /* CONFIG_UTF8 */
973 if (*str
== '#') { /* Numeric entity. */
974 int l
= (int) strlen
;
975 unsigned char *st
= (unsigned char *) str
;
978 if (l
== 1) goto end
; /* &#; ? */
980 if ((*st
| 32) == 'x') { /* Hexadecimal */
982 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
985 unsigned char c
= (*(st
++) | 32);
988 n
= (n
<< 4) | (c
- '0');
989 else if (isxdigit(c
))
990 n
= (n
<< 4) | (c
- 'a' + 10);
992 goto end
; /* Bad char. */
994 } else { /* Decimal */
995 if (l
> 10) goto end
; /* 4294967295 max. */
997 unsigned char c
= *(st
++);
1000 n
= n
* 10 + c
- '0';
1002 goto end
; /* Bad char. */
1003 /* Limit to 0xFFFFFFFF. */
1004 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
1009 result
= u2cp(n
, encoding
);
1011 #ifdef DEBUG_ENTITY_CACHE
1012 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
1014 } else { /* Text entity. */
1015 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
1016 struct entity
*element
= bsearch((void *) &key
, entities
,
1021 if (element
) result
= u2cp(element
->c
, encoding
);
1025 if (codepages
[encoding
].table
== table_utf8
) {
1028 #endif /* CONFIG_UTF8 */
1030 /* Take care of potential buffer overflow. */
/* The test below only bounds the copy into ece->str: the entity must be
 * short enough to leave room for the trailing NUL.  It does not bound
 * the slot index used to pick ece. */
1031 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
/* NOTE(review): the slot index is nb_entity_cache[slen], and the
 * increment further down lets that counter reach ENTITY_CACHE_SIZE.
 * Once a row is full, this appears to point one element past
 * entity_cache[slen][] -- confirm against the complete function and
 * clamp the index (e.g. overwrite the last slot) if so. */
1032 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
1034 /* Copy new entry to cache. */
1036 ece
->strlen
= strlen
;
1037 ece
->encoding
= encoding
;
1038 ece
->result
= result
;
1039 memcpy(ece
->str
, str
, strlen
);
1040 ece
->str
[strlen
] = '\0';
1042 /* Increment number of cache entries if possible. */
/* The counter stops at ENTITY_CACHE_SIZE, the number of slots per row. */
1043 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
1045 #ifdef DEBUG_ENTITY_CACHE
1046 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
1047 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
1051 /* Sort entries by hit order. */
1052 if (nb_entity_cache
[slen
] > 1)
1053 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
1054 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
1056 #ifdef DEBUG_ENTITY_CACHE
1060 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
1061 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
1062 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
1063 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
1064 entity_cache
[slen
][i
].str
);
1065 fprintf(stderr
, "-----------------\n");
1073 convert_string(struct conv_table
*convert_table
,
1074 unsigned char *chars
, int charslen
, int cp
,
1075 enum convert_string_mode mode
, int *length
,
1076 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
1077 void *callback_data
)
1079 unsigned char *buffer
;
1083 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
1085 if (charslen
) callback(callback_data
, chars
, charslen
);
1088 return memacpy(chars
, charslen
);
1092 /* Buffer allocation */
1094 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
1095 if (!buffer
) return NULL
;
1099 while (charspos
< charslen
) {
1100 unsigned char *translit
;
1103 buffer[bufferpos++] = chars[charspos++]; \
1108 if (chars
[charspos
] != '&') {
1109 struct conv_table
*t
;
1112 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
1117 while (t
[chars
[i
]].t
) {
1118 t
= t
[chars
[i
++]].u
.tbl
;
1119 if (i
>= charslen
) PUTC
;
1122 translit
= t
[chars
[i
]].u
.str
;
1125 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
1129 int start
= charspos
+ 1;
1133 && (isasciialpha(chars
[i
])
1134 || isdigit(chars
[i
])
1135 || (chars
[i
] == '#')))
1138 /* This prevents bug 213: we were expanding "entities"
1139 * in URL query strings. */
1140 /* XXX: But this disables &nbsp&nbsp usage, which
1141 * appears to be relatively common! --pasky */
1142 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
1144 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
1145 translit
= get_entity_string(&chars
[start
], i
- start
,
1147 if (chars
[i
] != ';') {
1148 /* Eat &nbsp &nbsp<foo> happily, but
1149 * pull back from the character after
1150 * entity string if it is not the valid
1155 if (!translit
) PUTC
;
1156 charspos
= i
+ (i
< charslen
);
1160 if (!translit
[0]) continue;
1163 buffer
[bufferpos
++] = translit
[0];
1171 buffer
[bufferpos
++] = *(translit
++);
1173 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1176 buffer
[bufferpos
] = 0;
1177 callback(callback_data
, buffer
, bufferpos
);
1180 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1193 buffer
[bufferpos
] = 0;
1194 if (length
) *length
= bufferpos
;
1197 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1206 #ifndef USE_FASTFIND
1208 get_cp_index(unsigned char *name
)
1213 if (!strcasecmp(name
, "System")) {
1214 #if HAVE_LANGINFO_CODESET
1215 name
= nl_langinfo(CODESET
);
1216 syscp
= SYSTEM_CHARSET_FLAG
;
1222 for (i
= 0; codepages
[i
].name
; i
++) {
1223 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1224 /* In the past, we looked for the longest substring
1225 * in all the names; it is way too expensive, though:
1227 * % cumulative self self total
1228 * time seconds seconds calls us/call us/call name
1229 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1231 * Anything called from redraw_screen() is in fact
1232 * relatively expensive, even if it's called just
1233 * once. So we will do a simple strcasecmp() here.
1236 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1242 return get_cp_index("us-ascii") | syscp
;
1250 static unsigned int i_name
= 0;
1251 static unsigned int i_alias
= 0;
1253 /* Reset internal list pointer */
1255 charsets_list_reset(void)
1261 /* Returns a pointer to a struct that contains the current key and data pointers
1262 * and increments the internal pointer. It returns NULL when key is NULL. */
1263 struct fastfind_key_value
*
1264 charsets_list_next(void)
1266 static struct fastfind_key_value kv
;
1268 if (!codepages
[i_name
].name
) return NULL
;
1270 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1271 kv
.data
= (void *) &codepages
[i_name
]; /* cast away const */
1273 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1283 static struct fastfind_index ff_charsets_index
1284 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1286 /* Searches for a charset named @name or one of its aliases and
1287 * returns its index, or -1 if not found. */
1289 get_cp_index(unsigned char *name
)
1291 const struct codepage_desc
*codepage
;
1294 if (!strcasecmp(name
, "System")) {
1295 #if HAVE_LANGINFO_CODESET
1296 name
= nl_langinfo(CODESET
);
1297 syscp
= SYSTEM_CHARSET_FLAG
;
1303 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1305 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1306 return (codepage
- codepages
) | syscp
;
1309 return get_cp_index("us-ascii") | syscp
;
1316 #endif /* USE_FASTFIND */
1319 init_charsets_lookup(void)
1322 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1327 free_charsets_lookup(void)
1330 fastfind_done(&ff_charsets_index
);
1335 get_cp_name(int cp_index
)
1337 if (cp_index
< 0) return "none";
1338 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1340 return codepages
[cp_index
].name
;
1344 get_cp_mime_name(int cp_index
)
1346 if (cp_index
< 0) return "none";
1347 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1348 if (!codepages
[cp_index
].aliases
) return NULL
;
1350 return codepages
[cp_index
].aliases
[0];
1354 is_cp_utf8(int cp_index
)
1356 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1357 return codepages
[cp_index
].table
== table_utf8
;