1 /* Charsets convertor */
7 #if HAVE_LANGINFO_CODESET
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
36 struct codepage_desc
{
38 unsigned char **aliases
;
39 struct table_entry
*table
;
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
47 static char strings
[256][2] = {
48 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
49 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
50 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
51 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
52 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
53 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
54 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
55 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
56 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
57 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
58 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
59 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
60 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
61 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
62 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
63 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
64 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
65 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
66 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
67 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
68 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
69 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
70 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
71 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
72 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
73 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
74 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
75 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
76 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
77 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
78 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
79 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
83 free_translation_table(struct conv_table
*p
)
87 for (i
= 0; i
< 256; i
++)
89 free_translation_table(p
[i
].u
.tbl
);
94 static unsigned char *no_str
= "*";
97 new_translation_table(struct conv_table
*p
)
101 for (i
= 0; i
< 256; i
++)
103 free_translation_table(p
[i
].u
.tbl
);
104 for (i
= 0; i
< 128; i
++) {
106 p
[i
].u
.str
= strings
[i
];
108 for (; i
< 256; i
++) {
114 #define BIN_SEARCH(table, entry, entries, key, result) \
116 long _s = 0, _e = (entries) - 1; \
118 while (_s <= _e || !((result) = -1)) { \
119 long _m = (_s + _e) / 2; \
121 if ((table)[_m].entry == (key)) { \
125 if ((table)[_m].entry > (key)) _e = _m - 1; \
126 if ((table)[_m].entry < (key)) _s = _m + 1; \
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
137 #define SYSTEM_CHARSET_FLAG 128
140 u2cp_(unicode_val_T u
, int to
, int no_nbsp_hack
)
145 if (u
< 128) return strings
[u
];
147 to
&= ~SYSTEM_CHARSET_FLAG
;
150 if (codepages
[to
].table
== table_utf_8
)
151 return encode_utf_8(u
);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u
== 0xa0) return no_nbsp_hack
? " " : NBSP_CHAR_STRING
;
156 if (u
== 0xad) return "";
159 unicode_val_T strange
= strange_chars
[u
- 0x80];
161 if (!strange
) return NULL
;
162 return u2cp_(strange
, to
, no_nbsp_hack
);
166 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
167 if (codepages
[to
].table
[j
].u
== u
)
168 return strings
[codepages
[to
].table
[j
].c
];
170 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
171 if (s
!= -1) return unicode_7b
[s
].s
;
176 static unsigned char utf_buffer
[7];
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u
)
182 static unsigned char *
183 encode_utf_8(unicode_val_T u
)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer
, 0, 7);
191 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
192 utf_buffer
[1] = 0x80 | (u
& 0x3f);
193 else if (u
< 0x10000)
194 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
195 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
196 utf_buffer
[2] = 0x80 | (u
& 0x3f);
197 else if (u
< 0x200000)
198 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
199 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
200 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
201 utf_buffer
[3] = 0x80 | (u
& 0x3f);
202 else if (u
< 0x4000000)
203 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
204 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
205 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
206 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
207 utf_buffer
[4] = 0x80 | (u
& 0x3f);
208 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
209 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
210 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
211 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
212 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
213 utf_buffer
[5] = 0x80 | (u
& 0x3f);
219 /* Number of bytes in a UTF-8 character, indexed by its first byte. Illegal
220 * first bytes are given a length of one and are handled differently. */
221 static char utf8char_len_tab
[256] = {
222 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
223 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
224 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
225 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
226 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
227 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
228 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
229 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
232 inline int utf8charlen(const unsigned char *p
)
234 return p
? utf8char_len_tab
[*p
] : 0;
238 strlen_utf8(unsigned char **str
)
240 unsigned char *s
= *str
;
241 unsigned char *end
= strchr(s
, '\0');
245 for (x
= 0;; x
++, s
+= len
) {
246 len
= utf8charlen(s
);
247 if (s
+ len
> end
) break;
/* utf8_issingle(p): non-zero when byte @p has its high bit (0x80) clear.
 * utf8_islead(p): non-zero when @p is such a single byte or has both of its
 * top two bits set (0xc0); it is therefore zero exactly for bytes in the
 * range 0x80-0xbf. Note that utf8_islead() expands @p twice, so the argument
 * should not have side effects. */
253 #define utf8_issingle(p) (((p) & 0x80) == 0)
254 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
256 /* Start from @current and move back by @pos characters; the resulting
257 * pointer is returned. The leftmost pointer allowed is @start. */
258 inline unsigned char *
259 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
261 if (current
== NULL
|| start
== NULL
|| pos
< 0)
263 while (pos
> 0 && current
!= start
) {
265 if (utf8_islead(*current
))
271 /* Count number of standard terminal cells needed for displaying UTF-8
274 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
279 end
= strchr(utf8_char
, '\0');
281 if(!utf8_char
|| !end
)
284 u
= utf_8_to_unicode(&utf8_char
, end
);
286 return unicode_to_cell(u
);
289 /* Count number of standard terminal cells needed for displaying string
290 * with UTF-8 characters. */
292 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
294 int charlen
, cell
, cells
= 0;
297 end
= strchr(string
, '\0');
303 charlen
= utf8charlen(string
);
304 if (string
+ charlen
> end
)
307 cell
= utf8_char2cells(string
, end
);
318 /* Count number of characters in string. */
320 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
322 int charlen
, chars
= 0;
325 end
= strchr(string
, '\0');
331 charlen
= utf8charlen(string
);
332 if (string
+ charlen
> end
)
343 * Count number of bytes from the beginning of the string needed for displaying
344 * specified number of cells.
347 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
349 unsigned int bytes
= 0, cells
= 0;
351 assert(max_cells
>=0);
354 end
= strchr(string
, '\0');
360 int cell
= utf8_char2cells(&string
[bytes
], end
);
365 if (cells
> max_cells
)
368 bytes
+= utf8charlen(&string
[bytes
]);
370 if (string
+ bytes
> end
) {
371 bytes
= end
- string
;
380 * Find out number of standard terminal columns needed for displaying symbol
381 * (glyph) which represents Unicode character c.
382 * TODO: Use wcwidth when it is available.
384 * @return 2 for double-width glyph, 1 for others.
385 * TODO: May be extended to return 0 for zero-width glyphs
386 * (like composing, maybe unprintable too).
389 unicode_to_cell(unicode_val_T c
)
392 && (c
<= 0x115f /* Hangul Jamo */
395 || (c
>= 0x2e80 && c
<= 0xa4cf
396 && c
!= 0x303f) /* CJK ... Yi */
397 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
398 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
400 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
401 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
402 || (c
>= 0xffe0 && c
<= 0xffe6)
403 || (c
>= 0x20000 && c
<= 0x2fffd)
404 || (c
>= 0x30000 && c
<= 0x3fffd)))
410 /* Fold the case of a Unicode character, so that hotkeys in labels can
411 * be compared case-insensitively. This should be called only if
412 * check_kbd_label_key(c) is true. It is unspecified whether the
413 * result will be in upper or lower case. */
415 unicode_fold_label_case(unicode_val_T c
)
417 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
419 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
420 /* For now, this supports only ASCII. It would be possible to
421 * use code generated from CaseFolding.txt of Unicode if the
422 * acknowledgements required by http://www.unicode.org/copyright.html
423 * were added to associated documentation of ELinks. */
424 if (c
>= 0x41 && c
<= 0x5A)
428 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
432 utf_8_to_unicode(unsigned char **string
, unsigned char *end
)
434 unsigned char *str
= *string
;
438 length
= utf8char_len_tab
[str
[0]];
440 if (str
+ length
> end
) {
449 u
= (str
[0] & 0x1f) << 6;
450 u
+= (str
[1] & 0x3f);
453 u
= (str
[0] & 0x0f) << 12;
454 u
+= ((str
[1] & 0x3f) << 6);
455 u
+= (str
[2] & 0x3f);
458 u
= (str
[0] & 0x0f) << 18;
459 u
+= ((str
[1] & 0x3f) << 12);
460 u
+= ((str
[2] & 0x3f) << 6);
461 u
+= (str
[3] & 0x3f);
464 u
= (str
[0] & 0x0f) << 24;
465 u
+= ((str
[1] & 0x3f) << 18);
466 u
+= ((str
[2] & 0x3f) << 12);
467 u
+= ((str
[3] & 0x3f) << 6);
468 u
+= (str
[4] & 0x3f);
472 u
= (str
[0] & 0x01) << 30;
473 u
+= ((str
[1] & 0x3f) << 24);
474 u
+= ((str
[2] & 0x3f) << 18);
475 u
+= ((str
[3] & 0x3f) << 12);
476 u
+= ((str
[4] & 0x3f) << 6);
477 u
+= (str
[5] & 0x3f);
480 *string
= str
+ length
;
483 #endif /* CONFIG_UTF_8 */
485 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
487 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
491 for (j
= 0; from
->table
[j
].c
; j
++)
492 if (from
->table
[j
].c
== c
)
493 return from
->table
[j
].u
;
498 /* Slow algorithm, used for converting input from the terminal. */
500 cp2u(int from
, unsigned char c
)
502 from
&= ~SYSTEM_CHARSET_FLAG
;
504 /* UTF-8 is a multibyte codepage and cannot be handled with
506 assert(codepages
[from
].table
!= table_utf_8
);
507 if_assert_failed
return UCS_NO_CHAR
;
509 if (c
< 0x80) return c
;
510 else return cp2u_shared(&codepages
[from
], c
);
513 /* This slow and ugly code is used by the terminal utf_8_io */
515 cp2utf_8(int from
, int c
)
517 from
&= ~SYSTEM_CHARSET_FLAG
;
519 if (codepages
[from
].table
== table_utf_8
|| c
< 128)
522 return encode_utf_8(cp2u_shared(&codepages
[from
], c
));
527 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
529 if (is_cp_utf8(codepage
))
530 return utf_8_to_unicode(string
, end
);
535 unicode_val_T ret
= cp2u(codepage
, **string
);
541 #endif /* CONFIG_UTF_8 */
545 add_utf_8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
547 unsigned char *p
= encode_utf_8(u
);
550 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
552 struct conv_table
*nct
;
554 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
555 if_assert_failed
return;
557 nct
= mem_calloc(256, sizeof(*nct
));
559 new_translation_table(nct
);
567 assertm(!ct
[*p
].t
, "bad utf encoding #2");
568 if_assert_failed
return;
570 if (ct
[*p
].u
.str
== no_str
)
574 struct conv_table utf_table
[256];
575 int utf_table_init
= 1;
582 for (i
= 128; i
< 256; i
++)
583 mem_free(utf_table
[i
].u
.str
);
586 static struct conv_table
*
587 get_translation_table_to_utf_8(int from
)
592 if (from
== -1) return NULL
;
593 from
&= ~SYSTEM_CHARSET_FLAG
;
594 if (from
== lfr
) return utf_table
;
597 memset(utf_table
, 0, sizeof(utf_table
)),
602 for (i
= 0; i
< 128; i
++)
603 utf_table
[i
].u
.str
= strings
[i
];
605 if (codepages
[from
].table
== table_utf_8
) {
606 for (i
= 128; i
< 256; i
++)
607 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
611 for (i
= 128; i
< 256; i
++)
612 utf_table
[i
].u
.str
= NULL
;
614 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
615 unicode_val_T u
= codepages
[from
].table
[i
].u
;
617 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
618 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
619 stracpy(encode_utf_8(u
));
622 for (i
= 128; i
< 256; i
++)
623 if (!utf_table
[i
].u
.str
)
624 utf_table
[i
].u
.str
= stracpy(no_str
);
629 struct conv_table table
[256];
630 static int first
= 1;
633 free_conv_table(void)
635 if (!utf_table_init
) free_utf_table();
637 memset(table
, 0, sizeof(table
));
640 new_translation_table(table
);
645 get_translation_table(int from
, int to
)
650 from
&= ~SYSTEM_CHARSET_FLAG
;
651 to
&= ~SYSTEM_CHARSET_FLAG
;
653 memset(table
, 0, sizeof(table
));
656 if (/*from == to ||*/ from
== -1 || to
== -1)
658 if (codepages
[to
].table
== table_utf_8
)
659 return get_translation_table_to_utf_8(from
);
660 if (from
== lfr
&& to
== lto
)
664 new_translation_table(table
);
666 if (codepages
[from
].table
== table_utf_8
) {
669 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
670 add_utf_8(table
, codepages
[to
].table
[i
].u
,
671 strings
[codepages
[to
].table
[i
].c
]);
673 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
674 if (unicode_7b
[i
].x
>= 0x80)
675 add_utf_8(table
, unicode_7b
[i
].x
,
681 for (i
= 128; i
< 256; i
++) {
684 for (j
= 0; codepages
[from
].table
[j
].c
; j
++) {
685 if (codepages
[from
].table
[j
].c
== i
) {
688 u
= u2cp(codepages
[from
].table
[j
].u
, to
);
689 if (u
) table
[i
].u
.str
= u
;
700 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
703 if (*s1
> *s2
) return 1;
704 if (*s1
< *s2
) return -1;
713 /* Entity cache debugging purpose. */
715 #define DEBUG_ENTITY_CACHE
717 #undef DEBUG_ENTITY_CACHE
720 struct entity_cache
{
724 unsigned char *result
;
725 unsigned char str
[20]; /* Suffice in any case. */
729 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
731 if (a
->hits
== b
->hits
) return 0;
732 if (a
->hits
> b
->hits
) return -1;
737 compare_entities(const void *key_
, const void *element_
)
739 struct string
*key
= (struct string
*) key_
;
740 struct entity
*element
= (struct entity
*) element_
;
741 int length
= key
->length
;
742 unsigned char *first
= key
->source
;
743 unsigned char *second
= element
->s
;
745 return xxstrcmp(first
, second
, length
);
749 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
751 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
752 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
753 will go in [0] table */
754 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
755 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
756 static int first_time
= 1;
757 unsigned int slen
= 0;
758 unsigned char *result
= NULL
;
760 if (strlen
<= 0) return NULL
;
763 /* TODO: caching UTF-8 */
764 encoding
&= ~SYSTEM_CHARSET_FLAG
;
765 if (codepages
[encoding
].table
== table_utf_8
)
767 #endif /* CONFIG_UTF_8 */
770 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
774 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
775 * + google + slashdot + websites that result from a search for test on google,
776 * + various ones) shows quite an impressive improvement:
778 * 0: hits=2459 l=4 st='nbsp'
779 * 1: hits=2152 l=6 st='eacute'
780 * 2: hits=235 l=6 st='egrave'
781 * 3: hits=136 l=6 st='agrave'
782 * 4: hits=100 l=3 st='amp'
783 * 5: hits=40 l=5 st='laquo'
784 * 6: hits=8 l=4 st='copy'
785 * 7: hits=5 l=2 st='gt'
786 * 8: hits=2 l=2 st='lt'
787 * 9: hits=1 l=6 st='middot'
789 * Most of the time cache hit ratio is near 95%.
791 * A long test shows: 15186 hits vs. 24 misses and mean iteration
792 * count is kept < 2 (worst case 1.58). Not so bad ;)
796 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
797 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
799 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
802 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
803 if (entity_cache
[slen
][i
].encoding
== encoding
804 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
805 #ifdef DEBUG_ENTITY_CACHE
806 static double total_iter
= 0;
807 static unsigned long hit_count
= 0;
811 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
813 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
814 entity_cache
[slen
][i
].hits
++;
815 return entity_cache
[slen
][i
].result
;
818 #ifdef DEBUG_ENTITY_CACHE
819 fprintf(stderr
, "miss\n");
824 #endif /* CONFIG_UTF_8 */
825 if (*str
== '#') { /* Numeric entity. */
826 int l
= (int) strlen
;
827 unsigned char *st
= (unsigned char *) str
;
830 if (l
== 1) goto end
; /* &#; ? */
832 if ((*st
| 32) == 'x') { /* Hexadecimal */
834 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
837 unsigned char c
= (*(st
++) | 32);
840 n
= (n
<< 4) | (c
- '0');
841 else if (isxdigit(c
))
842 n
= (n
<< 4) | (c
- 'a' + 10);
844 goto end
; /* Bad char. */
846 } else { /* Decimal */
847 if (l
> 10) goto end
; /* 4294967295 max. */
849 unsigned char c
= *(st
++);
852 n
= n
* 10 + c
- '0';
854 goto end
; /* Bad char. */
855 /* Limit to 0xFFFFFFFF. */
856 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
861 result
= u2cp(n
, encoding
);
863 #ifdef DEBUG_ENTITY_CACHE
864 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
866 } else { /* Text entity. */
867 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
868 struct entity
*element
= bsearch((void *) &key
, entities
,
873 if (element
) result
= u2cp(element
->c
, encoding
);
877 if (codepages
[encoding
].table
== table_utf_8
) {
880 #endif /* CONFIG_UTF_8 */
882 /* Take care of potential buffer overflow. */
883 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
884 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
886 /* Copy new entry to cache. */
888 ece
->strlen
= strlen
;
889 ece
->encoding
= encoding
;
890 ece
->result
= result
;
891 memcpy(ece
->str
, str
, strlen
);
892 ece
->str
[strlen
] = '\0';
894 /* Increment number of cache entries if possible. */
895 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
897 #ifdef DEBUG_ENTITY_CACHE
898 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
899 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
903 /* Sort entries by hit order. */
904 if (nb_entity_cache
[slen
] > 1)
905 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
906 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
908 #ifdef DEBUG_ENTITY_CACHE
912 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
913 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
914 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
915 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
916 entity_cache
[slen
][i
].str
);
917 fprintf(stderr
, "-----------------\n");
925 convert_string(struct conv_table
*convert_table
,
926 unsigned char *chars
, int charslen
, int cp
,
927 enum convert_string_mode mode
, int *length
,
928 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
931 unsigned char *buffer
;
935 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
937 if (charslen
) callback(callback_data
, chars
, charslen
);
940 return memacpy(chars
, charslen
);
944 /* Buffer allocation */
946 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
947 if (!buffer
) return NULL
;
951 while (charspos
< charslen
) {
952 unsigned char *translit
;
955 buffer[bufferpos++] = chars[charspos++]; \
960 if (chars
[charspos
] != '&') {
961 struct conv_table
*t
;
964 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
969 while (t
[chars
[i
]].t
) {
970 t
= t
[chars
[i
++]].u
.tbl
;
971 if (i
>= charslen
) PUTC
;
974 translit
= t
[chars
[i
]].u
.str
;
977 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
981 int start
= charspos
+ 1;
985 && (isasciialpha(chars
[i
])
987 || (chars
[i
] == '#')))
990 /* This prevents bug 213: we were expanding "entities"
991 * in URL query strings. */
992 /* XXX: But this disables &nbsp&nbsp usage, which
993 * appears to be relatively common! --pasky */
994 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
996 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
997 translit
= get_entity_string(&chars
[start
], i
- start
,
999 if (chars
[i
] != ';') {
1000 /* Eat &nbsp &nbsp<foo> happily, but
1001 * pull back from the character after
1002 * entity string if it is not the valid
1007 if (!translit
) PUTC
;
1008 charspos
= i
+ (i
< charslen
);
1012 if (!translit
[0]) continue;
1015 buffer
[bufferpos
++] = translit
[0];
1023 buffer
[bufferpos
++] = *(translit
++);
1025 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1028 buffer
[bufferpos
] = 0;
1029 callback(callback_data
, buffer
, bufferpos
);
1032 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1045 buffer
[bufferpos
] = 0;
1046 if (length
) *length
= bufferpos
;
1049 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1058 #ifndef USE_FASTFIND
1060 get_cp_index(unsigned char *name
)
1065 if (!strcasecmp(name
, "System")) {
1066 #if HAVE_LANGINFO_CODESET
1067 name
= nl_langinfo(CODESET
);
1068 syscp
= SYSTEM_CHARSET_FLAG
;
1074 for (i
= 0; codepages
[i
].name
; i
++) {
1075 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1076 /* In the past, we looked for the longest substring
1077 * in all the names; it is way too expensive, though:
1079 * % cumulative self self total
1080 * time seconds seconds calls us/call us/call name
1081 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1083 * Anything called from redraw_screen() is in fact
1084 * relatively expensive, even if it's called just
1085 * once. So we will do a simple strcasecmp() here.
1088 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1094 return get_cp_index("us-ascii") | syscp
;
1102 static unsigned int i_name
= 0;
1103 static unsigned int i_alias
= 0;
1105 /* Reset internal list pointer */
1107 charsets_list_reset(void)
1113 /* Returns a pointer to a struct that contains current key and data pointers
1114 * and increments the internal pointer. It returns NULL when key is NULL. */
1115 struct fastfind_key_value
*
1116 charsets_list_next(void)
1118 static struct fastfind_key_value kv
;
1120 if (!codepages
[i_name
].name
) return NULL
;
1122 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1123 kv
.data
= &codepages
[i_name
];
1125 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1135 static struct fastfind_index ff_charsets_index
1136 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1138 /* It searches for a charset named @name or one of its aliases and
1139 * returns index for it or -1 if not found. */
1141 get_cp_index(unsigned char *name
)
1143 struct codepage_desc
*codepage
;
1146 if (!strcasecmp(name
, "System")) {
1147 #if HAVE_LANGINFO_CODESET
1148 name
= nl_langinfo(CODESET
);
1149 syscp
= SYSTEM_CHARSET_FLAG
;
1155 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1157 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1158 return (codepage
- codepages
) | syscp
;
1161 return get_cp_index("us-ascii") | syscp
;
1168 #endif /* USE_FASTFIND */
1171 init_charsets_lookup(void)
1174 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1179 free_charsets_lookup(void)
1182 fastfind_done(&ff_charsets_index
);
1187 get_cp_name(int cp_index
)
1189 if (cp_index
< 0) return "none";
1190 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1192 return codepages
[cp_index
].name
;
1196 get_cp_mime_name(int cp_index
)
1198 if (cp_index
< 0) return "none";
1199 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1200 if (!codepages
[cp_index
].aliases
) return NULL
;
1202 return codepages
[cp_index
].aliases
[0];
1206 is_cp_utf8(int cp_index
)
1208 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1209 return codepages
[cp_index
].table
== table_utf_8
;