1 /* Charsets convertor */
7 #if HAVE_LANGINFO_CODESET
19 #include "document/options.h"
20 #include "intl/charsets.h"
21 #include "util/conv.h"
22 #include "util/error.h"
23 #include "util/fastfind.h"
24 #include "util/memory.h"
25 #include "util/string.h"
28 /* Fix namespace clash on MacOS. */
29 #define table table_elinks
36 struct codepage_desc
{
38 unsigned char **aliases
;
39 struct table_entry
*table
;
42 #include "intl/codepage.inc"
43 #include "intl/uni_7b.inc"
44 #include "intl/entity.inc"
47 static char strings
[256][2] = {
48 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
49 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
50 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
51 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
52 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
53 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
54 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
55 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
56 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
57 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
58 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
59 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
60 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
61 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
62 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
63 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
64 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
65 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
66 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
67 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
68 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
69 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
70 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
71 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
72 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
73 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
74 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
75 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
76 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
77 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
78 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
79 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
83 free_translation_table(struct conv_table
*p
)
87 for (i
= 0; i
< 256; i
++)
89 free_translation_table(p
[i
].u
.tbl
);
94 static unsigned char *no_str
= "*";
97 new_translation_table(struct conv_table
*p
)
101 for (i
= 0; i
< 256; i
++)
103 free_translation_table(p
[i
].u
.tbl
);
104 for (i
= 0; i
< 128; i
++) {
106 p
[i
].u
.str
= strings
[i
];
108 for (; i
< 256; i
++) {
114 #define BIN_SEARCH(table, entry, entries, key, result) \
116 long _s = 0, _e = (entries) - 1; \
118 while (_s <= _e || !((result) = -1)) { \
119 long _m = (_s + _e) / 2; \
121 if ((table)[_m].entry == (key)) { \
125 if ((table)[_m].entry > (key)) _e = _m - 1; \
126 if ((table)[_m].entry < (key)) _s = _m + 1; \
130 static const unicode_val_T strange_chars[32] = {
131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
137 #define SYSTEM_CHARSET_FLAG 128
140 u2cp_(unicode_val_T u
, int to
, int no_nbsp_hack
)
145 if (u
< 128) return strings
[u
];
147 to
&= ~SYSTEM_CHARSET_FLAG
;
150 if (codepages
[to
].table
== table_utf_8
)
151 return encode_utf_8(u
);
152 #endif /* CONFIG_UTF_8 */
154 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
155 if (u
== 0xa0) return no_nbsp_hack
? " " : NBSP_CHAR_STRING
;
156 if (u
== 0xad) return "";
159 unicode_val_T strange
= strange_chars
[u
- 0x80];
161 if (!strange
) return NULL
;
162 return u2cp_(strange
, to
, no_nbsp_hack
);
166 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
167 if (codepages
[to
].table
[j
].u
== u
)
168 return strings
[codepages
[to
].table
[j
].c
];
170 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
171 if (s
!= -1) return unicode_7b
[s
].s
;
176 static unsigned char utf_buffer
[7];
179 inline unsigned char *
180 encode_utf_8(unicode_val_T u
)
182 static unsigned char *
183 encode_utf_8(unicode_val_T u
)
184 #endif /* CONFIG_UTF_8 */
186 memset(utf_buffer
, 0, 7);
191 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
192 utf_buffer
[1] = 0x80 | (u
& 0x3f);
193 else if (u
< 0x10000)
194 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
195 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
196 utf_buffer
[2] = 0x80 | (u
& 0x3f);
197 else if (u
< 0x200000)
198 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
199 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
200 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
201 utf_buffer
[3] = 0x80 | (u
& 0x3f);
202 else if (u
< 0x4000000)
203 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
204 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
205 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
206 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
207 utf_buffer
[4] = 0x80 | (u
& 0x3f);
208 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
209 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
210 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
211 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
212 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
213 utf_buffer
[5] = 0x80 | (u
& 0x3f);
219 /* Number of bytes in a UTF-8 character, indexed by its first byte. Illegal
220 * first bytes are given a length of one and are handled differently. */
221 static char utf8char_len_tab
[256] = {
222 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
223 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
224 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
225 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
226 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
227 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
228 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
229 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
232 inline int utf8charlen(const unsigned char *p
)
234 return p
? utf8char_len_tab
[*p
] : 0;
238 strlen_utf8(unsigned char **str
)
240 unsigned char *s
= *str
;
241 unsigned char *end
= strchr(s
, '\0');
245 for (x
= 0;; x
++, s
+= len
) {
246 len
= utf8charlen(s
);
247 if (s
+ len
> end
) break;
/* True when byte @p is a complete one-byte character, i.e. bit 7 is clear. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True when byte @p can begin a character: it is either a single-byte
 * character or has its two top bits set. That is the same as "not a 10xxxxxx
 * byte", which is tested directly so that @p is expanded only once and
 * arguments with side effects stay safe. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
256 /* Start from @current and move back by @pos characters; the resulting
257 * pointer is returned. The leftmost pointer allowed is @start. */
258 inline unsigned char *
259 utf8_prevchar(unsigned char *current
, int pos
, unsigned char *start
)
261 if (current
== NULL
|| start
== NULL
|| pos
< 0)
263 while (pos
> 0 && current
!= start
) {
265 if (utf8_islead(*current
))
271 /* Count number of standard terminal cells needed for displaying UTF-8
274 utf8_char2cells(unsigned char *utf8_char
, unsigned char *end
)
279 end
= strchr(utf8_char
, '\0');
281 if(!utf8_char
|| !end
)
284 u
= utf_8_to_unicode(&utf8_char
, end
);
286 return unicode_to_cell(u
);
289 /* Count number of standard terminal cells needed for displaying string
290 * with UTF-8 characters. */
292 utf8_ptr2cells(unsigned char *string
, unsigned char *end
)
294 int charlen
, cell
, cells
= 0;
297 end
= strchr(string
, '\0');
303 charlen
= utf8charlen(string
);
304 if (string
+ charlen
> end
)
307 cell
= utf8_char2cells(string
, end
);
318 /* Count number of characters in string. */
320 utf8_ptr2chars(unsigned char *string
, unsigned char *end
)
322 int charlen
, chars
= 0;
325 end
= strchr(string
, '\0');
331 charlen
= utf8charlen(string
);
332 if (string
+ charlen
> end
)
343 * Count number of bytes from the beginning of the string needed for displaying
344 * specified number of cells.
347 utf8_cells2bytes(unsigned char *string
, int max_cells
, unsigned char *end
)
349 unsigned int bytes
= 0, cells
= 0;
351 assert(max_cells
>=0);
354 end
= strchr(string
, '\0');
360 int cell
= utf8_char2cells(&string
[bytes
], end
);
365 if (cells
> max_cells
)
368 bytes
+= utf8charlen(&string
[bytes
]);
370 if (string
+ bytes
> end
) {
371 bytes
= end
- string
;
380 * Find out number of standard terminal columns needed for displaying symbol
381 * (glyph) which represents Unicode character c.
382 * TODO: Use wcwidth when it is available.
384 * @return 2 for double-width glyph, 1 for others.
385 * TODO: May be extended to return 0 for zero-width glyphs
386 * (like composing, maybe unprintable too).
389 unicode_to_cell(unicode_val_T c
)
392 && (c
<= 0x115f /* Hangul Jamo */
395 || (c
>= 0x2e80 && c
<= 0xa4cf
396 && c
!= 0x303f) /* CJK ... Yi */
397 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
398 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility
400 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
401 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
402 || (c
>= 0xffe0 && c
<= 0xffe6)
403 || (c
>= 0x20000 && c
<= 0x2fffd)
404 || (c
>= 0x30000 && c
<= 0x3fffd)))
410 /* Fold the case of a Unicode character, so that hotkeys in labels can
411 * be compared case-insensitively. It is unspecified whether the
412 * result will be in upper or lower case. */
414 unicode_fold_label_case(unicode_val_T c
)
416 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
418 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
419 /* For now, this supports only ASCII. It would be possible to
420 * use code generated from CaseFolding.txt of Unicode if the
421 * acknowledgements required by http://www.unicode.org/copyright.html
422 * were added to associated documentation of ELinks. */
423 if (c
>= 0x41 && c
<= 0x5A)
427 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
431 utf_8_to_unicode(unsigned char **string
, unsigned char *end
)
433 unsigned char *str
= *string
;
437 length
= utf8char_len_tab
[str
[0]];
439 if (str
+ length
> end
) {
448 u
= (str
[0] & 0x1f) << 6;
449 u
+= (str
[1] & 0x3f);
452 u
= (str
[0] & 0x0f) << 12;
453 u
+= ((str
[1] & 0x3f) << 6);
454 u
+= (str
[2] & 0x3f);
457 u
= (str
[0] & 0x0f) << 18;
458 u
+= ((str
[1] & 0x3f) << 12);
459 u
+= ((str
[2] & 0x3f) << 6);
460 u
+= (str
[3] & 0x3f);
463 u
= (str
[0] & 0x0f) << 24;
464 u
+= ((str
[1] & 0x3f) << 18);
465 u
+= ((str
[2] & 0x3f) << 12);
466 u
+= ((str
[3] & 0x3f) << 6);
467 u
+= (str
[4] & 0x3f);
471 u
= (str
[0] & 0x01) << 30;
472 u
+= ((str
[1] & 0x3f) << 24);
473 u
+= ((str
[2] & 0x3f) << 18);
474 u
+= ((str
[3] & 0x3f) << 12);
475 u
+= ((str
[4] & 0x3f) << 6);
476 u
+= (str
[5] & 0x3f);
479 *string
= str
+ length
;
482 #endif /* CONFIG_UTF_8 */
484 /* Slow algorithm, the common part of cp2u and cp2utf_8. */
486 cp2u_shared(const struct codepage_desc
*from
, unsigned char c
)
490 for (j
= 0; from
->table
[j
].c
; j
++)
491 if (from
->table
[j
].c
== c
)
492 return from
->table
[j
].u
;
494 return UCS_REPLACEMENT_CHARACTER
;
497 /* Slow algorithm, used for converting input from the terminal. */
499 cp2u(int from
, unsigned char c
)
501 from
&= ~SYSTEM_CHARSET_FLAG
;
503 /* UTF-8 is a multibyte codepage and cannot be handled with
505 assert(codepages
[from
].table
!= table_utf_8
);
506 if_assert_failed
return UCS_REPLACEMENT_CHARACTER
;
508 if (c
< 0x80) return c
;
509 else return cp2u_shared(&codepages
[from
], c
);
512 /* This slow and ugly code is used by the terminal utf_8_io */
514 cp2utf_8(int from
, int c
)
516 from
&= ~SYSTEM_CHARSET_FLAG
;
518 if (codepages
[from
].table
== table_utf_8
|| c
< 128)
521 return encode_utf_8(cp2u_shared(&codepages
[from
], c
));
526 cp_to_unicode(int codepage
, unsigned char **string
, unsigned char *end
)
530 if (is_cp_utf8(codepage
))
531 return utf_8_to_unicode(string
, end
);
536 ret
= cp2u(codepage
, **string
);
540 #endif /* CONFIG_UTF_8 */
544 add_utf_8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
546 unsigned char *p
= encode_utf_8(u
);
549 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
551 struct conv_table
*nct
;
553 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
554 if_assert_failed
return;
556 nct
= mem_calloc(256, sizeof(*nct
));
558 new_translation_table(nct
);
566 assertm(!ct
[*p
].t
, "bad utf encoding #2");
567 if_assert_failed
return;
569 if (ct
[*p
].u
.str
== no_str
)
573 struct conv_table utf_table
[256];
574 int utf_table_init
= 1;
581 for (i
= 128; i
< 256; i
++)
582 mem_free(utf_table
[i
].u
.str
);
585 static struct conv_table
*
586 get_translation_table_to_utf_8(int from
)
591 if (from
== -1) return NULL
;
592 from
&= ~SYSTEM_CHARSET_FLAG
;
593 if (from
== lfr
) return utf_table
;
596 memset(utf_table
, 0, sizeof(utf_table
)),
601 for (i
= 0; i
< 128; i
++)
602 utf_table
[i
].u
.str
= strings
[i
];
604 if (codepages
[from
].table
== table_utf_8
) {
605 for (i
= 128; i
< 256; i
++)
606 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
610 for (i
= 128; i
< 256; i
++)
611 utf_table
[i
].u
.str
= NULL
;
613 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
614 unicode_val_T u
= codepages
[from
].table
[i
].u
;
616 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
617 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
618 stracpy(encode_utf_8(u
));
621 for (i
= 128; i
< 256; i
++)
622 if (!utf_table
[i
].u
.str
)
623 utf_table
[i
].u
.str
= stracpy(no_str
);
628 struct conv_table table
[256];
629 static int first
= 1;
632 free_conv_table(void)
634 if (!utf_table_init
) free_utf_table();
636 memset(table
, 0, sizeof(table
));
639 new_translation_table(table
);
644 get_translation_table(int from
, int to
)
649 from
&= ~SYSTEM_CHARSET_FLAG
;
650 to
&= ~SYSTEM_CHARSET_FLAG
;
652 memset(table
, 0, sizeof(table
));
655 if (/*from == to ||*/ from
== -1 || to
== -1)
657 if (codepages
[to
].table
== table_utf_8
)
658 return get_translation_table_to_utf_8(from
);
659 if (from
== lfr
&& to
== lto
)
663 new_translation_table(table
);
665 if (codepages
[from
].table
== table_utf_8
) {
668 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
669 add_utf_8(table
, codepages
[to
].table
[i
].u
,
670 strings
[codepages
[to
].table
[i
].c
]);
672 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
673 if (unicode_7b
[i
].x
>= 0x80)
674 add_utf_8(table
, unicode_7b
[i
].x
,
680 for (i
= 128; i
< 256; i
++) {
683 for (j
= 0; codepages
[from
].table
[j
].c
; j
++) {
684 if (codepages
[from
].table
[j
].c
== i
) {
687 u
= u2cp(codepages
[from
].table
[j
].u
, to
);
688 if (u
) table
[i
].u
.str
= u
;
699 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
702 if (*s1
> *s2
) return 1;
703 if (*s1
< *s2
) return -1;
712 /* Entity cache debugging purpose. */
714 #define DEBUG_ENTITY_CACHE
716 #undef DEBUG_ENTITY_CACHE
719 struct entity_cache
{
723 unsigned char *result
;
724 unsigned char str
[20]; /* Suffice in any case. */
728 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
730 if (a
->hits
== b
->hits
) return 0;
731 if (a
->hits
> b
->hits
) return -1;
736 compare_entities(const void *key_
, const void *element_
)
738 struct string
*key
= (struct string
*) key_
;
739 struct entity
*element
= (struct entity
*) element_
;
740 int length
= key
->length
;
741 unsigned char *first
= key
->source
;
742 unsigned char *second
= element
->s
;
744 return xxstrcmp(first
, second
, length
);
748 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
750 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
751 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
752 will go in [0] table */
753 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
754 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
755 static int first_time
= 1;
756 unsigned int slen
= 0;
757 unsigned char *result
= NULL
;
759 if (strlen
<= 0) return NULL
;
762 /* TODO: caching UTF-8 */
763 encoding
&= ~SYSTEM_CHARSET_FLAG
;
764 if (codepages
[encoding
].table
== table_utf_8
)
766 #endif /* CONFIG_UTF_8 */
769 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
773 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
774 * + google + slashdot + websites that result from a search for test on google,
775 * + various ones) shows quite an impressive improvement:
777 * 0: hits=2459 l=4 st='nbsp'
778 * 1: hits=2152 l=6 st='eacute'
779 * 2: hits=235 l=6 st='egrave'
780 * 3: hits=136 l=6 st='agrave'
781 * 4: hits=100 l=3 st='amp'
782 * 5: hits=40 l=5 st='laquo'
783 * 6: hits=8 l=4 st='copy'
784 * 7: hits=5 l=2 st='gt'
785 * 8: hits=2 l=2 st='lt'
786 * 9: hits=1 l=6 st='middot'
788 * Most of the time cache hit ratio is near 95%.
790 * A long test shows: 15186 hits vs. 24 misses and mean iteration
791 * count is kept < 2 (worst case 1.58). Not so bad ;)
795 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
796 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
798 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
801 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
802 if (entity_cache
[slen
][i
].encoding
== encoding
803 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
804 #ifdef DEBUG_ENTITY_CACHE
805 static double total_iter
= 0;
806 static unsigned long hit_count
= 0;
810 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
812 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
813 entity_cache
[slen
][i
].hits
++;
814 return entity_cache
[slen
][i
].result
;
817 #ifdef DEBUG_ENTITY_CACHE
818 fprintf(stderr
, "miss\n");
823 #endif /* CONFIG_UTF_8 */
824 if (*str
== '#') { /* Numeric entity. */
825 int l
= (int) strlen
;
826 unsigned char *st
= (unsigned char *) str
;
829 if (l
== 1) goto end
; /* &#; ? */
831 if ((*st
| 32) == 'x') { /* Hexadecimal */
833 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
836 unsigned char c
= (*(st
++) | 32);
839 n
= (n
<< 4) | (c
- '0');
840 else if (isxdigit(c
))
841 n
= (n
<< 4) | (c
- 'a' + 10);
843 goto end
; /* Bad char. */
845 } else { /* Decimal */
846 if (l
> 10) goto end
; /* 4294967295 max. */
848 unsigned char c
= *(st
++);
851 n
= n
* 10 + c
- '0';
853 goto end
; /* Bad char. */
854 /* Limit to 0xFFFFFFFF. */
855 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
860 result
= u2cp(n
, encoding
);
862 #ifdef DEBUG_ENTITY_CACHE
863 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
865 } else { /* Text entity. */
866 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
867 struct entity
*element
= bsearch((void *) &key
, entities
,
872 if (element
) result
= u2cp(element
->c
, encoding
);
876 if (codepages
[encoding
].table
== table_utf_8
) {
879 #endif /* CONFIG_UTF_8 */
881 /* Take care of potential buffer overflow. */
882 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
883 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
885 /* Copy new entry to cache. */
887 ece
->strlen
= strlen
;
888 ece
->encoding
= encoding
;
889 ece
->result
= result
;
890 memcpy(ece
->str
, str
, strlen
);
891 ece
->str
[strlen
] = '\0';
893 /* Increment number of cache entries if possible. */
894 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
896 #ifdef DEBUG_ENTITY_CACHE
897 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
898 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
902 /* Sort entries by hit order. */
903 if (nb_entity_cache
[slen
] > 1)
904 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
905 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
907 #ifdef DEBUG_ENTITY_CACHE
911 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
912 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
913 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
914 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
915 entity_cache
[slen
][i
].str
);
916 fprintf(stderr
, "-----------------\n");
924 convert_string(struct conv_table
*convert_table
,
925 unsigned char *chars
, int charslen
, int cp
,
926 enum convert_string_mode mode
, int *length
,
927 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
930 unsigned char *buffer
;
934 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
936 if (charslen
) callback(callback_data
, chars
, charslen
);
939 return memacpy(chars
, charslen
);
943 /* Buffer allocation */
945 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
946 if (!buffer
) return NULL
;
950 while (charspos
< charslen
) {
951 unsigned char *translit
;
954 buffer[bufferpos++] = chars[charspos++]; \
959 if (chars
[charspos
] != '&') {
960 struct conv_table
*t
;
963 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
968 while (t
[chars
[i
]].t
) {
969 t
= t
[chars
[i
++]].u
.tbl
;
970 if (i
>= charslen
) PUTC
;
973 translit
= t
[chars
[i
]].u
.str
;
976 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
980 int start
= charspos
+ 1;
984 && (isasciialpha(chars
[i
])
986 || (chars
[i
] == '#')))
989 /* This prevents bug 213: we were expanding "entities"
990 * in URL query strings. */
991 /* XXX: But this disables &nbsp&nbsp usage, which
992 * appears to be relatively common! --pasky */
993 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
995 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
996 translit
= get_entity_string(&chars
[start
], i
- start
,
998 if (chars
[i
] != ';') {
999 /* Eat &nbsp &nbsp<foo> happily, but
1000 * pull back from the character after
1001 * entity string if it is not the valid
1006 if (!translit
) PUTC
;
1007 charspos
= i
+ (i
< charslen
);
1011 if (!translit
[0]) continue;
1014 buffer
[bufferpos
++] = translit
[0];
1022 buffer
[bufferpos
++] = *(translit
++);
1024 if (bufferpos
& (ALLOC_GR
- 1)) continue;
1027 buffer
[bufferpos
] = 0;
1028 callback(callback_data
, buffer
, bufferpos
);
1031 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
1044 buffer
[bufferpos
] = 0;
1045 if (length
) *length
= bufferpos
;
1048 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
1057 #ifndef USE_FASTFIND
1059 get_cp_index(unsigned char *name
)
1064 if (!strcasecmp(name
, "System")) {
1065 #if HAVE_LANGINFO_CODESET
1066 name
= nl_langinfo(CODESET
);
1067 syscp
= SYSTEM_CHARSET_FLAG
;
1073 for (i
= 0; codepages
[i
].name
; i
++) {
1074 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
1075 /* In the past, we looked for the longest substring
1076 * in all the names; it is way too expensive, though:
1078 * % cumulative self self total
1079 * time seconds seconds calls us/call us/call name
1080 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1082 * Anything called from redraw_screen() is in fact
1083 * relatively expensive, even if it's called just
1084 * once. So we will do a simple strcasecmp() here.
1087 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
1093 return get_cp_index("us-ascii") | syscp
;
1101 static unsigned int i_name
= 0;
1102 static unsigned int i_alias
= 0;
1104 /* Reset internal list pointer */
1106 charsets_list_reset(void)
1112 /* Returns a pointer to a struct that contains the current key and data pointers
1113 * and increments the internal pointer. It returns NULL when key is NULL. */
1114 struct fastfind_key_value
*
1115 charsets_list_next(void)
1117 static struct fastfind_key_value kv
;
1119 if (!codepages
[i_name
].name
) return NULL
;
1121 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
1122 kv
.data
= &codepages
[i_name
];
1124 if (codepages
[i_name
].aliases
[i_alias
+ 1])
1134 static struct fastfind_index ff_charsets_index
1135 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
1137 /* It searches for a charset named @name or one of its aliases and
1138 * returns index for it or -1 if not found. */
1140 get_cp_index(unsigned char *name
)
1142 struct codepage_desc
*codepage
;
1145 if (!strcasecmp(name
, "System")) {
1146 #if HAVE_LANGINFO_CODESET
1147 name
= nl_langinfo(CODESET
);
1148 syscp
= SYSTEM_CHARSET_FLAG
;
1154 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
1156 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
1157 return (codepage
- codepages
) | syscp
;
1160 return get_cp_index("us-ascii") | syscp
;
1167 #endif /* USE_FASTFIND */
1170 init_charsets_lookup(void)
1173 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
1178 free_charsets_lookup(void)
1181 fastfind_done(&ff_charsets_index
);
1186 get_cp_name(int cp_index
)
1188 if (cp_index
< 0) return "none";
1189 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1191 return codepages
[cp_index
].name
;
1195 get_cp_mime_name(int cp_index
)
1197 if (cp_index
< 0) return "none";
1198 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
1199 if (!codepages
[cp_index
].aliases
) return NULL
;
1201 return codepages
[cp_index
].aliases
[0];
1205 is_cp_utf8(int cp_index
)
1207 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
1208 return codepages
[cp_index
].table
== table_utf_8
;