1 /* Charsets convertor */
7 #if HAVE_LANGINFO_CODESET
16 #include "document/options.h"
17 #include "intl/charsets.h"
18 #include "util/conv.h"
19 #include "util/error.h"
20 #include "util/fastfind.h"
21 #include "util/memory.h"
22 #include "util/string.h"
25 /* Fix namespace clash on MacOS. */
26 #define table table_elinks
33 struct codepage_desc
{
35 unsigned char **aliases
;
36 struct table_entry
*table
;
39 #include "intl/codepage.inc"
40 #include "intl/uni_7b.inc"
41 #include "intl/entity.inc"
/* One-character strings indexed by byte value: strings[c] is a
 * NUL-terminated string holding the single byte stored for c (entry 0 is
 * therefore the empty string). u2cp_() returns strings[u] for u < 128 and
 * new_translation_table() points the first 128 table entries at it.
 * NOTE(review): every entry is the identity except 0x17 and 0x1f, which
 * both hold "\033" instead of "\027" / "\037" - confirm whether that
 * substitution is intentional before treating the table as an identity
 * map. */
44 static char strings
[256][2] = {
45 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
46 "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
47 "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
48 "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
49 "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
50 "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
51 "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
52 "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
53 "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
54 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
55 "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
56 "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
57 "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
58 "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
59 "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
60 "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
61 "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
62 "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
63 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
64 "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
65 "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
66 "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
67 "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
68 "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
69 "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
70 "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
71 "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
72 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
73 "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
74 "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
75 "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
76 "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
80 free_translation_table(struct conv_table
*p
)
84 for (i
= 0; i
< 256; i
++)
86 free_translation_table(p
[i
].u
.tbl
);
/* Placeholder string ("*"). add_utf_8() compares table entries against
 * this pointer, and get_translation_table_to_utf_8() stores a copy of it
 * in every entry 128..255 that is still unset after the codepage table
 * has been walked. */
91 static unsigned char *no_str
= "*";
94 new_translation_table(struct conv_table
*p
)
98 for (i
= 0; i
< 256; i
++)
100 free_translation_table(p
[i
].u
.tbl
);
101 for (i
= 0; i
< 128; i
++) {
103 p
[i
].u
.str
= strings
[i
];
105 for (; i
< 256; i
++) {
111 #define BIN_SEARCH(table, entry, entries, key, result) \
113 long _s = 0, _e = (entries) - 1; \
115 while (_s <= _e || !((result) = -1)) { \
116 long _m = (_s + _e) / 2; \
118 if ((table)[_m].entry == (key)) { \
122 if ((table)[_m].entry > (key)) _e = _m - 1; \
123 if ((table)[_m].entry < (key)) _s = _m + 1; \
/* Replacement code points, indexed by (u - 0x80) in u2cp_(); the 32
 * entries cover 0x80..0x9f. An entry of 0 means there is no replacement,
 * in which case u2cp_() returns NULL; otherwise u2cp_() is re-run on the
 * replacement value.
 * NOTE(review): the non-zero values look like the Windows-1252 meanings of
 * those bytes, some approximated with ASCII characters - confirm. */
127 static const unicode_val_T strange_chars[32] = {
128 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
129 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
130 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
131 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that get_cp_index() may OR into the codepage index it returns when
 * the charset was requested as "System". u2cp_(), cp2utf_8(), the
 * translation-table getters and is_cp_special() clear it with
 * `&= ~SYSTEM_CHARSET_FLAG` before indexing codepages[]; get_cp_name()
 * and get_cp_mime_name() return "System" when it is set. */
134 #define SYSTEM_CHARSET_FLAG 128
137 u2cp_(unicode_val_T u
, int to
, int no_nbsp_hack
)
142 if (u
< 128) return strings
[u
];
143 /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
144 if (u
== 0xa0) return no_nbsp_hack
? " " : NBSP_CHAR_STRING
;
145 if (u
== 0xad) return "";
148 unicode_val_T strange
= strange_chars
[u
- 0x80];
150 if (!strange
) return NULL
;
151 return u2cp_(strange
, to
, no_nbsp_hack
);
154 to
&= ~SYSTEM_CHARSET_FLAG
;
156 for (j
= 0; codepages
[to
].table
[j
].c
; j
++)
157 if (codepages
[to
].table
[j
].u
== u
)
158 return strings
[codepages
[to
].table
[j
].c
];
160 BIN_SEARCH(unicode_7b
, x
, N_UNICODE_7B
, u
, s
);
161 if (s
!= -1) return unicode_7b
[s
].s
;
/* Scratch buffer that encode_utf_8() zeroes and then fills with the UTF-8
 * bytes of one code point. At most indexes 0..5 are written, so the last
 * byte stays NUL. Being static, its contents are overwritten by the next
 * encode_utf_8() call. */
166 static unsigned char utf_buffer
[7];
168 static unsigned char *
169 encode_utf_8(unicode_val_T u
)
171 memset(utf_buffer
, 0, 7);
176 utf_buffer
[0] = 0xc0 | ((u
>> 6) & 0x1f),
177 utf_buffer
[1] = 0x80 | (u
& 0x3f);
178 else if (u
< 0x10000)
179 utf_buffer
[0] = 0xe0 | ((u
>> 12) & 0x0f),
180 utf_buffer
[1] = 0x80 | ((u
>> 6) & 0x3f),
181 utf_buffer
[2] = 0x80 | (u
& 0x3f);
182 else if (u
< 0x200000)
183 utf_buffer
[0] = 0xf0 | ((u
>> 18) & 0x0f),
184 utf_buffer
[1] = 0x80 | ((u
>> 12) & 0x3f),
185 utf_buffer
[2] = 0x80 | ((u
>> 6) & 0x3f),
186 utf_buffer
[3] = 0x80 | (u
& 0x3f);
187 else if (u
< 0x4000000)
188 utf_buffer
[0] = 0xf8 | ((u
>> 24) & 0x0f),
189 utf_buffer
[1] = 0x80 | ((u
>> 18) & 0x3f),
190 utf_buffer
[2] = 0x80 | ((u
>> 12) & 0x3f),
191 utf_buffer
[3] = 0x80 | ((u
>> 6) & 0x3f),
192 utf_buffer
[4] = 0x80 | (u
& 0x3f);
193 else utf_buffer
[0] = 0xfc | ((u
>> 30) & 0x01),
194 utf_buffer
[1] = 0x80 | ((u
>> 24) & 0x3f),
195 utf_buffer
[2] = 0x80 | ((u
>> 18) & 0x3f),
196 utf_buffer
[3] = 0x80 | ((u
>> 12) & 0x3f),
197 utf_buffer
[4] = 0x80 | ((u
>> 6) & 0x3f),
198 utf_buffer
[5] = 0x80 | (u
& 0x3f);
203 /* This slow and ugly code is used by the terminal utf_8_io */
205 cp2utf_8(int from
, int c
)
209 from
&= ~SYSTEM_CHARSET_FLAG
;
211 if (codepages
[from
].table
== table_utf_8
|| c
< 128)
214 for (j
= 0; codepages
[from
].table
[j
].c
; j
++)
215 if (codepages
[from
].table
[j
].c
== c
)
216 return encode_utf_8(codepages
[from
].table
[j
].u
);
218 return encode_utf_8(UCS_NO_CHAR
);
222 add_utf_8(struct conv_table
*ct
, unicode_val_T u
, unsigned char *str
)
224 unsigned char *p
= encode_utf_8(u
);
227 if (ct
[*p
].t
) ct
= ct
[*p
].u
.tbl
;
229 struct conv_table
*nct
;
231 assertm(ct
[*p
].u
.str
== no_str
, "bad utf encoding #1");
232 if_assert_failed
return;
234 nct
= mem_calloc(256, sizeof(*nct
));
236 new_translation_table(nct
);
244 assertm(!ct
[*p
].t
, "bad utf encoding #2");
245 if_assert_failed
return;
247 if (ct
[*p
].u
.str
== no_str
)
251 struct conv_table utf_table
[256];
252 int utf_table_init
= 1;
259 for (i
= 128; i
< 256; i
++)
260 mem_free(utf_table
[i
].u
.str
);
263 static struct conv_table
*
264 get_translation_table_to_utf_8(int from
)
269 if (from
== -1) return NULL
;
270 from
&= ~SYSTEM_CHARSET_FLAG
;
271 if (from
== lfr
) return utf_table
;
273 memset(utf_table
, 0, sizeof(utf_table
)),
278 for (i
= 0; i
< 128; i
++)
279 utf_table
[i
].u
.str
= strings
[i
];
281 if (codepages
[from
].table
== table_utf_8
) {
282 for (i
= 128; i
< 256; i
++)
283 utf_table
[i
].u
.str
= stracpy(strings
[i
]);
287 for (i
= 128; i
< 256; i
++)
288 utf_table
[i
].u
.str
= NULL
;
290 for (i
= 0; codepages
[from
].table
[i
].c
; i
++) {
291 unicode_val_T u
= codepages
[from
].table
[i
].u
;
293 if (!utf_table
[codepages
[from
].table
[i
].c
].u
.str
)
294 utf_table
[codepages
[from
].table
[i
].c
].u
.str
=
295 stracpy(encode_utf_8(u
));
298 for (i
= 128; i
< 256; i
++)
299 if (!utf_table
[i
].u
.str
)
300 utf_table
[i
].u
.str
= stracpy(no_str
);
305 struct conv_table table
[256];
306 static int first
= 1;
309 free_conv_table(void)
311 if (!utf_table_init
) free_utf_table();
313 memset(table
, 0, sizeof(table
));
316 new_translation_table(table
);
321 get_translation_table(int from
, int to
)
326 from
&= ~SYSTEM_CHARSET_FLAG
;
327 to
&= ~SYSTEM_CHARSET_FLAG
;
329 memset(table
, 0, sizeof(table
));
332 if (/*from == to ||*/ from
== -1 || to
== -1)
334 if (codepages
[to
].table
== table_utf_8
)
335 return get_translation_table_to_utf_8(from
);
336 if (from
== lfr
&& to
== lto
)
340 new_translation_table(table
);
342 if (codepages
[from
].table
== table_utf_8
) {
345 for (i
= 0; codepages
[to
].table
[i
].c
; i
++)
346 add_utf_8(table
, codepages
[to
].table
[i
].u
,
347 strings
[codepages
[to
].table
[i
].c
]);
349 for (i
= 0; unicode_7b
[i
].x
!= -1; i
++)
350 if (unicode_7b
[i
].x
>= 0x80)
351 add_utf_8(table
, unicode_7b
[i
].x
,
357 for (i
= 128; i
< 256; i
++) {
360 for (j
= 0; codepages
[from
].table
[j
].c
; j
++) {
361 if (codepages
[from
].table
[j
].c
== i
) {
364 u
= u2cp(codepages
[from
].table
[j
].u
, to
);
365 if (u
) table
[i
].u
.str
= u
;
376 xxstrcmp(unsigned char *s1
, unsigned char *s2
, int l2
)
379 if (*s1
> *s2
) return 1;
380 if (*s1
< *s2
) return -1;
389 /* Entity cache debugging purpose. */
391 #define DEBUG_ENTITY_CACHE
393 #undef DEBUG_ENTITY_CACHE
396 struct entity_cache
{
400 unsigned char *result
;
401 unsigned char str
[20]; /* Suffice in any case. */
405 hits_cmp(struct entity_cache
*a
, struct entity_cache
*b
)
407 if (a
->hits
== b
->hits
) return 0;
408 if (a
->hits
> b
->hits
) return -1;
/* Comparison callback taking two untyped pointers: @key_ is read as a
 * struct string (the name being looked up plus its length) and @element_
 * as a struct entity. Returns the result of xxstrcmp() on key->source and
 * element->s, with key->length passed as the length argument.
 * NOTE(review): presumably the comparator handed to bsearch() in
 * get_entity_string() - confirm against that call. */
413 compare_entities(const void *key_
, const void *element_
)
415 struct string
*key
= (struct string
*) key_
;
416 struct entity
*element
= (struct entity
*) element_
;
417 int length
= key
->length
;
418 unsigned char *first
= key
->source
;
419 unsigned char *second
= element
->s
;
421 return xxstrcmp(first
, second
, length
);
425 get_entity_string(const unsigned char *str
, const int strlen
, int encoding
)
427 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
428 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
429 will go in [0] table */
430 static struct entity_cache entity_cache
[ENTITY_CACHE_MAXLEN
][ENTITY_CACHE_SIZE
];
431 static unsigned int nb_entity_cache
[ENTITY_CACHE_MAXLEN
];
432 static int first_time
= 1;
434 unsigned char *result
= NULL
;
436 if (strlen
<= 0) return NULL
;
439 memset(&nb_entity_cache
, 0, ENTITY_CACHE_MAXLEN
* sizeof(unsigned int));
443 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
444 * + google + slashdot + websites that result from a search for test on google,
445 * + various ones) shows a quite impressive improvement:
447 * 0: hits=2459 l=4 st='nbsp'
448 * 1: hits=2152 l=6 st='eacute'
449 * 2: hits=235 l=6 st='egrave'
450 * 3: hits=136 l=6 st='agrave'
451 * 4: hits=100 l=3 st='amp'
452 * 5: hits=40 l=5 st='laquo'
453 * 6: hits=8 l=4 st='copy'
454 * 7: hits=5 l=2 st='gt'
455 * 8: hits=2 l=2 st='lt'
456 * 9: hits=1 l=6 st='middot'
458 * Most of the time cache hit ratio is near 95%.
460 * A long test shows: 15186 hits vs. 24 misses and mean iteration
461 * count is kept < 2 (worst case 1.58). Not so bad ;)
465 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
466 slen
= (strlen
> 1 && strlen
< ENTITY_CACHE_MAXLEN
) ? strlen
: 0;
468 if (strlen
< ENTITY_CACHE_MAXLEN
&& nb_entity_cache
[slen
] > 0) {
471 for (i
= 0; i
< nb_entity_cache
[slen
]; i
++) {
472 if (entity_cache
[slen
][i
].encoding
== encoding
473 && !memcmp(str
, entity_cache
[slen
][i
].str
, strlen
)) {
474 #ifdef DEBUG_ENTITY_CACHE
475 static double total_iter
= 0;
476 static unsigned long hit_count
= 0;
480 fprintf(stderr
, "hit after %d iter. (mean = %0.2f)\n", i
+ 1, total_iter
/ (double) hit_count
);
482 if (entity_cache
[slen
][i
].hits
< (unsigned int) ~0)
483 entity_cache
[slen
][i
].hits
++;
484 return entity_cache
[slen
][i
].result
;
487 #ifdef DEBUG_ENTITY_CACHE
488 fprintf(stderr
, "miss\n");
492 if (*str
== '#') { /* Numeric entity. */
493 int l
= (int) strlen
;
494 unsigned char *st
= (unsigned char *) str
;
497 if (l
== 1) goto end
; /* &#; ? */
499 if ((*st
| 32) == 'x') { /* Hexadecimal */
501 if (l
== 1 || l
> 9) goto end
; /* xFFFFFFFF max. */
504 unsigned char c
= (*(st
++) | 32);
507 n
= (n
<< 4) | (c
- '0');
508 else if (isxdigit(c
))
509 n
= (n
<< 4) | (c
- 'a' + 10);
511 goto end
; /* Bad char. */
513 } else { /* Decimal */
514 if (l
> 10) goto end
; /* 4294967295 max. */
516 unsigned char c
= *(st
++);
519 n
= n
* 10 + c
- '0';
521 goto end
; /* Bad char. */
522 /* Limit to 0xFFFFFFFF. */
523 if (n
>= (unicode_val_T
) 0xFFFFFFFFu
)
528 result
= u2cp(n
, encoding
);
530 #ifdef DEBUG_ENTITY_CACHE
531 fprintf(stderr
, "%lu %016x %s\n", (unsigned long) n
, n
, result
);
533 } else { /* Text entity. */
534 struct string key
= INIT_STRING((unsigned char *) str
, strlen
);
535 struct entity
*element
= bsearch((void *) &key
, entities
,
540 if (element
) result
= u2cp(element
->c
, encoding
);
544 /* Take care of potential buffer overflow. */
545 if (strlen
< sizeof(entity_cache
[slen
][0].str
)) {
546 struct entity_cache
*ece
= &entity_cache
[slen
][nb_entity_cache
[slen
]];
548 /* Copy new entry to cache. */
550 ece
->strlen
= strlen
;
551 ece
->encoding
= encoding
;
552 ece
->result
= result
;
553 memcpy(ece
->str
, str
, strlen
);
554 ece
->str
[strlen
] = '\0';
556 /* Increment number of cache entries if possible. */
557 if (nb_entity_cache
[slen
] < ENTITY_CACHE_SIZE
) nb_entity_cache
[slen
]++;
559 #ifdef DEBUG_ENTITY_CACHE
560 fprintf(stderr
, "Added in [%u]: l=%d st='%s'\n", slen
,
561 entity_cache
[slen
][0].strlen
, entity_cache
[slen
][0].str
);
565 /* Sort entries by hit order. */
566 if (nb_entity_cache
[slen
] > 1)
567 qsort(&entity_cache
[slen
][0], nb_entity_cache
[slen
],
568 sizeof(entity_cache
[slen
][0]), (void *) hits_cmp
);
570 #ifdef DEBUG_ENTITY_CACHE
574 fprintf(stderr
, "- Cache entries [%u] -\n", slen
);
575 for (i
= 0; i
< nb_entity_cache
[slen
] ; i
++)
576 fprintf(stderr
, "%d: hits=%u l=%d st='%s'\n", i
,
577 entity_cache
[slen
][i
].hits
, entity_cache
[slen
][i
].strlen
,
578 entity_cache
[slen
][i
].str
);
579 fprintf(stderr
, "-----------------\n");
587 convert_string(struct conv_table
*convert_table
,
588 unsigned char *chars
, int charslen
, int cp
,
589 enum convert_string_mode mode
, int *length
,
590 void (*callback
)(void *data
, unsigned char *buf
, int buflen
),
593 unsigned char *buffer
;
597 if (!convert_table
&& !memchr(chars
, '&', charslen
)) {
599 if (charslen
) callback(callback_data
, chars
, charslen
);
602 return memacpy(chars
, charslen
);
606 /* Buffer allocation */
608 buffer
= mem_alloc(ALLOC_GR
+ 1 /* trailing \0 */);
609 if (!buffer
) return NULL
;
613 while (charspos
< charslen
) {
614 unsigned char *translit
;
617 buffer[bufferpos++] = chars[charspos++]; \
622 if (chars
[charspos
] != '&') {
623 struct conv_table
*t
;
626 if (chars
[charspos
] < 128 || !convert_table
) PUTC
;
631 while (t
[chars
[i
]].t
) {
632 t
= t
[chars
[i
++]].u
.tbl
;
633 if (i
>= charslen
) PUTC
;
636 translit
= t
[chars
[i
]].u
.str
;
639 } else if (mode
== CSM_FORM
|| mode
== CSM_NONE
) {
643 int start
= charspos
+ 1;
647 && (isasciialpha(chars
[i
])
649 || (chars
[i
] == '#')))
652 /* This prevents bug 213: we were expanding "entities"
653 * in URL query strings. */
654 /* XXX: But this disables &nbsp&nbsp usage, which
655 * appears to be relatively common! --pasky */
656 if ((mode
== CSM_DEFAULT
|| (chars
[i
] != '&' && chars
[i
] != '='))
658 && !isasciialpha(chars
[i
]) && !isdigit(chars
[i
])) {
659 translit
= get_entity_string(&chars
[start
], i
- start
,
661 if (chars
[i
] != ';') {
662 /* Eat &nbsp &nbsp<foo> happily, but
663 * pull back from the character after
664 * entity string if it is not the valid
670 charspos
= i
+ (i
< charslen
);
674 if (!translit
[0]) continue;
677 buffer
[bufferpos
++] = translit
[0];
685 buffer
[bufferpos
++] = *(translit
++);
687 if (bufferpos
& (ALLOC_GR
- 1)) continue;
690 buffer
[bufferpos
] = 0;
691 callback(callback_data
, buffer
, bufferpos
);
694 new = mem_realloc(buffer
, bufferpos
+ ALLOC_GR
);
707 buffer
[bufferpos
] = 0;
708 if (length
) *length
= bufferpos
;
711 if (bufferpos
) callback(callback_data
, buffer
, bufferpos
);
722 get_cp_index(unsigned char *name
)
727 if (!strcasecmp(name
, "System")) {
728 #if HAVE_LANGINFO_CODESET
729 name
= nl_langinfo(CODESET
);
730 syscp
= SYSTEM_CHARSET_FLAG
;
736 for (i
= 0; codepages
[i
].name
; i
++) {
737 for (a
= 0; codepages
[i
].aliases
[a
]; a
++) {
738 /* In the past, we looked for the longest substring
739 * in all the names; it is way too expensive, though:
741 * % cumulative self self total
742 * time seconds seconds calls us/call us/call name
743 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
745 * Anything called from redraw_screen() is in fact
746 * relatively expensive, even if it's called just
747 * once. So we will do a simple strcasecmp() here.
750 if (!strcasecmp(name
, codepages
[i
].aliases
[a
]))
756 return get_cp_index("us-ascii") | syscp
;
/* Iteration cursor read by charsets_list_next(): i_name indexes
 * codepages[] and i_alias indexes that entry's aliases[]. */
764 static unsigned int i_name
= 0;
765 static unsigned int i_alias
= 0;
767 /* Reset internal list pointer */
769 charsets_list_reset(void)
775 /* Returns a pointer to a struct that contains current key and data pointers
776 * and increments the internal pointer. It returns NULL when key is NULL. */
777 struct fastfind_key_value
*
778 charsets_list_next(void)
780 static struct fastfind_key_value kv
;
782 if (!codepages
[i_name
].name
) return NULL
;
784 kv
.key
= codepages
[i_name
].aliases
[i_alias
];
785 kv
.data
= &codepages
[i_name
];
787 if (codepages
[i_name
].aliases
[i_alias
+ 1])
/* Fastfind index named "charsets_lookup", fed by charsets_list_reset()
 * and charsets_list_next() (which yields each alias as key and its
 * codepages[] entry as data). get_cp_index() searches it with
 * fastfind_search(); init_charsets_lookup() passes it to fastfind_index()
 * and free_charsets_lookup() to fastfind_done(). */
797 static struct fastfind_index ff_charsets_index
798 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset
, charsets_list_next
);
800 /* It searches for a charset named @name or one of its aliases and
801 * returns index for it or -1 if not found. */
803 get_cp_index(unsigned char *name
)
805 struct codepage_desc
*codepage
;
808 if (!strcasecmp(name
, "System")) {
809 #if HAVE_LANGINFO_CODESET
810 name
= nl_langinfo(CODESET
);
811 syscp
= SYSTEM_CHARSET_FLAG
;
817 codepage
= fastfind_search(&ff_charsets_index
, name
, strlen(name
));
819 assert(codepages
<= codepage
&& codepage
< codepages
+ N_CODEPAGES
);
820 return (codepage
- codepages
) | syscp
;
823 return get_cp_index("us-ascii") | syscp
;
830 #endif /* USE_FASTFIND */
833 init_charsets_lookup(void)
836 fastfind_index(&ff_charsets_index
, FF_COMPRESS
);
841 free_charsets_lookup(void)
844 fastfind_done(&ff_charsets_index
);
/* Return the name for codepage index @cp_index: "none" for a negative
 * index, "System" when SYSTEM_CHARSET_FLAG is set in it, otherwise
 * codepages[cp_index].name. */
849 get_cp_name(int cp_index
)
851 if (cp_index
< 0) return "none";
852 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
854 return codepages
[cp_index
].name
;
/* Return the first alias of codepage index @cp_index
 * (codepages[cp_index].aliases[0]). Special cases, checked in order:
 * "none" for a negative index, "System" when SYSTEM_CHARSET_FLAG is set,
 * and NULL when the entry's aliases pointer is NULL. */
858 get_cp_mime_name(int cp_index
)
860 if (cp_index
< 0) return "none";
861 if (cp_index
& SYSTEM_CHARSET_FLAG
) return "System";
862 if (!codepages
[cp_index
].aliases
) return NULL
;
864 return codepages
[cp_index
].aliases
[0];
/* Return non-zero when @cp_index, with SYSTEM_CHARSET_FLAG masked off,
 * selects a codepages[] entry whose table is table_utf_8. The index is
 * not range-checked here. */
868 is_cp_special(int cp_index
)
870 cp_index
&= ~SYSTEM_CHARSET_FLAG
;
871 return codepages
[cp_index
].table
== table_utf_8
;