iconv: Bail out of the loop when an illegal sequence of bytes occurs.
[elinks/elinks-j605.git] / src / intl / charsets.c
blob339e0ebbcc73266976dfd0af6e2e3c3dbf050f3c
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #ifdef HAVE_ICONV
22 #include <errno.h>
23 #include <iconv.h>
24 #endif
26 #include "elinks.h"
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
/* One additional codepage-byte/Unicode pairing.  Arrays of these are
 * scanned until an entry whose @c is 0 is reached. */
struct table_entry {
	/* The byte in the codepage. */
	unsigned char c;

	/* The Unicode value paired with @c.  Strictly speaking this
	 * ought to be unicode_val_T, but every value currently in
	 * codepage.inc fits in 16 bits, so uint16_t is used and
	 * sizeof(struct table_entry) shrinks from 8 bytes to 4.  If
	 * wider characters are ever needed, "unicode_val_T u : 24"
	 * might work, although that is somewhat unportable because
	 * bitfields are in principle restricted to int, which may be
	 * only 16 bits wide. */
	uint16_t u;
};
/* Static description of one codepage. */
struct codepage_desc {
	unsigned char *name;
	unsigned char *const *aliases;

	/* Unicode values for codepage bytes 0x80...0xFF; bytes
	 * 0x00...0x7F are taken to be ASCII in every codepage.  All
	 * current values fit in 16 bits, hence uint16_t instead of
	 * unicode_val_T.  A byte the codepage does not use is mapped
	 * to 0xFFFF here, and the C code turns that into
	 * UCS_REPLACEMENT_CHARACTER where appropriate.  (U+FFFF is
	 * reserved and will never be assigned as a character.) */
	const uint16_t *highhalf;

	/* When one codepage byte corresponds to several Unicode
	 * characters, the preferred one lives in @highhalf and the
	 * others are listed in this table.  It is consulted only when
	 * translating from Unicode to the codepage, never in the
	 * other direction. */
	const struct table_entry *table;

	/* Nonzero if translation for this codepage goes through iconv. */
	unsigned int iconv:1;
};
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
90 NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
91 NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
92 NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
93 const unsigned char *end);
/* One-byte, NUL-terminated strings indexed by byte value, so that
 * strings[b] can be handed out as the string form of byte b.
 *
 * NOTE(review): entries 0x17 and 0x1F hold ESC (0x1B) instead of their
 * own value.  This table reproduces that as found; confirm whether it
 * is intentional before "fixing" it. */
static const char strings[256][2] = {
	"\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
	"\x08", "\x09", "\x0a", "\x0b", "\x0c", "\x0d", "\x0e", "\x0f",
	"\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x1b",
	"\x18", "\x19", "\x1a", "\x1b", "\x1c", "\x1d", "\x1e", "\x1b",
	"\x20", "\x21", "\x22", "\x23", "\x24", "\x25", "\x26", "\x27",
	"\x28", "\x29", "\x2a", "\x2b", "\x2c", "\x2d", "\x2e", "\x2f",
	"\x30", "\x31", "\x32", "\x33", "\x34", "\x35", "\x36", "\x37",
	"\x38", "\x39", "\x3a", "\x3b", "\x3c", "\x3d", "\x3e", "\x3f",
	"\x40", "\x41", "\x42", "\x43", "\x44", "\x45", "\x46", "\x47",
	"\x48", "\x49", "\x4a", "\x4b", "\x4c", "\x4d", "\x4e", "\x4f",
	"\x50", "\x51", "\x52", "\x53", "\x54", "\x55", "\x56", "\x57",
	"\x58", "\x59", "\x5a", "\x5b", "\x5c", "\x5d", "\x5e", "\x5f",
	"\x60", "\x61", "\x62", "\x63", "\x64", "\x65", "\x66", "\x67",
	"\x68", "\x69", "\x6a", "\x6b", "\x6c", "\x6d", "\x6e", "\x6f",
	"\x70", "\x71", "\x72", "\x73", "\x74", "\x75", "\x76", "\x77",
	"\x78", "\x79", "\x7a", "\x7b", "\x7c", "\x7d", "\x7e", "\x7f",
	"\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87",
	"\x88", "\x89", "\x8a", "\x8b", "\x8c", "\x8d", "\x8e", "\x8f",
	"\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97",
	"\x98", "\x99", "\x9a", "\x9b", "\x9c", "\x9d", "\x9e", "\x9f",
	"\xa0", "\xa1", "\xa2", "\xa3", "\xa4", "\xa5", "\xa6", "\xa7",
	"\xa8", "\xa9", "\xaa", "\xab", "\xac", "\xad", "\xae", "\xaf",
	"\xb0", "\xb1", "\xb2", "\xb3", "\xb4", "\xb5", "\xb6", "\xb7",
	"\xb8", "\xb9", "\xba", "\xbb", "\xbc", "\xbd", "\xbe", "\xbf",
	"\xc0", "\xc1", "\xc2", "\xc3", "\xc4", "\xc5", "\xc6", "\xc7",
	"\xc8", "\xc9", "\xca", "\xcb", "\xcc", "\xcd", "\xce", "\xcf",
	"\xd0", "\xd1", "\xd2", "\xd3", "\xd4", "\xd5", "\xd6", "\xd7",
	"\xd8", "\xd9", "\xda", "\xdb", "\xdc", "\xdd", "\xde", "\xdf",
	"\xe0", "\xe1", "\xe2", "\xe3", "\xe4", "\xe5", "\xe6", "\xe7",
	"\xe8", "\xe9", "\xea", "\xeb", "\xec", "\xed", "\xee", "\xef",
	"\xf0", "\xf1", "\xf2", "\xf3", "\xf4", "\xf5", "\xf6", "\xf7",
	"\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
};
#ifdef HAVE_ICONV
/* The iconv conversion descriptor used by this file; holds
 * (iconv_t) -1 whenever no descriptor is open. */
static iconv_t iconv_cd = (iconv_t) -1;
#endif
134 static void
135 free_translation_table(struct conv_table *p)
137 int i;
139 for (i = 0; i < 256; i++)
140 if (p[i].t)
141 free_translation_table(p[i].u.tbl);
143 mem_free(p);
146 /* A string used in conversion tables when there is no correct
147 * conversion. This is compared by address and therefore should be a
148 * named array rather than a pointer so that it won't share storage
149 * with any other string literal that happens to have the same
150 * characters. */
151 static const unsigned char no_str[] = "*";
153 static void
154 new_translation_table(struct conv_table *p)
156 int i;
158 for (i = 0; i < 256; i++)
159 if (p[i].t)
160 free_translation_table(p[i].u.tbl);
161 for (i = 0; i < 128; i++) {
162 p[i].t = 0;
163 p[i].u.str = strings[i];
165 for (; i < 256; i++) {
166 p[i].t = 0;
167 p[i].u.str = no_str;
169 p->iconv_cp = -1;
/* Binary search over @array[0 .. @entries - 1], which must be sorted
 * in ascending order of member @entry.  Stores in @result the index of
 * the element whose @entry equals @key, or -1 if there is none.
 * Expands to a brace-enclosed block. */
#define BIN_SEARCH(array, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
	long _hit = -1;							\
									\
	while (_lo <= _hi) {						\
		long _mid = (_lo + _hi) / 2;				\
									\
		if ((array)[_mid].entry > (key)) {			\
			_hi = _mid - 1;					\
		} else if ((array)[_mid].entry < (key)) {		\
			_lo = _mid + 1;					\
		} else {						\
			_hit = _mid;					\
			break;						\
		}							\
	}								\
	(result) = _hit;						\
}
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
195 #define SYSTEM_CHARSET_FLAG 128
196 #define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
198 const unsigned char *
199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
201 int j;
202 int s;
204 if (u < 128) return strings[u];
206 if (u < 0xa0) {
207 u = strange_chars[u - 0x80];
208 if (!u) return NULL;
211 to &= ~SYSTEM_CHARSET_FLAG;
213 if (is_cp_ptr_utf8(&codepages[to]))
214 return encode_utf8(u);
216 /* To mark non breaking spaces in non-UTF-8 strings, we use a
217 * special char NBSP_CHAR. */
218 if (u == UCS_NO_BREAK_SPACE) {
219 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
220 else /* NBSP_MODE_ASCII */ return " ";
222 if (u == UCS_SOFT_HYPHEN) return "";
224 if (u < 0xFFFF)
225 for (j = 0; j < 0x80; j++)
226 if (codepages[to].highhalf[j] == u)
227 return strings[0x80 + j];
228 for (j = 0; codepages[to].table[j].c; j++)
229 if (codepages[to].table[j].u == u)
230 return strings[codepages[to].table[j].c];
232 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
233 if (s != -1) return unicode_7b[s].s;
235 return no_str;
238 static unsigned char utf_buffer[7];
240 NONSTATIC_INLINE unsigned char *
241 encode_utf8(unicode_val_T u)
243 memset(utf_buffer, 0, 7);
245 if (u < 0x80)
246 utf_buffer[0] = u;
247 else if (u < 0x800)
248 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
249 utf_buffer[1] = 0x80 | (u & 0x3f);
250 else if (u < 0x10000)
251 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
252 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
253 utf_buffer[2] = 0x80 | (u & 0x3f);
254 else if (u < 0x200000)
255 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
256 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
257 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
258 utf_buffer[3] = 0x80 | (u & 0x3f);
259 else if (u < 0x4000000)
260 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
261 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
262 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
263 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
264 utf_buffer[4] = 0x80 | (u & 0x3f);
265 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
266 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
267 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
268 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
269 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
270 utf_buffer[5] = 0x80 | (u & 0x3f);
272 return utf_buffer;
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (continuation bytes 0x80...0xBF
 * and 0xFE, 0xFF) are given length 1 and are dealt with separately by
 * the decoder. */
static const char utf8char_len_tab[256] = {
	/* 0x00 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x10 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x20 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x30 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x40 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x50 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x60 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x70 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x80 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0x90 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xa0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xb0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xc0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xd0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xe0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
	/* 0xf0 */ 4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,1,1,
};
288 #ifdef CONFIG_UTF8
289 NONSTATIC_INLINE int
290 utf8charlen(const unsigned char *p)
292 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters in the NUL-terminated string *@str, going
 * by lead bytes only.  On return *@str points at the first byte that
 * was not counted: the terminating NUL, or the start of a trailing
 * sequence that would run past it. */
int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int count = 0;

	while (1) {
		int len = utf8charlen(pos);

		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
#define utf8_issingle(p) (((p) & 0x80) == 0)
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)

/* Move back @pos characters from @current, never going to the left of
 * @start, and return the resulting pointer.  Returns NULL if @current
 * or @start is NULL or @pos is negative. */
unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	int remaining;

	if (pos < 0 || !current || !start)
		return NULL;

	for (remaining = pos; remaining > 0 && current != start; ) {
		--current;
		if (utf8_islead(*current))
			--remaining;
	}

	return current;
}
329 /* Count number of standard terminal cells needed for displaying UTF-8
330 * character. */
332 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
334 unicode_val_T u;
336 if (end == NULL)
337 end = strchr(utf8_char, '\0');
339 if(!utf8_char || !end)
340 return -1;
342 u = utf8_to_unicode(&utf8_char, end);
344 return unicode_to_cell(u);
/* Count the number of standard terminal cells needed to display the
 * UTF-8 string from @string up to @end.
 *
 * If @end is NULL the string is taken to be NUL-terminated.  A
 * trailing sequence that would run past @end is not counted.
 *
 * Returns the total cell count, or -1 if @string is NULL or a
 * character's width cannot be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Validate before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	} while (1);

	return cells;
}
/* Count the number of characters in the UTF-8 string from @string up
 * to @end, going by lead bytes only.
 *
 * If @end is NULL the string is taken to be NUL-terminated.  A
 * trailing sequence that would run past @end is not counted.
 *
 * Returns the character count, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Validate before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	do {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	} while (1);

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that can be
 * displayed in at most @max_cells terminal cells.
 *
 * If @end is NULL the string is taken to be NUL-terminated.  The
 * result never exceeds @end - @string.
 *
 * Returns the byte count, or -1 if @string is NULL or a character's
 * width cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Validate before strchr() gets a chance to dereference it. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	do {
		int cell = utf8_char2cells(&string[bytes], end);
		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Clamp when the last sequence would overshoot @end. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	} while(1);

	return bytes;
}
437 /* Take @max steps forward from @string in the specified @way, but
438 * not going past @end. Return the resulting address. Store the
439 * number of steps taken to *@count, unless @count is NULL.
441 * This assumes the text is valid UTF-8, and @string and @end point to
442 * character boundaries. If not, it doesn't crash but the results may
443 * be inconsistent.
445 * This function can do some of the same jobs as utf8charlen(),
446 * utf8_cells2bytes(), and strlen_utf8(). */
447 unsigned char *
448 utf8_step_forward(unsigned char *string, unsigned char *end,
449 int max, enum utf8_step way, int *count)
451 int steps = 0;
452 unsigned char *current = string;
454 assert(string);
455 assert(max >= 0);
456 if_assert_failed goto invalid_arg;
457 if (end == NULL)
458 end = strchr(string, '\0');
460 switch (way) {
461 case UTF8_STEP_CHARACTERS:
462 while (steps < max && current < end) {
463 ++current;
464 if (utf8_islead(*current))
465 ++steps;
467 break;
469 case UTF8_STEP_CELLS_FEWER:
470 case UTF8_STEP_CELLS_MORE:
471 while (steps < max && current < end) {
472 unicode_val_T u;
473 unsigned char *prev = current;
474 int width;
476 u = utf8_to_unicode(&current, end);
477 if (u == UCS_NO_CHAR) {
478 /* Assume the incomplete sequence
479 * costs one cell. */
480 current = end;
481 ++steps;
482 break;
485 width = unicode_to_cell(u);
486 if (way == UTF8_STEP_CELLS_FEWER
487 && steps + width > max) {
488 /* Back off. */
489 current = prev;
490 break;
492 steps += width;
494 break;
496 default:
497 INTERNAL("impossible enum utf8_step");
500 invalid_arg:
501 if (count)
502 *count = steps;
503 return current;
506 /* Take @max steps backward from @string in the specified @way, but
507 * not going past @start. Return the resulting address. Store the
508 * number of steps taken to *@count, unless @count is NULL.
510 * This assumes the text is valid UTF-8, and @string and @start point
511 * to character boundaries. If not, it doesn't crash but the results
512 * may be inconsistent.
514 * This function can do some of the same jobs as utf8_prevchar(). */
515 unsigned char *
516 utf8_step_backward(unsigned char *string, unsigned char *start,
517 int max, enum utf8_step way, int *count)
519 int steps = 0;
520 unsigned char *current = string;
522 assert(string);
523 assert(start);
524 assert(max >= 0);
525 if_assert_failed goto invalid_arg;
527 switch (way) {
528 case UTF8_STEP_CHARACTERS:
529 while (steps < max && current > start) {
530 --current;
531 if (utf8_islead(*current))
532 ++steps;
534 break;
536 case UTF8_STEP_CELLS_FEWER:
537 case UTF8_STEP_CELLS_MORE:
538 while (steps < max) {
539 unsigned char *prev = current;
540 unsigned char *look;
541 unicode_val_T u;
542 int width;
544 if (current <= start)
545 break;
546 do {
547 --current;
548 } while (current > start && !utf8_islead(*current));
550 look = current;
551 u = utf8_to_unicode(&look, prev);
552 if (u == UCS_NO_CHAR) {
553 /* Assume the incomplete sequence
554 * costs one cell. */
555 width = 1;
556 } else
557 width = unicode_to_cell(u);
559 if (way == UTF8_STEP_CELLS_FEWER
560 && steps + width > max) {
561 /* Back off. */
562 current = prev;
563 break;
565 steps += width;
567 break;
569 default:
570 INTERNAL("impossible enum utf8_step");
573 invalid_arg:
574 if (count)
575 *count = steps;
576 return current;
580 * Find out number of standard terminal collumns needed for displaying symbol
581 * (glyph) which represents Unicode character c.
583 * TODO: Use wcwidth when it is available. This seems to require:
584 * - Make the configure script check whether <wchar.h> and wcwidth exist.
585 * - Define _XOPEN_SOURCE and include <wchar.h>.
586 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
587 * matches ISO 10646 in all locales.)
588 * However, these do not suffice, because wcwidth depends on LC_CTYPE
589 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
590 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
591 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
592 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
593 * character is apparently not supported in all locales. Why is that?
594 * - Perhaps there is standardese that requires supported characters
595 * to be convertable to multibyte form. Then ELinks could just pick
596 * some UTF-8 locale for its wcwidth purposes.
597 * - Perhaps wcwidth can even return different nonnegative values for
598 * the same ISO 10646 character in different locales. Then ELinks
599 * would have to set LC_CTYPE to match at least the terminal's
600 * charset (which may differ from the LC_CTYPE environment variable,
601 * especially when the master process is serving a slave terminal).
602 * But there is no guarantee that the libc supports all the same
603 * charsets as ELinks does.
604 * For now, it seems safest to avoid the potentially locale-dependent
605 * libc version of wcwidth, and instead use a hardcoded mapping.
607 * @return 2 for double-width glyph, 1 for others.
608 * TODO: May be extended to return 0 for zero-width glyphs
609 * (like composing, maybe unprintable too).
611 NONSTATIC_INLINE int
612 unicode_to_cell(unicode_val_T c)
614 if (c >= 0x1100
615 && (c <= 0x115f /* Hangul Jamo */
616 || c == 0x2329
617 || c == 0x232a
618 || (c >= 0x2e80 && c <= 0xa4cf
619 && c != 0x303f) /* CJK ... Yi */
620 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
621 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
622 Ideographs */
623 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
624 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
625 || (c >= 0xffe0 && c <= 0xffe6)
626 || (c >= 0x20000 && c <= 0x2fffd)
627 || (c >= 0x30000 && c <= 0x3fffd)))
628 return 2;
630 return 1;
633 /* Fold the case of a Unicode character, so that hotkeys in labels can
634 * be compared case-insensitively. It is unspecified whether the
635 * result will be in upper or lower case. */
636 unicode_val_T
637 unicode_fold_label_case(unicode_val_T c)
639 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
640 return towlower(c);
641 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
642 /* For now, this supports only ASCII. It would be possible to
643 * use code generated from CaseFolding.txt of Unicode if the
644 * acknowledgements required by http://www.unicode.org/copyright.html
645 * were added to associated documentation of ELinks. */
646 if (c >= 0x41 && c <= 0x5A)
647 return c + 0x20;
648 else
649 return c;
650 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
652 #endif /* CONFIG_UTF8 */
654 NONSTATIC_INLINE unicode_val_T
655 utf8_to_unicode(unsigned char **string, const unsigned char *end)
657 unsigned char *str = *string;
658 unicode_val_T u;
659 int length;
661 length = utf8char_len_tab[str[0]];
663 if (str + length > end) {
664 return UCS_NO_CHAR;
667 switch (length) {
668 case 1: /* U+0000 to U+007F */
669 if (str[0] >= 0x80) {
670 invalid_utf8:
671 ++*string;
672 return UCS_REPLACEMENT_CHARACTER;
674 u = str[0];
675 break;
676 case 2: /* U+0080 to U+07FF */
677 if ((str[1] & 0xc0) != 0x80)
678 goto invalid_utf8;
679 u = (str[0] & 0x1f) << 6;
680 u += (str[1] & 0x3f);
681 if (u < 0x80)
682 goto invalid_utf8;
683 break;
684 case 3: /* U+0800 to U+FFFF, except surrogates */
685 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
686 goto invalid_utf8;
687 u = (str[0] & 0x0f) << 12;
688 u += ((str[1] & 0x3f) << 6);
689 u += (str[2] & 0x3f);
690 if (u < 0x800 || is_utf16_surrogate(u))
691 goto invalid_utf8;
692 break;
693 case 4: /* U+10000 to U+1FFFFF */
694 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
695 || (str[3] & 0xc0) != 0x80)
696 goto invalid_utf8;
697 u = (str[0] & 0x0f) << 18;
698 u += ((str[1] & 0x3f) << 12);
699 u += ((str[2] & 0x3f) << 6);
700 u += (str[3] & 0x3f);
701 if (u < 0x10000)
702 goto invalid_utf8;
703 break;
704 case 5: /* U+200000 to U+3FFFFFF */
705 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
706 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
707 goto invalid_utf8;
708 u = (str[0] & 0x0f) << 24;
709 u += ((str[1] & 0x3f) << 18);
710 u += ((str[2] & 0x3f) << 12);
711 u += ((str[3] & 0x3f) << 6);
712 u += (str[4] & 0x3f);
713 if (u < 0x200000)
714 goto invalid_utf8;
715 break;
716 case 6: /* U+4000000 to U+7FFFFFFF */
717 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
718 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
719 || (str[5] & 0xc0) != 0x80)
720 goto invalid_utf8;
721 u = (str[0] & 0x01) << 30;
722 u += ((str[1] & 0x3f) << 24);
723 u += ((str[2] & 0x3f) << 18);
724 u += ((str[3] & 0x3f) << 12);
725 u += ((str[4] & 0x3f) << 6);
726 u += (str[5] & 0x3f);
727 if (u < 0x4000000)
728 goto invalid_utf8;
729 break;
730 default:
731 INTERNAL("utf8char_len_tab out of range");
732 goto invalid_utf8;
734 *string = str + length;
735 return u;
738 /* The common part of cp2u and cp2utf_8. */
739 static unicode_val_T
740 cp2u_shared(const struct codepage_desc *from, unsigned char c)
742 unicode_val_T u = from->highhalf[c - 0x80];
744 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
745 return u;
748 /* Used for converting input from the terminal. */
749 unicode_val_T
750 cp2u(int from, unsigned char c)
752 from &= ~SYSTEM_CHARSET_FLAG;
754 /* UTF-8 is a multibyte codepage and cannot be handled with
755 * this function. */
756 assert(!is_cp_ptr_utf8(&codepages[from]));
757 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
759 if (c < 0x80) return c;
760 else return cp2u_shared(&codepages[from], c);
763 /* This slow and ugly code is used by the terminal utf_8_io */
764 const unsigned char *
765 cp2utf8(int from, int c)
767 from &= ~SYSTEM_CHARSET_FLAG;
769 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
770 return strings[c];
772 return encode_utf8(cp2u_shared(&codepages[from], c));
775 unicode_val_T
776 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
778 unicode_val_T ret;
780 if (is_cp_utf8(codepage))
781 return utf8_to_unicode(string, end);
783 if (*string >= end)
784 return UCS_NO_CHAR;
786 ret = cp2u(codepage, **string);
787 ++*string;
788 return ret;
792 #ifdef CONFIG_COMBINE
793 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
794 unicode_val_T **combined;
795 struct hash *combined_hash;
797 unicode_val_T
798 get_combined(unicode_val_T *data, int length)
800 struct hash_item *item;
801 unicode_val_T *key;
802 int i, indeks;
804 assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
805 if_assert_failed return UCS_NO_CHAR;
807 if (!combined_hash) combined_hash = init_hash8();
808 if (!combined_hash) return UCS_NO_CHAR;
809 item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
811 if (item) return (unicode_val_T)(long)item->value;
812 if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
814 key = mem_alloc((length + 1) * sizeof(*key));
815 if (!key) return UCS_NO_CHAR;
816 for (i = 0; i < length; i++)
817 key[i] = data[i];
818 key[i] = UCS_END_COMBINED;
820 last_combined++;
821 indeks = last_combined - UCS_BEGIN_COMBINED;
823 combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
824 if (!combined) {
825 mem_free(key);
826 last_combined--;
827 return UCS_NO_CHAR;
829 combined[indeks] = key;
830 item = add_hash_item(combined_hash, (unsigned char *)key,
831 length * sizeof(*data), (void *)(long)(last_combined));
832 if (!item) {
833 last_combined--;
834 mem_free(key);
835 return UCS_NO_CHAR;
837 return last_combined;
840 void
841 free_combined()
843 int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
845 if (combined_hash)
846 free_hash(&combined_hash);
847 for (i = 0; i < end; i++)
848 mem_free(combined[i]);
849 mem_free_if(combined);
851 #endif /* CONFIG_COMBINE */
854 static void
855 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
857 unsigned char *p = encode_utf8(u);
859 while (p[1]) {
860 if (ct[*p].t) ct = ct[*p].u.tbl;
861 else {
862 struct conv_table *nct;
864 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
865 if_assert_failed return;
867 nct = mem_calloc(256, sizeof(*nct));
868 if (!nct) return;
869 new_translation_table(nct);
870 ct[*p].t = 1;
871 ct[*p].u.tbl = nct;
872 ct = nct;
874 p++;
877 assertm(!ct[*p].t, "bad utf encoding #2");
878 if_assert_failed return;
880 if (ct[*p].u.str == no_str)
881 ct[*p].u.str = str;
884 /* A conversion table from some charset to UTF-8.
885 * If it is from UTF-8 to UTF-8, it converts each byte separately.
886 * Unlike in other translation tables, the strings in elements 0x80 to
887 * 0xFF are allocated dynamically. */
888 struct conv_table utf_table[256];
889 int utf_table_init = 1;
891 static void
892 free_utf_table(void)
894 int i;
896 /* Cast away const. */
897 for (i = 128; i < 256; i++)
898 mem_free((unsigned char *) utf_table[i].u.str);
901 static struct conv_table *
902 get_translation_table_to_utf8(int from)
904 int i;
905 static int lfr = -1;
907 if (from == -1) return NULL;
908 from &= ~SYSTEM_CHARSET_FLAG;
909 if (from == lfr) return utf_table;
910 lfr = from;
911 if (utf_table_init) {
912 memset(utf_table, 0, sizeof(utf_table));
913 utf_table_init = 0;
914 } else
915 free_utf_table();
917 for (i = 0; i < 128; i++)
918 utf_table[i].u.str = strings[i];
920 if (is_cp_ptr_utf8(&codepages[from])) {
921 for (i = 128; i < 256; i++)
922 utf_table[i].u.str = stracpy(strings[i]);
923 return utf_table;
926 for (i = 128; i < 256; i++) {
927 unicode_val_T u = codepages[from].highhalf[i - 0x80];
929 if (u == 0xFFFF)
930 utf_table[i].u.str = NULL;
931 else
932 utf_table[i].u.str = stracpy(encode_utf8(u));
935 for (i = 0; codepages[from].table[i].c; i++) {
936 unicode_val_T u = codepages[from].table[i].u;
938 if (!utf_table[codepages[from].table[i].c].u.str)
939 utf_table[codepages[from].table[i].c].u.str =
940 stracpy(encode_utf8(u));
943 for (i = 128; i < 256; i++)
944 if (!utf_table[i].u.str)
945 utf_table[i].u.str = stracpy(no_str);
947 return utf_table;
950 /* A conversion table between two charsets, where the target is not UTF-8. */
951 static struct conv_table table[256];
952 static int first = 1;
954 void
955 free_conv_table(void)
957 if (!utf_table_init) free_utf_table();
958 if (first) {
959 memset(table, 0, sizeof(table));
960 first = 0;
962 new_translation_table(table);
963 #ifdef HAVE_ICONV
964 if (iconv_cd != (iconv_t)-1) {
965 iconv_close(iconv_cd);
966 iconv_cd = (iconv_t)-1;
968 #endif
972 struct conv_table *
973 get_translation_table(int from, int to)
975 static int lfr = -1;
976 static int lto = -1;
978 from &= ~SYSTEM_CHARSET_FLAG;
979 to &= ~SYSTEM_CHARSET_FLAG;
980 if (first) {
981 memset(table, 0, sizeof(table));
982 first = 0;
985 if (codepages[from].iconv) {
986 struct conv_table *table2 = get_translation_table_to_utf8(34);
988 if (table2) table2->iconv_cp = from;
989 return table2;
992 if (/*from == to ||*/ from == -1 || to == -1)
993 return NULL;
994 if (is_cp_ptr_utf8(&codepages[to])) {
995 struct conv_table *table2 = get_translation_table_to_utf8(from);
997 if (table2) table2->iconv_cp = -1;
998 return table2;
1000 if (from == lfr && to == lto)
1001 return table;
1002 lfr = from;
1003 lto = to;
1004 new_translation_table(table);
1006 if (is_cp_ptr_utf8(&codepages[from])) {
1007 int i;
1009 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1010 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1011 add_utf8(table, UCS_SOFT_HYPHEN, "");
1013 for (i = 0x80; i <= 0xFF; i++)
1014 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1015 add_utf8(table,
1016 codepages[to].highhalf[i - 0x80],
1017 strings[i]);
1019 for (i = 0; codepages[to].table[i].c; i++)
1020 add_utf8(table, codepages[to].table[i].u,
1021 strings[codepages[to].table[i].c]);
1023 for (i = 0; unicode_7b[i].x != -1; i++)
1024 if (unicode_7b[i].x >= 0x80)
1025 add_utf8(table, unicode_7b[i].x,
1026 unicode_7b[i].s);
1028 } else {
1029 int i;
1031 for (i = 128; i < 256; i++) {
1032 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1033 const unsigned char *u;
1035 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1036 if (u) table[i].u.str = u;
1041 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated @s2.
 * Returns 1 or -1 at the first differing byte.  If all @l2 bytes
 * match, returns 0 when @s2 ends there and -1 when @s2 is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
1058 /* Entity cache debugging purpose. */
1059 #if 0
1060 #define DEBUG_ENTITY_CACHE
1061 #else
1062 #undef DEBUG_ENTITY_CACHE
1063 #endif
/* One slot of the entity lookup cache. */
struct entity_cache {
	/* How often this slot has been hit. */
	unsigned int hits;
	/* Length of the entity name held in @str. */
	int strlen;
	/* Codepage for which @result was computed. */
	int encoding;
	/* The cached conversion result. */
	const unsigned char *result;
	/* The entity name.  Twenty bytes suffice in any case. */
	unsigned char str[20];
};
1073 /* comparison function for qsort() */
1074 static int
1075 hits_cmp(const void *v1, const void *v2)
1077 const struct entity_cache *a = v1, *b = v2;
1079 if (a->hits == b->hits) return 0;
1080 if (a->hits > b->hits) return -1;
1081 else return 1;
1084 static int
1085 compare_entities(const void *key_, const void *element_)
1087 struct string *key = (struct string *) key_;
1088 struct entity *element = (struct entity *) element_;
1089 int length = key->length;
1090 unsigned char *first = key->source;
1091 unsigned char *second = element->s;
1093 return xxstrcmp(first, second, length);
1096 const unsigned char *
1097 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1099 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1100 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1101 will go in [0] table */
1102 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1103 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1104 unsigned int slen = 0;
1105 const unsigned char *result = NULL;
1107 /* Note that an object of static storage duration is automatically
1108 * initialised to zero in C. */
1110 if (strlen <= 0) return NULL;
1112 #ifdef CONFIG_UTF8
1113 /* TODO: caching UTF-8 */
1114 encoding &= ~SYSTEM_CHARSET_FLAG;
1115 if (is_cp_ptr_utf8(&codepages[encoding]))
1116 goto skip;
1117 #endif /* CONFIG_UTF8 */
1119 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1120 * + google + slashdot + websites that result from a search for test on google,
1121 * + various ones) show quite impressive improvment:
1122 * Top ten is:
1123 * 0: hits=2459 l=4 st='nbsp'
1124 * 1: hits=2152 l=6 st='eacute'
1125 * 2: hits=235 l=6 st='egrave'
1126 * 3: hits=136 l=6 st='agrave'
1127 * 4: hits=100 l=3 st='amp'
1128 * 5: hits=40 l=5 st='laquo'
1129 * 6: hits=8 l=4 st='copy'
1130 * 7: hits=5 l=2 st='gt'
1131 * 8: hits=2 l=2 st='lt'
1132 * 9: hits=1 l=6 st='middot'
1134 * Most of the time cache hit ratio is near 95%.
1136 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1137 * count is kept < 2 (worst case 1.58). Not so bad ;)
1139 * --Zas */
1141 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1142 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1144 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1145 int i;
1147 for (i = 0; i < nb_entity_cache[slen]; i++) {
1148 if (entity_cache[slen][i].encoding == encoding
1149 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1150 #ifdef DEBUG_ENTITY_CACHE
1151 static double total_iter = 0;
1152 static unsigned long hit_count = 0;
1154 total_iter += i + 1;
1155 hit_count++;
1156 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1157 #endif
1158 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1159 entity_cache[slen][i].hits++;
1160 return entity_cache[slen][i].result;
1163 #ifdef DEBUG_ENTITY_CACHE
1164 fprintf(stderr, "miss\n");
1165 #endif
1167 #ifdef CONFIG_UTF8
1168 skip:
1169 #endif /* CONFIG_UTF8 */
1170 if (*str == '#') { /* Numeric entity. */
1171 int l = (int) strlen;
1172 unsigned char *st = (unsigned char *) str;
1173 unicode_val_T n = 0;
1175 if (l == 1) goto end; /* &#; ? */
1176 st++, l--;
1177 if ((*st | 32) == 'x') { /* Hexadecimal */
1179 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1180 st++, l--;
1181 do {
1182 unsigned char c = (*(st++) | 32);
1184 if (isdigit(c))
1185 n = (n << 4) | (c - '0');
1186 else if (isxdigit(c))
1187 n = (n << 4) | (c - 'a' + 10);
1188 else
1189 goto end; /* Bad char. */
1190 } while (--l);
1191 } else { /* Decimal */
1192 if (l > 10) goto end; /* 4294967295 max. */
1193 do {
1194 unsigned char c = *(st++);
1196 if (isdigit(c))
1197 n = n * 10 + c - '0';
1198 else
1199 goto end; /* Bad char. */
1200 /* Limit to 0xFFFFFFFF. */
1201 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1202 goto end;
1203 } while (--l);
1206 result = u2cp(n, encoding);
1208 #ifdef DEBUG_ENTITY_CACHE
1209 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1210 #endif
1211 } else { /* Text entity. */
1212 struct string key = INIT_STRING((unsigned char *) str, strlen);
1213 struct entity *element = bsearch((void *) &key, entities,
1214 N_ENTITIES,
1215 sizeof(*element),
1216 compare_entities);
1218 if (element) result = u2cp(element->c, encoding);
1221 #ifdef CONFIG_UTF8
1222 if (is_cp_ptr_utf8(&codepages[encoding])) {
1223 return result;
1225 #endif /* CONFIG_UTF8 */
1226 end:
1227 /* Take care of potential buffer overflow. */
1228 if (strlen < sizeof(entity_cache[slen][0].str)) {
1229 struct entity_cache *ece;
1231 /* Sort entries by hit order. */
1232 if (nb_entity_cache[slen] > 1)
1233 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1234 sizeof(entity_cache[slen][0]), hits_cmp);
1236 /* Increment number of cache entries if possible.
1237 * Else, just replace the least used entry. */
1238 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1239 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1241 /* Copy new entry to cache. */
1242 ece->hits = 1;
1243 ece->strlen = strlen;
1244 ece->encoding = encoding;
1245 ece->result = result;
1246 memcpy(ece->str, str, strlen);
1247 ece->str[strlen] = '\0';
1250 #ifdef DEBUG_ENTITY_CACHE
1251 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1252 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1255 unsigned int i;
1257 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1258 for (i = 0; i < nb_entity_cache[slen] ; i++)
1259 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1260 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1261 entity_cache[slen][i].str);
1262 fprintf(stderr, "-----------------\n");
1264 #endif /* DEBUG_ENTITY_CACHE */
1266 return result;
1269 unsigned char *
1270 convert_string(struct conv_table *convert_table,
1271 unsigned char *chars2, int charslen2, int cp,
1272 enum convert_string_mode mode, int *length,
1273 void (*callback)(void *data, unsigned char *buf, int buflen),
1274 void *callback_data)
1276 unsigned char *buffer;
1277 int bufferpos = 0;
1278 int charspos = 0;
1279 unsigned char *chars = chars2;
1280 int charslen = charslen2;
1282 #ifdef HAVE_ICONV
1283 static char iconv_input[256];
1284 static char iconv_output[256 * 8];
1285 static size_t iconv_offset;
1286 static int iconv_cp;
1287 static size_t iconv_inleft;
1288 size_t iconv_outleft = 256 * 8;
1289 int loop = 0;
1290 int is_iconv = 0;
1291 int chars_offset = 0;
1293 if (!convert_table && !memchr(chars, '&', charslen)) {
1294 if (callback) {
1295 if (charslen) callback(callback_data, chars, charslen);
1296 return NULL;
1297 } else {
1298 return memacpy(chars, charslen);
1302 if (cp >= 0) {
1303 if (convert_table && convert_table->iconv_cp > 0) {
1304 is_iconv = 1;
1305 cp = convert_table->iconv_cp;
1306 } else {
1307 is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1310 #endif
1312 /* Buffer allocation */
1314 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1315 if (!buffer) return NULL;
1317 #ifdef HAVE_ICONV
1318 if (is_iconv) {
1319 int v;
1320 size_t before, to_copy;
1321 char *outp, *inp;
1323 if (iconv_cd >= 0) {
1324 if (cp != iconv_cp) {
1325 iconv_close(iconv_cd);
1326 iconv_cd = (iconv_t)-1;
1329 if (iconv_cd == (iconv_t)-1) {
1330 iconv_offset = 0;
1331 iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1332 if (iconv_cd == (iconv_t)-1) {
1333 mem_free(buffer);
1334 return NULL;
1336 iconv_cp = cp;
1338 repeat:
1339 to_copy = charslen2 - chars_offset;
1340 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1341 memcpy(iconv_input + iconv_offset, chars + chars_offset, to_copy);
1342 iconv_outleft = 256 * 8;
1343 iconv_inleft = iconv_offset + to_copy;
1344 inp = iconv_input;
1345 outp = iconv_output;
1346 before = iconv_inleft;
1348 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1349 chars_offset += before - iconv_inleft;
1350 charslen = 256 * 8 - iconv_outleft;
1352 chars = (unsigned char *)iconv_output;
1353 charspos = 0;
1355 if (v == -1) {
1356 switch (errno) {
1357 case EINVAL:
1358 memcpy(iconv_input, inp, iconv_inleft);
1359 iconv_offset = iconv_inleft;
1360 break;
1361 case EILSEQ:
1362 loop = 0;
1363 goto out;
1364 break;
1365 default:
1366 iconv_offset = 0;
1368 } else {
1369 iconv_offset = 0;
1372 loop = chars_offset < charslen2;
1374 #endif
1375 /* Iterate ;-) */
1377 out:
1378 while (charspos < charslen) {
1379 const unsigned char *translit;
1381 #define PUTC do { \
1382 buffer[bufferpos++] = chars[charspos++]; \
1383 translit = ""; \
1384 goto flush; \
1385 } while (0)
1387 if (chars[charspos] != '&') {
1388 struct conv_table *t;
1389 int i;
1391 if (chars[charspos] < 128 || !convert_table) PUTC;
1393 t = convert_table;
1394 i = charspos;
1396 while (t[chars[i]].t) {
1397 t = t[chars[i++]].u.tbl;
1398 if (i >= charslen) PUTC;
1401 translit = t[chars[i]].u.str;
1402 charspos = i + 1;
1404 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1405 PUTC;
1407 } else {
1408 int start = charspos + 1;
1409 int i = start;
1411 while (i < charslen
1412 && (isasciialpha(chars[i])
1413 || isdigit(chars[i])
1414 || (chars[i] == '#')))
1415 i++;
1417 /* This prevents bug 213: we were expanding "entities"
1418 * in URL query strings. */
1419 /* XXX: But this disables &nbsp&nbsp usage, which
1420 * appears to be relatively common! --pasky */
1421 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1422 && i > start
1423 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1424 translit = get_entity_string(&chars[start], i - start,
1425 cp);
1426 if (chars[i] != ';') {
1427 /* Eat &nbsp &nbsp<foo> happily, but
1428 * pull back from the character after
1429 * entity string if it is not the valid
1430 * terminator. */
1431 i--;
1434 if (!translit) PUTC;
1435 charspos = i + (i < charslen);
1436 } else PUTC;
1439 if (!translit[0]) continue;
1441 if (!translit[1]) {
1442 buffer[bufferpos++] = translit[0];
1443 translit = "";
1444 goto flush;
1447 while (*translit) {
1448 unsigned char *new;
1450 buffer[bufferpos++] = *(translit++);
1451 flush:
1452 if (bufferpos & (ALLOC_GR - 1)) continue;
1454 if (callback) {
1455 buffer[bufferpos] = 0;
1456 callback(callback_data, buffer, bufferpos);
1457 bufferpos = 0;
1458 } else {
1459 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1460 if (!new) {
1461 mem_free(buffer);
1462 return NULL;
1464 buffer = new;
1467 #undef PUTC
1470 #ifdef HAVE_ICONV
1471 if (loop) goto repeat;
1472 #endif
1473 /* Say bye */
1475 buffer[bufferpos] = 0;
1476 if (length) *length = bufferpos;
1478 if (callback) {
1479 if (bufferpos) callback(callback_data, buffer, bufferpos);
1480 mem_free(buffer);
1481 return NULL;
1482 } else {
1483 return buffer;
1488 #ifndef USE_FASTFIND
1490 get_cp_index(const unsigned char *name)
1492 int i, a;
1493 int syscp = 0;
1495 if (!c_strcasecmp(name, "System")) {
1496 #if HAVE_LANGINFO_CODESET
1497 name = nl_langinfo(CODESET);
1498 syscp = SYSTEM_CHARSET_FLAG;
1499 #else
1500 name = "us-ascii";
1501 #endif
1504 for (i = 0; codepages[i].name; i++) {
1505 for (a = 0; codepages[i].aliases[a]; a++) {
1506 /* In the past, we looked for the longest substring
1507 * in all the names; it is way too expensive, though:
1509 * % cumulative self self total
1510 * time seconds seconds calls us/call us/call name
1511 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1513 * Anything called from redraw_screen() is in fact
1514 * relatively expensive, even if it's called just
1515 * once. So we will do a simple strcasecmp() here.
1518 if (!c_strcasecmp(name, codepages[i].aliases[a]))
1519 return i | syscp;
1523 if (syscp) {
1524 return get_cp_index("us-ascii") | syscp;
1525 } else {
1526 return -1;
1530 #else
/* Cursor over codepages[] and over the aliases of the current codepage,
 * advanced by charsets_list_next(). */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the cursor to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
1543 /* Returns a pointer to a struct that contains current key and data pointers
1544 * and increment internal pointer. It returns NULL when key is NULL. */
1545 struct fastfind_key_value *
1546 charsets_list_next(void)
1548 static struct fastfind_key_value kv;
1550 if (!codepages[i_name].name) return NULL;
1552 kv.key = codepages[i_name].aliases[i_alias];
1553 kv.data = (void *) &codepages[i_name]; /* cast away const */
1555 if (codepages[i_name].aliases[i_alias + 1])
1556 i_alias++;
1557 else {
1558 i_name++;
1559 i_alias = 0;
1562 return &kv;
1565 static struct fastfind_index ff_charsets_index
1566 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1568 /* It searchs for a charset named @name or one of its aliases and
1569 * returns index for it or -1 if not found. */
1571 get_cp_index(const unsigned char *name)
1573 const struct codepage_desc *codepage;
1574 int syscp = 0;
1576 if (!c_strcasecmp(name, "System")) {
1577 #if HAVE_LANGINFO_CODESET
1578 name = nl_langinfo(CODESET);
1579 syscp = SYSTEM_CHARSET_FLAG;
1580 #else
1581 name = "us-ascii";
1582 #endif
1585 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1586 if (codepage) {
1587 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1588 return (codepage - codepages) | syscp;
1590 } else if (syscp) {
1591 return get_cp_index("us-ascii") | syscp;
1593 } else {
1594 return -1;
1598 #endif /* USE_FASTFIND */
/* Build the charset alias index.  Does nothing unless USE_FASTFIND is
 * defined. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the charset alias index.  Does nothing unless USE_FASTFIND is
 * defined. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1616 /* Get the codepage's name for displaying to the user, or NULL if
1617 * @cp_index is one past the end. In the future, we might want to
1618 * localize these with gettext. So it may be best not to use this
1619 * function if the name will have to be converted back to an
1620 * index. */
1621 unsigned char *
1622 get_cp_name(int cp_index)
1624 if (cp_index < 0) return "none";
1625 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1627 return codepages[cp_index].name;
1630 /* Get the codepage's name for saving to a configuration file. These
1631 * names can be converted back to indexes, even in future versions of
1632 * ELinks. */
1633 unsigned char *
1634 get_cp_config_name(int cp_index)
1636 if (cp_index < 0) return "none";
1637 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1638 if (!codepages[cp_index].aliases) return NULL;
1640 return codepages[cp_index].aliases[0];
1643 /* Get the codepage's name for sending to a library or server that
1644 * understands MIME charset names. This function irreversibly maps
1645 * the "System" codepage to the underlying charset. */
1646 unsigned char *
1647 get_cp_mime_name(int cp_index)
1649 if (cp_index < 0) return "none";
1650 cp_index &= ~SYSTEM_CHARSET_FLAG;
1651 if (!codepages[cp_index].aliases) return NULL;
1653 return codepages[cp_index].aliases[0];
1657 is_cp_utf8(int cp_index)
1659 cp_index &= ~SYSTEM_CHARSET_FLAG;
1660 return is_cp_ptr_utf8(&codepages[cp_index]);
1663 /* This function will be used by the xhtml parser. */
1664 const uint16_t *
1665 get_cp_highhalf(const unsigned char *name)
1667 int cp = get_cp_index(name);
1669 if (cp < 0) return NULL;
1670 cp &= ~SYSTEM_CHARSET_FLAG;
1671 return codepages[cp].highhalf;