Handle mailcap's copiousoutput without an external pager.
[elinks.git] / src / intl / charsets.c
blob61e3f48008667490be2764812e6042e304c72cb8
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #ifdef HAVE_ICONV
22 #include <errno.h>
23 #include <iconv.h>
24 #endif
26 #include "elinks.h"
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
/* One additional Unicode-to-byte mapping of a codepage.  The per-codepage
 * lists of these entries are scanned until an entry whose @c is zero. */
struct table_entry {
	/* The byte in the codepage. */
	unsigned char c;

	/* The Unicode character that maps to @c.
	 *
	 * In principle this should be unicode_val_T.  All values that
	 * codepage.inc currently contains fit in 16 bits, though, so
	 * uint16_t is used instead, which halves sizeof(struct
	 * table_entry) from 8 bytes to 4.  If wider characters are ever
	 * needed, "unicode_val_T u : 24" is an option, although a
	 * somewhat unportable one: bitfields are in principle restricted
	 * to int, which may be only 16 bits wide. */
	uint16_t u;
};
/* Description of one codepage known to this file. */
struct codepage_desc {
	/* Name of the codepage. */
	unsigned char *name;

	/* Alternative names of the codepage. */
	unsigned char *const *aliases;

	/* Unicode values of the codepage bytes 0x80...0xFF; bytes
	 * 0x00...0x7F are assumed to be ASCII in every codepage.
	 *
	 * All current values fit in 16 bits, so they are stored as
	 * uint16_t rather than unicode_val_T.  A byte that the codepage
	 * does not use is mapped to 0xFFFF here, and the C code turns
	 * that into UCS_REPLACEMENT_CHARACTER where appropriate.
	 * (U+FFFF is reserved and will never be assigned as a
	 * character.) */
	const uint16_t *highhalf;

	/* When one byte of the codepage corresponds to several Unicode
	 * characters, the preferred character is in @highhalf and the
	 * remaining ones are listed here.  This list is not consulted
	 * when translating from the codepage to Unicode. */
	const struct table_entry *table;

	/* Whether iconv is used for translation. */
	unsigned int iconv:1;
};
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
90 NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
91 NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
92 NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
93 const unsigned char *end);
/* strings[b] is a NUL-terminated one-byte string for the byte value b,
 * so that a single byte can be returned wherever a string is expected.
 * NOTE(review): entries 0x17 and 0x1F hold "\033" rather than their own
 * byte value; this looks deliberate but should be confirmed. */
static const char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
#ifdef HAVE_ICONV
/* The iconv conversion descriptor of this file; (iconv_t)-1 while no
 * descriptor is open. */
static iconv_t iconv_cd = (iconv_t)-1;
#endif
134 static void
135 free_translation_table(struct conv_table *p)
137 int i;
139 for (i = 0; i < 256; i++)
140 if (p[i].t)
141 free_translation_table(p[i].u.tbl);
143 mem_free(p);
/* The string that conversion tables hold where no correct conversion
 * exists.  Code recognizes it by comparing addresses, so it has to be a
 * named array and not a pointer to a string literal: a literal might
 * share storage with some other literal that happens to consist of the
 * same characters. */
static const unsigned char no_str[] = "*";
153 static void
154 new_translation_table(struct conv_table *p)
156 int i;
158 for (i = 0; i < 256; i++)
159 if (p[i].t)
160 free_translation_table(p[i].u.tbl);
161 for (i = 0; i < 128; i++) {
162 p[i].t = 0;
163 p[i].u.str = strings[i];
165 for (; i < 256; i++) {
166 p[i].t = 0;
167 p[i].u.str = no_str;
169 p->iconv_cp = -1;
/* Binary search in the array @table of @entries elements, which must be
 * sorted in ascending order of the member @entry.  Stores to @result
 * the index of an element whose @entry equals @key, or -1 if there is
 * none.
 *
 * The right-hand side of the loop condition is evaluated only once the
 * range is empty; it stores -1 to @result and yields false, which ends
 * the loop. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _s = 0, _e = (entries) - 1;				\
									\
	while (_s <= _e || !((result) = -1)) {				\
		long _m = (_s + _e) / 2;				\
									\
		if ((table)[_m].entry == (key)) {			\
			(result) = _m;					\
			break;						\
		} else if ((table)[_m].entry > (key)) {			\
			_e = _m - 1;					\
		} else if ((table)[_m].entry < (key)) {			\
			_s = _m + 1;					\
		}							\
	}								\
}
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* A flag bit that may be set in a codepage number; the functions of
 * this file clear it before they use the number to index codepages[]. */
#define SYSTEM_CHARSET_FLAG 128

/* Whether the codepage description that @cp_ptr points to is the UTF-8
 * one, which is recognized by its alias list. */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
198 const unsigned char *
199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
201 int j;
202 int s;
204 if (u < 128) return strings[u];
206 to &= ~SYSTEM_CHARSET_FLAG;
208 if (is_cp_ptr_utf8(&codepages[to]))
209 return encode_utf8(u);
211 /* To mark non breaking spaces in non-UTF-8 strings, we use a
212 * special char NBSP_CHAR. */
213 if (u == UCS_NO_BREAK_SPACE) {
214 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
215 else /* NBSP_MODE_ASCII */ return " ";
217 if (u == UCS_SOFT_HYPHEN) return "";
219 if (u < 0xa0) {
220 unicode_val_T strange = strange_chars[u - 0x80];
222 if (!strange) return NULL;
223 return u2cp_(strange, to, nbsp_mode);
226 if (u < 0xFFFF)
227 for (j = 0; j < 0x80; j++)
228 if (codepages[to].highhalf[j] == u)
229 return strings[0x80 + j];
230 for (j = 0; codepages[to].table[j].c; j++)
231 if (codepages[to].table[j].u == u)
232 return strings[codepages[to].table[j].c];
234 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
235 if (s != -1) return unicode_7b[s].s;
237 return no_str;
240 static unsigned char utf_buffer[7];
242 NONSTATIC_INLINE unsigned char *
243 encode_utf8(unicode_val_T u)
245 memset(utf_buffer, 0, 7);
247 if (u < 0x80)
248 utf_buffer[0] = u;
249 else if (u < 0x800)
250 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
251 utf_buffer[1] = 0x80 | (u & 0x3f);
252 else if (u < 0x10000)
253 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
254 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
255 utf_buffer[2] = 0x80 | (u & 0x3f);
256 else if (u < 0x200000)
257 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
258 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
259 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
260 utf_buffer[3] = 0x80 | (u & 0x3f);
261 else if (u < 0x4000000)
262 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
263 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
264 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
265 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
266 utf_buffer[4] = 0x80 | (u & 0x3f);
267 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
268 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
269 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
270 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
271 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
272 utf_buffer[5] = 0x80 | (u & 0x3f);
274 return utf_buffer;
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) have
 * length 1 here; utf8_to_unicode() rejects them separately. */
static const char utf8char_len_tab[256] = {
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
290 #ifdef CONFIG_UTF8
291 NONSTATIC_INLINE int
292 utf8charlen(const unsigned char *p)
294 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters of the NUL-terminated string *@str.  A
 * trailing sequence that would extend past the terminating NUL is not
 * counted.  On return, *@str points just past the last character that
 * was counted. */
int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr(pos, '\0');
	int count = 0;

	for (;;) {
		int len = utf8charlen(pos);

		if (pos + len > end) break;
		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
/* Whether the byte @p is a single-byte (ASCII) UTF-8 character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)

/* Whether the byte @p starts a UTF-8 character, i.e. is not a
 * continuation byte.  Note that @p is evaluated twice. */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Move back @pos characters from @current, but not beyond @start, which
 * is the leftmost position allowed.  Returns the resulting pointer, or
 * NULL if @current or @start is NULL or @pos is negative. */
unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	if (!current || !start || pos < 0)
		return NULL;

	while (pos > 0) {
		if (current == start)
			break;

		current--;
		/* A character has been passed only once its lead byte
		 * has been reached. */
		if (utf8_islead(*current))
			pos--;
	}

	return current;
}
331 /* Count number of standard terminal cells needed for displaying UTF-8
332 * character. */
334 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
336 unicode_val_T u;
338 if (end == NULL)
339 end = strchr(utf8_char, '\0');
341 if(!utf8_char || !end)
342 return -1;
344 u = utf8_to_unicode(&utf8_char, end);
346 return unicode_to_cell(u);
/* Count the standard terminal cells needed for displaying the UTF-8
 * text from @string up to @end.
 *
 * If @end is NULL, the text is taken to extend to its terminating NUL.
 * A trailing sequence that would extend past @end is not counted.
 *
 * Returns the number of cells, or -1 if @string is NULL or the width
 * of some character could not be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int cells = 0;

	/* This must be checked before strchr() gets to see the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	for (;;) {
		int charlen = utf8charlen(string);
		int cell;

		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the UTF-8 characters in the text from @string up to @end.
 *
 * If @end is NULL, the text is taken to extend to its terminating NUL.
 * A trailing sequence that would extend past @end is not counted.
 *
 * Returns the number of characters, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int chars = 0;

	/* This must be checked before strchr() gets to see the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	for (;;) {
		int charlen = utf8charlen(string);

		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the bytes from the beginning of @string that are needed for
 * displaying at most @max_cells cells.
 *
 * If @end is NULL, the text is taken to extend to its terminating NUL.
 *
 * Returns the number of bytes, or -1 if @string is NULL or the width
 * of some character could not be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* This must be checked before strchr() gets to see the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');
	if (!end)
		return -1;

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* A sequence cut short by @end: count only the bytes
		 * that really are there. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
439 /* Take @max steps forward from @string in the specified @way, but
440 * not going past @end. Return the resulting address. Store the
441 * number of steps taken to *@count, unless @count is NULL.
443 * This assumes the text is valid UTF-8, and @string and @end point to
444 * character boundaries. If not, it doesn't crash but the results may
445 * be inconsistent.
447 * This function can do some of the same jobs as utf8charlen(),
448 * utf8_cells2bytes(), and strlen_utf8(). */
449 unsigned char *
450 utf8_step_forward(unsigned char *string, unsigned char *end,
451 int max, enum utf8_step way, int *count)
453 int steps = 0;
454 unsigned char *current = string;
456 assert(string);
457 assert(max >= 0);
458 if_assert_failed goto invalid_arg;
459 if (end == NULL)
460 end = strchr(string, '\0');
462 switch (way) {
463 case UTF8_STEP_CHARACTERS:
464 while (steps < max && current < end) {
465 ++current;
466 if (utf8_islead(*current))
467 ++steps;
469 break;
471 case UTF8_STEP_CELLS_FEWER:
472 case UTF8_STEP_CELLS_MORE:
473 while (steps < max && current < end) {
474 unicode_val_T u;
475 unsigned char *prev = current;
476 int width;
478 u = utf8_to_unicode(&current, end);
479 if (u == UCS_NO_CHAR) {
480 /* Assume the incomplete sequence
481 * costs one cell. */
482 current = end;
483 ++steps;
484 break;
487 width = unicode_to_cell(u);
488 if (way == UTF8_STEP_CELLS_FEWER
489 && steps + width > max) {
490 /* Back off. */
491 current = prev;
492 break;
494 steps += width;
496 break;
498 default:
499 INTERNAL("impossible enum utf8_step");
502 invalid_arg:
503 if (count)
504 *count = steps;
505 return current;
508 /* Take @max steps backward from @string in the specified @way, but
509 * not going past @start. Return the resulting address. Store the
510 * number of steps taken to *@count, unless @count is NULL.
512 * This assumes the text is valid UTF-8, and @string and @start point
513 * to character boundaries. If not, it doesn't crash but the results
514 * may be inconsistent.
516 * This function can do some of the same jobs as utf8_prevchar(). */
517 unsigned char *
518 utf8_step_backward(unsigned char *string, unsigned char *start,
519 int max, enum utf8_step way, int *count)
521 int steps = 0;
522 unsigned char *current = string;
524 assert(string);
525 assert(start);
526 assert(max >= 0);
527 if_assert_failed goto invalid_arg;
529 switch (way) {
530 case UTF8_STEP_CHARACTERS:
531 while (steps < max && current > start) {
532 --current;
533 if (utf8_islead(*current))
534 ++steps;
536 break;
538 case UTF8_STEP_CELLS_FEWER:
539 case UTF8_STEP_CELLS_MORE:
540 while (steps < max) {
541 unsigned char *prev = current;
542 unsigned char *look;
543 unicode_val_T u;
544 int width;
546 if (current <= start)
547 break;
548 do {
549 --current;
550 } while (current > start && !utf8_islead(*current));
552 look = current;
553 u = utf8_to_unicode(&look, prev);
554 if (u == UCS_NO_CHAR) {
555 /* Assume the incomplete sequence
556 * costs one cell. */
557 width = 1;
558 } else
559 width = unicode_to_cell(u);
561 if (way == UTF8_STEP_CELLS_FEWER
562 && steps + width > max) {
563 /* Back off. */
564 current = prev;
565 break;
567 steps += width;
569 break;
571 default:
572 INTERNAL("impossible enum utf8_step");
575 invalid_arg:
576 if (count)
577 *count = steps;
578 return current;
582 * Find out number of standard terminal columns needed for displaying symbol
583 * (glyph) which represents Unicode character c.
585 * TODO: Use wcwidth when it is available. This seems to require:
586 * - Make the configure script check whether <wchar.h> and wcwidth exist.
587 * - Define _XOPEN_SOURCE and include <wchar.h>.
588 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
589 * matches ISO 10646 in all locales.)
590 * However, these do not suffice, because wcwidth depends on LC_CTYPE
591 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
592 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
593 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
594 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
595 * character is apparently not supported in all locales. Why is that?
596 * - Perhaps there is standardese that requires supported characters
597 * to be convertible to multibyte form. Then ELinks could just pick
598 * some UTF-8 locale for its wcwidth purposes.
599 * - Perhaps wcwidth can even return different nonnegative values for
600 * the same ISO 10646 character in different locales. Then ELinks
601 * would have to set LC_CTYPE to match at least the terminal's
602 * charset (which may differ from the LC_CTYPE environment variable,
603 * especially when the master process is serving a slave terminal).
604 * But there is no guarantee that the libc supports all the same
605 * charsets as ELinks does.
606 * For now, it seems safest to avoid the potentially locale-dependent
607 * libc version of wcwidth, and instead use a hardcoded mapping.
609 * @return 2 for double-width glyph, 1 for others.
610 * TODO: May be extended to return 0 for zero-width glyphs
611 * (like composing, maybe unprintable too).
613 NONSTATIC_INLINE int
614 unicode_to_cell(unicode_val_T c)
616 if (c >= 0x1100
617 && (c <= 0x115f /* Hangul Jamo */
618 || c == 0x2329
619 || c == 0x232a
620 || (c >= 0x2e80 && c <= 0xa4cf
621 && c != 0x303f) /* CJK ... Yi */
622 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
623 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
624 Ideographs */
625 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
626 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
627 || (c >= 0xffe0 && c <= 0xffe6)
628 || (c >= 0x20000 && c <= 0x2fffd)
629 || (c >= 0x30000 && c <= 0x3fffd)))
630 return 2;
632 return 1;
635 /* Fold the case of a Unicode character, so that hotkeys in labels can
636 * be compared case-insensitively. It is unspecified whether the
637 * result will be in upper or lower case. */
638 unicode_val_T
639 unicode_fold_label_case(unicode_val_T c)
641 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
642 return towlower(c);
643 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
644 /* For now, this supports only ASCII. It would be possible to
645 * use code generated from CaseFolding.txt of Unicode if the
646 * acknowledgements required by http://www.unicode.org/copyright.html
647 * were added to associated documentation of ELinks. */
648 if (c >= 0x41 && c <= 0x5A)
649 return c + 0x20;
650 else
651 return c;
652 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
654 #endif /* CONFIG_UTF8 */
656 NONSTATIC_INLINE unicode_val_T
657 utf8_to_unicode(unsigned char **string, const unsigned char *end)
659 unsigned char *str = *string;
660 unicode_val_T u;
661 int length;
663 length = utf8char_len_tab[str[0]];
665 if (str + length > end) {
666 return UCS_NO_CHAR;
669 switch (length) {
670 case 1: /* U+0000 to U+007F */
671 if (str[0] >= 0x80) {
672 invalid_utf8:
673 ++*string;
674 return UCS_REPLACEMENT_CHARACTER;
676 u = str[0];
677 break;
678 case 2: /* U+0080 to U+07FF */
679 if ((str[1] & 0xc0) != 0x80)
680 goto invalid_utf8;
681 u = (str[0] & 0x1f) << 6;
682 u += (str[1] & 0x3f);
683 if (u < 0x80)
684 goto invalid_utf8;
685 break;
686 case 3: /* U+0800 to U+FFFF, except surrogates */
687 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
688 goto invalid_utf8;
689 u = (str[0] & 0x0f) << 12;
690 u += ((str[1] & 0x3f) << 6);
691 u += (str[2] & 0x3f);
692 if (u < 0x800 || is_utf16_surrogate(u))
693 goto invalid_utf8;
694 break;
695 case 4: /* U+10000 to U+1FFFFF */
696 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
697 || (str[3] & 0xc0) != 0x80)
698 goto invalid_utf8;
699 u = (str[0] & 0x0f) << 18;
700 u += ((str[1] & 0x3f) << 12);
701 u += ((str[2] & 0x3f) << 6);
702 u += (str[3] & 0x3f);
703 if (u < 0x10000)
704 goto invalid_utf8;
705 break;
706 case 5: /* U+200000 to U+3FFFFFF */
707 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
708 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
709 goto invalid_utf8;
710 u = (str[0] & 0x0f) << 24;
711 u += ((str[1] & 0x3f) << 18);
712 u += ((str[2] & 0x3f) << 12);
713 u += ((str[3] & 0x3f) << 6);
714 u += (str[4] & 0x3f);
715 if (u < 0x200000)
716 goto invalid_utf8;
717 break;
718 case 6: /* U+4000000 to U+7FFFFFFF */
719 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
720 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
721 || (str[5] & 0xc0) != 0x80)
722 goto invalid_utf8;
723 u = (str[0] & 0x01) << 30;
724 u += ((str[1] & 0x3f) << 24);
725 u += ((str[2] & 0x3f) << 18);
726 u += ((str[3] & 0x3f) << 12);
727 u += ((str[4] & 0x3f) << 6);
728 u += (str[5] & 0x3f);
729 if (u < 0x4000000)
730 goto invalid_utf8;
731 break;
732 default:
733 INTERNAL("utf8char_len_tab out of range");
734 goto invalid_utf8;
736 *string = str + length;
737 return u;
740 /* The common part of cp2u and cp2utf_8. */
741 static unicode_val_T
742 cp2u_shared(const struct codepage_desc *from, unsigned char c)
744 unicode_val_T u = from->highhalf[c - 0x80];
746 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
747 return u;
750 /* Used for converting input from the terminal. */
751 unicode_val_T
752 cp2u(int from, unsigned char c)
754 from &= ~SYSTEM_CHARSET_FLAG;
756 /* UTF-8 is a multibyte codepage and cannot be handled with
757 * this function. */
758 assert(!is_cp_ptr_utf8(&codepages[from]));
759 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
761 if (c < 0x80) return c;
762 else return cp2u_shared(&codepages[from], c);
765 /* This slow and ugly code is used by the terminal utf_8_io */
766 const unsigned char *
767 cp2utf8(int from, int c)
769 from &= ~SYSTEM_CHARSET_FLAG;
771 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
772 return strings[c];
774 return encode_utf8(cp2u_shared(&codepages[from], c));
777 unicode_val_T
778 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
780 unicode_val_T ret;
782 if (is_cp_utf8(codepage))
783 return utf8_to_unicode(string, end);
785 if (*string >= end)
786 return UCS_NO_CHAR;
788 ret = cp2u(codepage, **string);
789 ++*string;
790 return ret;
794 #ifdef CONFIG_COMBINE
795 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
796 unicode_val_T **combined;
797 struct hash *combined_hash;
799 unicode_val_T
800 get_combined(unicode_val_T *data, int length)
802 struct hash_item *item;
803 unicode_val_T *key;
804 int i, indeks;
806 assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
807 if_assert_failed return UCS_NO_CHAR;
809 if (!combined_hash) combined_hash = init_hash8();
810 if (!combined_hash) return UCS_NO_CHAR;
811 item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
813 if (item) return (unicode_val_T)(long)item->value;
814 if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
816 key = mem_alloc((length + 1) * sizeof(*key));
817 if (!key) return UCS_NO_CHAR;
818 for (i = 0; i < length; i++)
819 key[i] = data[i];
820 key[i] = UCS_END_COMBINED;
822 last_combined++;
823 indeks = last_combined - UCS_BEGIN_COMBINED;
825 combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
826 if (!combined) {
827 mem_free(key);
828 last_combined--;
829 return UCS_NO_CHAR;
831 combined[indeks] = key;
832 item = add_hash_item(combined_hash, (unsigned char *)key,
833 length * sizeof(*data), (void *)(long)(last_combined));
834 if (!item) {
835 last_combined--;
836 mem_free(key);
837 return UCS_NO_CHAR;
839 return last_combined;
842 void
843 free_combined()
845 int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
847 if (combined_hash)
848 free_hash(&combined_hash);
849 for (i = 0; i < end; i++)
850 mem_free(combined[i]);
851 mem_free_if(combined);
853 #endif /* CONFIG_COMBINE */
856 static void
857 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
859 unsigned char *p = encode_utf8(u);
861 while (p[1]) {
862 if (ct[*p].t) ct = ct[*p].u.tbl;
863 else {
864 struct conv_table *nct;
866 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
867 if_assert_failed return;
869 nct = mem_calloc(256, sizeof(*nct));
870 if (!nct) return;
871 new_translation_table(nct);
872 ct[*p].t = 1;
873 ct[*p].u.tbl = nct;
874 ct = nct;
876 p++;
879 assertm(!ct[*p].t, "bad utf encoding #2");
880 if_assert_failed return;
882 if (ct[*p].u.str == no_str)
883 ct[*p].u.str = str;
886 /* A conversion table from some charset to UTF-8.
887 * If it is from UTF-8 to UTF-8, it converts each byte separately.
888 * Unlike in other translation tables, the strings in elements 0x80 to
889 * 0xFF are allocated dynamically. */
890 struct conv_table utf_table[256];
891 int utf_table_init = 1;
893 static void
894 free_utf_table(void)
896 int i;
898 /* Cast away const. */
899 for (i = 128; i < 256; i++)
900 mem_free((unsigned char *) utf_table[i].u.str);
903 static struct conv_table *
904 get_translation_table_to_utf8(int from)
906 int i;
907 static int lfr = -1;
909 if (from == -1) return NULL;
910 from &= ~SYSTEM_CHARSET_FLAG;
911 if (from == lfr) return utf_table;
912 lfr = from;
913 if (utf_table_init) {
914 memset(utf_table, 0, sizeof(utf_table));
915 utf_table_init = 0;
916 } else
917 free_utf_table();
919 for (i = 0; i < 128; i++)
920 utf_table[i].u.str = strings[i];
922 if (is_cp_ptr_utf8(&codepages[from])) {
923 for (i = 128; i < 256; i++)
924 utf_table[i].u.str = stracpy(strings[i]);
925 return utf_table;
928 for (i = 128; i < 256; i++) {
929 unicode_val_T u = codepages[from].highhalf[i - 0x80];
931 if (u == 0xFFFF)
932 utf_table[i].u.str = NULL;
933 else
934 utf_table[i].u.str = stracpy(encode_utf8(u));
937 for (i = 0; codepages[from].table[i].c; i++) {
938 unicode_val_T u = codepages[from].table[i].u;
940 if (!utf_table[codepages[from].table[i].c].u.str)
941 utf_table[codepages[from].table[i].c].u.str =
942 stracpy(encode_utf8(u));
945 for (i = 128; i < 256; i++)
946 if (!utf_table[i].u.str)
947 utf_table[i].u.str = stracpy(no_str);
949 return utf_table;
952 /* A conversion table between two charsets, where the target is not UTF-8. */
953 static struct conv_table table[256];
954 static int first = 1;
956 void
957 free_conv_table(void)
959 if (!utf_table_init) free_utf_table();
960 if (first) {
961 memset(table, 0, sizeof(table));
962 first = 0;
964 new_translation_table(table);
965 #ifdef HAVE_ICONV
966 if (iconv_cd != (iconv_t)-1) {
967 iconv_close(iconv_cd);
968 iconv_cd = (iconv_t)-1;
970 #endif
974 struct conv_table *
975 get_translation_table(int from, int to)
977 static int lfr = -1;
978 static int lto = -1;
980 from &= ~SYSTEM_CHARSET_FLAG;
981 to &= ~SYSTEM_CHARSET_FLAG;
982 if (first) {
983 memset(table, 0, sizeof(table));
984 first = 0;
987 if (codepages[from].iconv) {
988 struct conv_table *table2 = get_translation_table_to_utf8(34);
990 if (table2) table2->iconv_cp = from;
991 return table2;
994 if (/*from == to ||*/ from == -1 || to == -1)
995 return NULL;
996 if (is_cp_ptr_utf8(&codepages[to])) {
997 struct conv_table *table2 = get_translation_table_to_utf8(from);
999 if (table2) table2->iconv_cp = -1;
1000 return table2;
1002 if (from == lfr && to == lto)
1003 return table;
1004 lfr = from;
1005 lto = to;
1006 new_translation_table(table);
1008 if (is_cp_ptr_utf8(&codepages[from])) {
1009 int i;
1011 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1012 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1013 add_utf8(table, UCS_SOFT_HYPHEN, "");
1015 for (i = 0x80; i <= 0xFF; i++)
1016 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1017 add_utf8(table,
1018 codepages[to].highhalf[i - 0x80],
1019 strings[i]);
1021 for (i = 0; codepages[to].table[i].c; i++)
1022 add_utf8(table, codepages[to].table[i].u,
1023 strings[codepages[to].table[i].c]);
1025 for (i = 0; unicode_7b[i].x != -1; i++)
1026 if (unicode_7b[i].x >= 0x80)
1027 add_utf8(table, unicode_7b[i].x,
1028 unicode_7b[i].s);
1030 } else {
1031 int i;
1033 for (i = 128; i < 256; i++) {
1034 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1035 const unsigned char *u;
1037 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1038 if (u) table[i].u.str = u;
1043 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first byte that differs, depending on
 * whether the byte of @s1 is the greater or the lesser one.  If all
 * @l2 bytes are equal, returns 0 if @s2 ends there too, and -1 if @s2
 * is longer. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
1060 /* Entity cache debugging purpose. */
1061 #if 0
1062 #define DEBUG_ENTITY_CACHE
1063 #else
1064 #undef DEBUG_ENTITY_CACHE
1065 #endif
/* One cached result of an entity lookup. */
struct entity_cache {
	/* How many times this entry has been found in the cache. */
	unsigned int hits;

	/* Length of the entity name. */
	int strlen;

	/* The codepage that @result is in. */
	int encoding;

	/* The string that the lookup returns for this entry. */
	const unsigned char *result;

	/* The entity name. */
	unsigned char str[20]; /* Suffice in any case. */
};
1075 /* comparison function for qsort() */
1076 static int
1077 hits_cmp(const void *v1, const void *v2)
1079 const struct entity_cache *a = v1, *b = v2;
1081 if (a->hits == b->hits) return 0;
1082 if (a->hits > b->hits) return -1;
1083 else return 1;
1086 static int
1087 compare_entities(const void *key_, const void *element_)
1089 struct string *key = (struct string *) key_;
1090 struct entity *element = (struct entity *) element_;
1091 int length = key->length;
1092 unsigned char *first = key->source;
1093 unsigned char *second = element->s;
1095 return xxstrcmp(first, second, length);
1098 const unsigned char *
1099 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1101 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1102 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1103 will go in [0] table */
1104 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1105 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1106 unsigned int slen = 0;
1107 const unsigned char *result = NULL;
1109 /* Note that an object of static storage duration is automatically
1110 * initialised to zero in C. */
1112 if (strlen <= 0) return NULL;
1114 #ifdef CONFIG_UTF8
1115 /* TODO: caching UTF-8 */
1116 encoding &= ~SYSTEM_CHARSET_FLAG;
1117 if (is_cp_ptr_utf8(&codepages[encoding]))
1118 goto skip;
1119 #endif /* CONFIG_UTF8 */
1121 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1122 * + google + slashdot + websites that result from a search for test on google,
1123 * + various ones) show quite impressive improvment:
1124 * Top ten is:
1125 * 0: hits=2459 l=4 st='nbsp'
1126 * 1: hits=2152 l=6 st='eacute'
1127 * 2: hits=235 l=6 st='egrave'
1128 * 3: hits=136 l=6 st='agrave'
1129 * 4: hits=100 l=3 st='amp'
1130 * 5: hits=40 l=5 st='laquo'
1131 * 6: hits=8 l=4 st='copy'
1132 * 7: hits=5 l=2 st='gt'
1133 * 8: hits=2 l=2 st='lt'
1134 * 9: hits=1 l=6 st='middot'
1136 * Most of the time cache hit ratio is near 95%.
1138 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1139 * count is kept < 2 (worst case 1.58). Not so bad ;)
1141 * --Zas */
1143 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1144 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1146 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1147 int i;
1149 for (i = 0; i < nb_entity_cache[slen]; i++) {
1150 if (entity_cache[slen][i].encoding == encoding
1151 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1152 #ifdef DEBUG_ENTITY_CACHE
1153 static double total_iter = 0;
1154 static unsigned long hit_count = 0;
1156 total_iter += i + 1;
1157 hit_count++;
1158 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1159 #endif
1160 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1161 entity_cache[slen][i].hits++;
1162 return entity_cache[slen][i].result;
1165 #ifdef DEBUG_ENTITY_CACHE
1166 fprintf(stderr, "miss\n");
1167 #endif
1169 #ifdef CONFIG_UTF8
1170 skip:
1171 #endif /* CONFIG_UTF8 */
1172 if (*str == '#') { /* Numeric entity. */
1173 int l = (int) strlen;
1174 unsigned char *st = (unsigned char *) str;
1175 unicode_val_T n = 0;
1177 if (l == 1) goto end; /* &#; ? */
1178 st++, l--;
1179 if ((*st | 32) == 'x') { /* Hexadecimal */
1181 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1182 st++, l--;
1183 do {
1184 unsigned char c = (*(st++) | 32);
1186 if (isdigit(c))
1187 n = (n << 4) | (c - '0');
1188 else if (isxdigit(c))
1189 n = (n << 4) | (c - 'a' + 10);
1190 else
1191 goto end; /* Bad char. */
1192 } while (--l);
1193 } else { /* Decimal */
1194 if (l > 10) goto end; /* 4294967295 max. */
1195 do {
1196 unsigned char c = *(st++);
1198 if (isdigit(c))
1199 n = n * 10 + c - '0';
1200 else
1201 goto end; /* Bad char. */
1202 /* Limit to 0xFFFFFFFF. */
1203 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1204 goto end;
1205 } while (--l);
1208 result = u2cp(n, encoding);
1210 #ifdef DEBUG_ENTITY_CACHE
1211 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1212 #endif
1213 } else { /* Text entity. */
1214 struct string key = INIT_STRING((unsigned char *) str, strlen);
1215 struct entity *element = bsearch((void *) &key, entities,
1216 N_ENTITIES,
1217 sizeof(*element),
1218 compare_entities);
1220 if (element) result = u2cp(element->c, encoding);
1223 #ifdef CONFIG_UTF8
1224 if (is_cp_ptr_utf8(&codepages[encoding])) {
1225 return result;
1227 #endif /* CONFIG_UTF8 */
1228 end:
1229 /* Take care of potential buffer overflow. */
1230 if (strlen < sizeof(entity_cache[slen][0].str)) {
1231 struct entity_cache *ece;
1233 /* Sort entries by hit order. */
1234 if (nb_entity_cache[slen] > 1)
1235 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1236 sizeof(entity_cache[slen][0]), hits_cmp);
1238 /* Increment number of cache entries if possible.
1239 * Else, just replace the least used entry. */
1240 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1241 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1243 /* Copy new entry to cache. */
1244 ece->hits = 1;
1245 ece->strlen = strlen;
1246 ece->encoding = encoding;
1247 ece->result = result;
1248 memcpy(ece->str, str, strlen);
1249 ece->str[strlen] = '\0';
1252 #ifdef DEBUG_ENTITY_CACHE
1253 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1254 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1257 unsigned int i;
1259 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1260 for (i = 0; i < nb_entity_cache[slen] ; i++)
1261 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1262 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1263 entity_cache[slen][i].str);
1264 fprintf(stderr, "-----------------\n");
1266 #endif /* DEBUG_ENTITY_CACHE */
1268 return result;
1271 unsigned char *
1272 convert_string(struct conv_table *convert_table,
1273 unsigned char *chars2, int charslen2, int cp,
1274 enum convert_string_mode mode, int *length,
1275 void (*callback)(void *data, unsigned char *buf, int buflen),
1276 void *callback_data)
1278 unsigned char *buffer;
1279 int bufferpos = 0;
1280 int charspos = 0;
1281 unsigned char *chars = chars2;
1282 int charslen = charslen2;
1284 #ifdef HAVE_ICONV
1285 static char iconv_input[256];
1286 static char iconv_output[256 * 8];
1287 static size_t iconv_offset;
1288 static int iconv_cp;
1289 static size_t iconv_inleft;
1290 size_t iconv_outleft = 256 * 8;
1291 int loop = 0;
1292 int is_iconv = 0;
1293 int chars_offset = 0;
1295 if (!convert_table && !memchr(chars, '&', charslen)) {
1296 if (callback) {
1297 if (charslen) callback(callback_data, chars, charslen);
1298 return NULL;
1299 } else {
1300 return memacpy(chars, charslen);
1304 if (cp >= 0) {
1305 if (convert_table && convert_table->iconv_cp > 0) {
1306 is_iconv = 1;
1307 cp = convert_table->iconv_cp;
1308 } else {
1309 is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1312 #endif
1314 /* Buffer allocation */
1316 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1317 if (!buffer) return NULL;
1319 #ifdef HAVE_ICONV
1320 if (is_iconv) {
1321 int v;
1322 size_t before, to_copy;
1323 char *outp, *inp;
1325 if (iconv_cd >= 0) {
1326 if (cp != iconv_cp) {
1327 iconv_close(iconv_cd);
1328 iconv_cd = (iconv_t)-1;
1331 if (iconv_cd == (iconv_t)-1) {
1332 iconv_offset = 0;
1333 iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1334 if (iconv_cd == (iconv_t)-1) {
1335 mem_free(buffer);
1336 return NULL;
1338 iconv_cp = cp;
1340 repeat:
1341 to_copy = charslen2 - chars_offset;
1342 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1343 memcpy(iconv_input + iconv_offset, chars + chars_offset, to_copy);
1344 iconv_outleft = 256 * 8;
1345 iconv_inleft = iconv_offset + to_copy;
1346 inp = iconv_input;
1347 outp = iconv_output;
1348 before = iconv_inleft;
1349 again:
1350 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1351 chars_offset += before - iconv_inleft;
1352 charslen = 256 * 8 - iconv_outleft;
1354 chars = (unsigned char *)iconv_output;
1355 charspos = 0;
1357 if (v == -1) {
1358 switch (errno) {
1359 case EINVAL:
1360 memcpy(iconv_input, inp, iconv_inleft);
1361 iconv_offset = iconv_inleft;
1362 break;
1363 case EILSEQ:
1364 chars_offset++;
1365 iconv_inleft--;
1366 inp++;
1367 goto again;
1368 break;
1369 default:
1370 iconv_offset = 0;
1372 } else {
1373 iconv_offset = 0;
1376 loop = chars_offset < charslen2;
1378 #endif
1379 /* Iterate ;-) */
1381 while (charspos < charslen) {
1382 const unsigned char *translit;
1384 #define PUTC do { \
1385 buffer[bufferpos++] = chars[charspos++]; \
1386 translit = ""; \
1387 goto flush; \
1388 } while (0)
1390 if (chars[charspos] != '&') {
1391 struct conv_table *t;
1392 int i;
1394 if (chars[charspos] < 128 || !convert_table) PUTC;
1396 t = convert_table;
1397 i = charspos;
1399 while (t[chars[i]].t) {
1400 t = t[chars[i++]].u.tbl;
1401 if (i >= charslen) PUTC;
1404 translit = t[chars[i]].u.str;
1405 charspos = i + 1;
1407 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1408 PUTC;
1410 } else {
1411 int start = charspos + 1;
1412 int i = start;
1414 while (i < charslen
1415 && (isasciialpha(chars[i])
1416 || isdigit(chars[i])
1417 || (chars[i] == '#')))
1418 i++;
1420 /* This prevents bug 213: we were expanding "entities"
1421 * in URL query strings. */
1422 /* XXX: But this disables &nbsp&nbsp usage, which
1423 * appears to be relatively common! --pasky */
1424 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1425 && i > start
1426 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1427 translit = get_entity_string(&chars[start], i - start,
1428 cp);
1429 if (chars[i] != ';') {
1430 /* Eat &nbsp &nbsp<foo> happily, but
1431 * pull back from the character after
1432 * entity string if it is not the valid
1433 * terminator. */
1434 i--;
1437 if (!translit) PUTC;
1438 charspos = i + (i < charslen);
1439 } else PUTC;
1442 if (!translit[0]) continue;
1444 if (!translit[1]) {
1445 buffer[bufferpos++] = translit[0];
1446 translit = "";
1447 goto flush;
1450 while (*translit) {
1451 unsigned char *new;
1453 buffer[bufferpos++] = *(translit++);
1454 flush:
1455 if (bufferpos & (ALLOC_GR - 1)) continue;
1457 if (callback) {
1458 buffer[bufferpos] = 0;
1459 callback(callback_data, buffer, bufferpos);
1460 bufferpos = 0;
1461 } else {
1462 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1463 if (!new) {
1464 mem_free(buffer);
1465 return NULL;
1467 buffer = new;
1470 #undef PUTC
1473 #ifdef HAVE_ICONV
1474 if (loop) goto repeat;
1475 #endif
1476 /* Say bye */
1478 buffer[bufferpos] = 0;
1479 if (length) *length = bufferpos;
1481 if (callback) {
1482 if (bufferpos) callback(callback_data, buffer, bufferpos);
1483 mem_free(buffer);
1484 return NULL;
1485 } else {
1486 return buffer;
1491 #ifndef USE_FASTFIND
1493 get_cp_index(const unsigned char *name)
1495 int i, a;
1496 int syscp = 0;
1498 if (!c_strcasecmp(name, "System")) {
1499 #if HAVE_LANGINFO_CODESET
1500 name = nl_langinfo(CODESET);
1501 syscp = SYSTEM_CHARSET_FLAG;
1502 #else
1503 name = "us-ascii";
1504 #endif
1507 for (i = 0; codepages[i].name; i++) {
1508 for (a = 0; codepages[i].aliases[a]; a++) {
1509 /* In the past, we looked for the longest substring
1510 * in all the names; it is way too expensive, though:
1512 * % cumulative self self total
1513 * time seconds seconds calls us/call us/call name
1514 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1516 * Anything called from redraw_screen() is in fact
1517 * relatively expensive, even if it's called just
1518 * once. So we will do a simple strcasecmp() here.
1521 if (!c_strcasecmp(name, codepages[i].aliases[a]))
1522 return i | syscp;
1526 if (syscp) {
1527 return get_cp_index("us-ascii") | syscp;
1528 } else {
1529 return -1;
1533 #else
/* Cursor of the alias enumeration handed to fastfind: the current entry
 * of codepages[] and the current alias within that entry. */
static unsigned int i_name;
static unsigned int i_alias;

/* Rewind the enumeration to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
	i_name = i_alias = 0;
}
1546 /* Returns a pointer to a struct that contains current key and data pointers
1547 * and increment internal pointer. It returns NULL when key is NULL. */
1548 struct fastfind_key_value *
1549 charsets_list_next(void)
1551 static struct fastfind_key_value kv;
1553 if (!codepages[i_name].name) return NULL;
1555 kv.key = codepages[i_name].aliases[i_alias];
1556 kv.data = (void *) &codepages[i_name]; /* cast away const */
1558 if (codepages[i_name].aliases[i_alias + 1])
1559 i_alias++;
1560 else {
1561 i_name++;
1562 i_alias = 0;
1565 return &kv;
1568 static struct fastfind_index ff_charsets_index
1569 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1571 /* It searchs for a charset named @name or one of its aliases and
1572 * returns index for it or -1 if not found. */
1574 get_cp_index(const unsigned char *name)
1576 const struct codepage_desc *codepage;
1577 int syscp = 0;
1579 if (!c_strcasecmp(name, "System")) {
1580 #if HAVE_LANGINFO_CODESET
1581 name = nl_langinfo(CODESET);
1582 syscp = SYSTEM_CHARSET_FLAG;
1583 #else
1584 name = "us-ascii";
1585 #endif
1588 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1589 if (codepage) {
1590 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1591 return (codepage - codepages) | syscp;
1593 } else if (syscp) {
1594 return get_cp_index("us-ascii") | syscp;
1596 } else {
1597 return -1;
1601 #endif /* USE_FASTFIND */
/* Build the fastfind index used by get_cp_index(); does nothing when
 * fastfind support is not compiled in. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the fastfind index set up by init_charsets_lookup(); does
 * nothing when fastfind support is not compiled in. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1619 /* Get the codepage's name for displaying to the user, or NULL if
1620 * @cp_index is one past the end. In the future, we might want to
1621 * localize these with gettext. So it may be best not to use this
1622 * function if the name will have to be converted back to an
1623 * index. */
1624 unsigned char *
1625 get_cp_name(int cp_index)
1627 if (cp_index < 0) return "none";
1628 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1630 return codepages[cp_index].name;
1633 /* Get the codepage's name for saving to a configuration file. These
1634 * names can be converted back to indexes, even in future versions of
1635 * ELinks. */
1636 unsigned char *
1637 get_cp_config_name(int cp_index)
1639 if (cp_index < 0) return "none";
1640 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1641 if (!codepages[cp_index].aliases) return NULL;
1643 return codepages[cp_index].aliases[0];
1646 /* Get the codepage's name for sending to a library or server that
1647 * understands MIME charset names. This function irreversibly maps
1648 * the "System" codepage to the underlying charset. */
1649 unsigned char *
1650 get_cp_mime_name(int cp_index)
1652 if (cp_index < 0) return "none";
1653 cp_index &= ~SYSTEM_CHARSET_FLAG;
1654 if (!codepages[cp_index].aliases) return NULL;
1656 return codepages[cp_index].aliases[0];
1660 is_cp_utf8(int cp_index)
1662 cp_index &= ~SYSTEM_CHARSET_FLAG;
1663 return is_cp_ptr_utf8(&codepages[cp_index]);
1666 /* This function will be used by the xhtml parser. */
1667 const uint16_t *
1668 get_cp_highhalf(const unsigned char *name)
1670 int cp = get_cp_index(name);
1672 if (cp < 0) return NULL;
1673 cp &= ~SYSTEM_CHARSET_FLAG;
1674 return codepages[cp].highhalf;