Retry only for https protocol
[elinks.git] / src / intl / charsets.c
blob5b30c855d3c75500fe2f565ae0c2d28f982a762e
1 /* Charsets convertor */
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE /* strcasecmp() */
5 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
11 #if HAVE_LANGINFO_CODESET
12 #include <langinfo.h>
13 #endif
15 #include <ctype.h>
16 #include <stdlib.h>
17 #if HAVE_WCTYPE_H
18 #include <wctype.h>
19 #endif
21 #ifdef HAVE_ICONV
22 #include <errno.h>
23 #include <iconv.h>
24 #endif
26 #include "elinks.h"
28 #include "document/options.h"
29 #include "intl/charsets.h"
30 #include "util/conv.h"
31 #include "util/error.h"
32 #include "util/fastfind.h"
33 #include "util/hash.h"
34 #include "util/memory.h"
35 #include "util/string.h"
38 /* Fix namespace clash on MacOS. */
39 #define table table_elinks
/* One extra (codepage byte, Unicode value) pairing of a codepage. */
struct table_entry {
	/* Byte value in the codepage.  Tables of these entries are walked
	 * until an entry with c == 0 is found. */
	unsigned char c;

	/* Unicode value paired with @c.  In principle this would be
	 * unicode_val_T, but every value currently in codepage.inc fits in
	 * 16 bits, so uint16_t halves sizeof(struct table_entry) from 8
	 * bytes to 4.  If other characters are ever needed,
	 * "unicode_val_T u : 24" might be a possibility, although it looks
	 * a little unportable: bitfields are in principle restricted to
	 * int, which may be 16-bit. */
	uint16_t u;
};
/* Description of one codepage known to this file. */
struct codepage_desc {
	unsigned char *name;
	unsigned char *const *aliases;

	/* Unicode mappings of the codepage bytes 0x80...0xFF; index 0
	 * describes byte 0x80.  (Bytes 0x00...0x7F are assumed to be ASCII
	 * in every codepage.)  All current values fit in 16 bits, so they
	 * are stored as uint16_t rather than unicode_val_T.  A byte the
	 * codepage does not use is mapped to 0xFFFF, which the C code
	 * turns into UCS_REPLACEMENT_CHARACTER where appropriate.  (U+FFFF
	 * is reserved and will never be assigned as a character.) */
	const uint16_t *highhalf;

	/* When one byte of the codepage corresponds to several Unicode
	 * characters, the preferred character lives in @highhalf and the
	 * remaining ones are listed here.  This table is not used for
	 * translating from the codepage to Unicode. */
	const struct table_entry *table;

	/* Whether iconv is used for translation. */
	unsigned int iconv:1;
};
77 #include "intl/codepage.inc"
78 #include "intl/uni_7b.inc"
79 #include "intl/entity.inc"
81 /* Declare the external-linkage inline functions defined in this file.
82 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
83 * called. The functions are not declared inline in charsets.h
84 * because C99 6.7.4p6 says that every external-linkage function
85 * declared inline shall be defined in the same translation unit.
86 * The non-inline declarations in charsets.h also make sure that the
87 * compiler emits global definitions for the symbols so that the
88 * functions can be called from other translation units. */
89 NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
90 NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
91 NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
92 NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
93 const unsigned char *end);
/* One-character, NUL-terminated strings indexed by byte value: for
 * almost every byte b, strings[b] is the string holding just b.
 * NOTE(review): entries 027 and 037 contain "\033" instead of their own
 * byte value -- confirm whether this is intentional. */
static const char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
130 #ifdef HAVE_ICONV
/* iconv conversion descriptor.  (iconv_t)-1 marks "not open";
 * free_conv_table() closes an open descriptor and restores that value. */
131 static iconv_t iconv_cd = (iconv_t)-1;
132 #endif
134 static void
135 free_translation_table(struct conv_table *p)
137 int i;
139 for (i = 0; i < 256; i++)
140 if (p[i].t)
141 free_translation_table(p[i].u.tbl);
143 mem_free(p);
/* The string stored in conversion tables where no correct conversion
 * exists.  Code recognises it by comparing addresses, so it has to be
 * a named array instead of a pointer to a string literal: that way it
 * cannot share storage with another literal that happens to contain
 * the same characters. */
static const unsigned char no_str[] = "*";
153 static void
154 new_translation_table(struct conv_table *p)
156 int i;
158 for (i = 0; i < 256; i++)
159 if (p[i].t)
160 free_translation_table(p[i].u.tbl);
161 for (i = 0; i < 128; i++) {
162 p[i].t = 0;
163 p[i].u.str = strings[i];
165 for (; i < 256; i++) {
166 p[i].t = 0;
167 p[i].u.str = no_str;
169 p->iconv_cp = -1;
/* Binary search in the array @table of @entries elements for an element
 * whose member @entry equals @key.  The search narrows the range as if
 * the elements were sorted by @entry in ascending order.  The index of
 * the match is stored in @result; -1 is stored when there is none.
 * The macro arguments are evaluated more than once. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	for (;;) {							\
		long _mid;						\
									\
		if (_lo > _hi) {					\
			(result) = -1;					\
			break;						\
		}							\
									\
		_mid = (_lo + _hi) / 2;					\
		if ((table)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		} else if ((table)[_mid].entry > (key)) {		\
			_hi = _mid - 1;					\
		} else if ((table)[_mid].entry < (key)) {		\
			_lo = _mid + 1;					\
		}							\
	}								\
}
188 static const unicode_val_T strange_chars[32] = {
189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
/* Bit that may be set in a codepage number; the functions in this file
 * clear it before using the number as an index into codepages[]. */
#define SYSTEM_CHARSET_FLAG 128

/* Whether the codepage description @cp_ptr is the UTF-8 one, which is
 * recognised by its alias list. */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
198 const unsigned char *
199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
201 int j;
202 int s;
204 if (u < 128) return strings[u];
206 if (u < 0xa0) {
207 u = strange_chars[u - 0x80];
208 if (!u) return NULL;
211 to &= ~SYSTEM_CHARSET_FLAG;
213 if (is_cp_ptr_utf8(&codepages[to]))
214 return encode_utf8(u);
216 /* To mark non breaking spaces in non-UTF-8 strings, we use a
217 * special char NBSP_CHAR. */
218 if (u == UCS_NO_BREAK_SPACE) {
219 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
220 else /* NBSP_MODE_ASCII */ return " ";
222 if (u == UCS_SOFT_HYPHEN) return "";
224 if (u < 0xFFFF)
225 for (j = 0; j < 0x80; j++)
226 if (codepages[to].highhalf[j] == u)
227 return strings[0x80 + j];
228 for (j = 0; codepages[to].table[j].c; j++)
229 if (codepages[to].table[j].u == u)
230 return strings[codepages[to].table[j].c];
232 BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
233 if (s != -1) return unicode_7b[s].s;
235 return no_str;
238 static unsigned char utf_buffer[7];
240 NONSTATIC_INLINE unsigned char *
241 encode_utf8(unicode_val_T u)
243 memset(utf_buffer, 0, 7);
245 if (u < 0x80)
246 utf_buffer[0] = u;
247 else if (u < 0x800)
248 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
249 utf_buffer[1] = 0x80 | (u & 0x3f);
250 else if (u < 0x10000)
251 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
252 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
253 utf_buffer[2] = 0x80 | (u & 0x3f);
254 else if (u < 0x200000)
255 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
256 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
257 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
258 utf_buffer[3] = 0x80 | (u & 0x3f);
259 else if (u < 0x4000000)
260 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
261 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
262 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
263 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
264 utf_buffer[4] = 0x80 | (u & 0x3f);
265 else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
266 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
267 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
268 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
269 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
270 utf_buffer[5] = 0x80 | (u & 0x3f);
272 return utf_buffer;
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) are
 * given length 1 here; callers have to treat them specially. */
static const char utf8char_len_tab[256] = {
	/* 0x00...0x7F */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80...0xBF */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xC0...0xDF */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xE0...0xFF */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
288 #ifdef CONFIG_UTF8
289 NONSTATIC_INLINE int
290 utf8charlen(const unsigned char *p)
292 return p ? utf8char_len_tab[*p] : 0;
/* Count the UTF-8 characters in the NUL-terminated string *@str.
 * Counting stops at the first sequence that would extend past the
 * terminating NUL.  On return *@str points to where counting stopped. */
int
strlen_utf8(unsigned char **str)
{
	unsigned char *pos = *str;
	unsigned char *end = strchr((const char *)pos, '\0');
	int count = 0;

	for (;;) {
		int len = utf8charlen(pos);

		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
/* Whether byte @p is a complete one-byte character (top bit clear). */
#define utf8_issingle(p) (((p) & 0x80) == 0)

/* Whether byte @p starts a character, i.e. is either a single-byte
 * character or has its two top bits set.  That is the same as "not a
 * continuation byte (10xxxxxx)", and writing it this way evaluates @p
 * only once, so arguments with side effects are safe. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
314 /* Start from @current and move back to @pos char. This pointer return. The
315 * most left pointer is @start. */
316 unsigned char *
317 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
319 if (current == NULL || start == NULL || pos < 0)
320 return NULL;
321 while (pos > 0 && current != start) {
322 current--;
323 if (utf8_islead(*current))
324 pos--;
326 return current;
329 /* Count number of standard terminal cells needed for displaying UTF-8
330 * character. */
332 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
334 unicode_val_T u;
336 if (end == NULL)
337 end = strchr((const char *)utf8_char, '\0');
339 if(!utf8_char || !end)
340 return -1;
342 u = utf8_to_unicode(&utf8_char, end);
344 return unicode_to_cell(u);
/* Count the standard terminal cells needed to display the UTF-8 text
 * from @string up to @end.
 *
 * If @end is NULL, the terminating NUL of @string is used.  A sequence
 * that would extend past @end is not counted.
 *
 * Returns the number of cells, or -1 if @string is NULL or a character
 * width cannot be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Validate before strchr(): passing it NULL is undefined. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the UTF-8 characters in the text from @string up to @end.
 *
 * If @end is NULL, the terminating NUL of @string is used.  A sequence
 * that would extend past @end is not counted.
 *
 * Returns the number of characters, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Validate before strchr(): passing it NULL is undefined. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/**
 * Count the number of bytes from the beginning of @string that are
 * needed for displaying at most @max_cells terminal cells.
 *
 * If @end is NULL, the terminating NUL of @string is used.  The result
 * never reaches past @end.
 *
 * Returns the byte count, or -1 if @string is NULL or a character width
 * cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Validate before strchr(): passing it NULL is undefined. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Clamp when the last sequence runs past @end. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
437 /* Take @max steps forward from @string in the specified @way, but
438 * not going past @end. Return the resulting address. Store the
439 * number of steps taken to *@count, unless @count is NULL.
441 * This assumes the text is valid UTF-8, and @string and @end point to
442 * character boundaries. If not, it doesn't crash but the results may
443 * be inconsistent.
445 * This function can do some of the same jobs as utf8charlen(),
446 * utf8_cells2bytes(), and strlen_utf8(). */
447 unsigned char *
448 utf8_step_forward(unsigned char *string, unsigned char *end,
449 int max, enum utf8_step way, int *count)
451 int steps = 0;
452 unsigned char *current = string;
454 assert(string);
455 assert(max >= 0);
456 if_assert_failed goto invalid_arg;
457 if (end == NULL)
458 end = strchr((const char *)string, '\0');
460 switch (way) {
461 case UTF8_STEP_CHARACTERS:
462 while (steps < max && current < end) {
463 ++current;
464 if (utf8_islead(*current))
465 ++steps;
467 break;
469 case UTF8_STEP_CELLS_FEWER:
470 case UTF8_STEP_CELLS_MORE:
471 while (steps < max && current < end) {
472 unicode_val_T u;
473 unsigned char *prev = current;
474 int width;
476 u = utf8_to_unicode(&current, end);
477 if (u == UCS_NO_CHAR) {
478 /* Assume the incomplete sequence
479 * costs one cell. */
480 current = end;
481 ++steps;
482 break;
485 width = unicode_to_cell(u);
486 if (way == UTF8_STEP_CELLS_FEWER
487 && steps + width > max) {
488 /* Back off. */
489 current = prev;
490 break;
492 steps += width;
494 break;
496 default:
497 INTERNAL("impossible enum utf8_step");
500 invalid_arg:
501 if (count)
502 *count = steps;
503 return current;
506 /* Take @max steps backward from @string in the specified @way, but
507 * not going past @start. Return the resulting address. Store the
508 * number of steps taken to *@count, unless @count is NULL.
510 * This assumes the text is valid UTF-8, and @string and @start point
511 * to character boundaries. If not, it doesn't crash but the results
512 * may be inconsistent.
514 * This function can do some of the same jobs as utf8_prevchar(). */
515 unsigned char *
516 utf8_step_backward(unsigned char *string, unsigned char *start,
517 int max, enum utf8_step way, int *count)
519 int steps = 0;
520 unsigned char *current = string;
522 assert(string);
523 assert(start);
524 assert(max >= 0);
525 if_assert_failed goto invalid_arg;
527 switch (way) {
528 case UTF8_STEP_CHARACTERS:
529 while (steps < max && current > start) {
530 --current;
531 if (utf8_islead(*current))
532 ++steps;
534 break;
536 case UTF8_STEP_CELLS_FEWER:
537 case UTF8_STEP_CELLS_MORE:
538 while (steps < max) {
539 unsigned char *prev = current;
540 unsigned char *look;
541 unicode_val_T u;
542 int width;
544 if (current <= start)
545 break;
546 do {
547 --current;
548 } while (current > start && !utf8_islead(*current));
550 look = current;
551 u = utf8_to_unicode(&look, prev);
552 if (u == UCS_NO_CHAR) {
553 /* Assume the incomplete sequence
554 * costs one cell. */
555 width = 1;
556 } else
557 width = unicode_to_cell(u);
559 if (way == UTF8_STEP_CELLS_FEWER
560 && steps + width > max) {
561 /* Back off. */
562 current = prev;
563 break;
565 steps += width;
567 break;
569 default:
570 INTERNAL("impossible enum utf8_step");
573 invalid_arg:
574 if (count)
575 *count = steps;
576 return current;
580 * Find out number of standard terminal collumns needed for displaying symbol
581 * (glyph) which represents Unicode character c.
583 * TODO: Use wcwidth when it is available. This seems to require:
584 * - Make the configure script check whether <wchar.h> and wcwidth exist.
585 * - Define _XOPEN_SOURCE and include <wchar.h>.
586 * - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
587 * matches ISO 10646 in all locales.)
588 * However, these do not suffice, because wcwidth depends on LC_CTYPE
589 * in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
590 * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
591 * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
592 * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
593 * character is apparently not supported in all locales. Why is that?
594 * - Perhaps there is standardese that requires supported characters
595 * to be convertable to multibyte form. Then ELinks could just pick
596 * some UTF-8 locale for its wcwidth purposes.
597 * - Perhaps wcwidth can even return different nonnegative values for
598 * the same ISO 10646 character in different locales. Then ELinks
599 * would have to set LC_CTYPE to match at least the terminal's
600 * charset (which may differ from the LC_CTYPE environment variable,
601 * especially when the master process is serving a slave terminal).
602 * But there is no guarantee that the libc supports all the same
603 * charsets as ELinks does.
604 * For now, it seems safest to avoid the potentially locale-dependent
605 * libc version of wcwidth, and instead use a hardcoded mapping.
607 * @return 2 for double-width glyph, 1 for others.
608 * 0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
610 NONSTATIC_INLINE int
611 unicode_to_cell(unicode_val_T c)
613 if (c == 0x200e || c == 0x200f)
614 return 0;
615 if (c >= 0x1100
616 && (c <= 0x115f /* Hangul Jamo */
617 || c == 0x2329
618 || c == 0x232a
619 || (c >= 0x2e80 && c <= 0xa4cf
620 && c != 0x303f) /* CJK ... Yi */
621 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
622 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
623 Ideographs */
624 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
625 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
626 || (c >= 0xffe0 && c <= 0xffe6)
627 || (c >= 0x20000 && c <= 0x2fffd)
628 || (c >= 0x30000 && c <= 0x3fffd)))
629 return 2;
631 return 1;
634 /* Fold the case of a Unicode character, so that hotkeys in labels can
635 * be compared case-insensitively. It is unspecified whether the
636 * result will be in upper or lower case. */
637 unicode_val_T
638 unicode_fold_label_case(unicode_val_T c)
640 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
641 return towlower(c);
642 #else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
643 /* For now, this supports only ASCII. It would be possible to
644 * use code generated from CaseFolding.txt of Unicode if the
645 * acknowledgements required by http://www.unicode.org/copyright.html
646 * were added to associated documentation of ELinks. */
647 if (c >= 0x41 && c <= 0x5A)
648 return c + 0x20;
649 else
650 return c;
651 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
653 #endif /* CONFIG_UTF8 */
655 NONSTATIC_INLINE unicode_val_T
656 utf8_to_unicode(unsigned char **string, const unsigned char *end)
658 unsigned char *str = *string;
659 unicode_val_T u;
660 int length;
662 length = utf8char_len_tab[str[0]];
664 if (str + length > end) {
665 return UCS_NO_CHAR;
668 switch (length) {
669 case 1: /* U+0000 to U+007F */
670 if (str[0] >= 0x80) {
671 invalid_utf8:
672 ++*string;
673 return UCS_REPLACEMENT_CHARACTER;
675 u = str[0];
676 break;
677 case 2: /* U+0080 to U+07FF */
678 if ((str[1] & 0xc0) != 0x80)
679 goto invalid_utf8;
680 u = (str[0] & 0x1f) << 6;
681 u += (str[1] & 0x3f);
682 if (u < 0x80)
683 goto invalid_utf8;
684 break;
685 case 3: /* U+0800 to U+FFFF, except surrogates */
686 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
687 goto invalid_utf8;
688 u = (str[0] & 0x0f) << 12;
689 u += ((str[1] & 0x3f) << 6);
690 u += (str[2] & 0x3f);
691 if (u < 0x800 || is_utf16_surrogate(u))
692 goto invalid_utf8;
693 break;
694 case 4: /* U+10000 to U+1FFFFF */
695 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
696 || (str[3] & 0xc0) != 0x80)
697 goto invalid_utf8;
698 u = (str[0] & 0x0f) << 18;
699 u += ((str[1] & 0x3f) << 12);
700 u += ((str[2] & 0x3f) << 6);
701 u += (str[3] & 0x3f);
702 if (u < 0x10000)
703 goto invalid_utf8;
704 break;
705 case 5: /* U+200000 to U+3FFFFFF */
706 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
707 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
708 goto invalid_utf8;
709 u = (str[0] & 0x0f) << 24;
710 u += ((str[1] & 0x3f) << 18);
711 u += ((str[2] & 0x3f) << 12);
712 u += ((str[3] & 0x3f) << 6);
713 u += (str[4] & 0x3f);
714 if (u < 0x200000)
715 goto invalid_utf8;
716 break;
717 case 6: /* U+4000000 to U+7FFFFFFF */
718 if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
719 || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
720 || (str[5] & 0xc0) != 0x80)
721 goto invalid_utf8;
722 u = (str[0] & 0x01) << 30;
723 u += ((str[1] & 0x3f) << 24);
724 u += ((str[2] & 0x3f) << 18);
725 u += ((str[3] & 0x3f) << 12);
726 u += ((str[4] & 0x3f) << 6);
727 u += (str[5] & 0x3f);
728 if (u < 0x4000000)
729 goto invalid_utf8;
730 break;
731 default:
732 INTERNAL("utf8char_len_tab out of range");
733 goto invalid_utf8;
735 *string = str + length;
736 return u;
739 /* The common part of cp2u and cp2utf_8. */
740 static unicode_val_T
741 cp2u_shared(const struct codepage_desc *from, unsigned char c)
743 unicode_val_T u = from->highhalf[c - 0x80];
745 if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
746 return u;
749 /* Used for converting input from the terminal. */
750 unicode_val_T
751 cp2u(int from, unsigned char c)
753 from &= ~SYSTEM_CHARSET_FLAG;
755 /* UTF-8 is a multibyte codepage and cannot be handled with
756 * this function. */
757 assert(!is_cp_ptr_utf8(&codepages[from]));
758 if_assert_failed return UCS_REPLACEMENT_CHARACTER;
760 if (c < 0x80) return c;
761 else return cp2u_shared(&codepages[from], c);
764 /* This slow and ugly code is used by the terminal utf_8_io */
765 const unsigned char *
766 cp2utf8(int from, int c)
768 from &= ~SYSTEM_CHARSET_FLAG;
770 if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
771 return strings[c];
773 return encode_utf8(cp2u_shared(&codepages[from], c));
776 unicode_val_T
777 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
779 unicode_val_T ret;
781 if (is_cp_utf8(codepage))
782 return utf8_to_unicode(string, end);
784 if (*string >= end)
785 return UCS_NO_CHAR;
787 ret = cp2u(codepage, **string);
788 ++*string;
789 return ret;
793 #ifdef CONFIG_COMBINE
794 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
795 unicode_val_T **combined;
796 struct hash *combined_hash;
798 unicode_val_T
799 get_combined(unicode_val_T *data, int length)
801 struct hash_item *item;
802 unicode_val_T *key;
803 int i, indeks;
805 assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
806 if_assert_failed return UCS_NO_CHAR;
808 if (!combined_hash) combined_hash = init_hash8();
809 if (!combined_hash) return UCS_NO_CHAR;
810 item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
812 if (item) return (unicode_val_T)(long)item->value;
813 if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
815 key = mem_alloc((length + 1) * sizeof(*key));
816 if (!key) return UCS_NO_CHAR;
817 for (i = 0; i < length; i++)
818 key[i] = data[i];
819 key[i] = UCS_END_COMBINED;
821 last_combined++;
822 indeks = last_combined - UCS_BEGIN_COMBINED;
824 combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
825 if (!combined) {
826 mem_free(key);
827 last_combined--;
828 return UCS_NO_CHAR;
830 combined[indeks] = key;
831 item = add_hash_item(combined_hash, (unsigned char *)key,
832 length * sizeof(*data), (void *)(long)(last_combined));
833 if (!item) {
834 last_combined--;
835 mem_free(key);
836 return UCS_NO_CHAR;
838 return last_combined;
841 void
842 free_combined()
844 int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
846 if (combined_hash)
847 free_hash(&combined_hash);
848 for (i = 0; i < end; i++)
849 mem_free(combined[i]);
850 mem_free_if(combined);
852 #endif /* CONFIG_COMBINE */
855 static void
856 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
858 unsigned char *p = encode_utf8(u);
860 while (p[1]) {
861 if (ct[*p].t) ct = ct[*p].u.tbl;
862 else {
863 struct conv_table *nct;
865 assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
866 if_assert_failed return;
868 nct = mem_calloc(256, sizeof(*nct));
869 if (!nct) return;
870 new_translation_table(nct);
871 ct[*p].t = 1;
872 ct[*p].u.tbl = nct;
873 ct = nct;
875 p++;
878 assertm(!ct[*p].t, "bad utf encoding #2");
879 if_assert_failed return;
881 if (ct[*p].u.str == no_str)
882 ct[*p].u.str = str;
885 /* A conversion table from some charset to UTF-8.
886 * If it is from UTF-8 to UTF-8, it converts each byte separately.
887 * Unlike in other translation tables, the strings in elements 0x80 to
888 * 0xFF are allocated dynamically. */
889 struct conv_table utf_table[256];
890 int utf_table_init = 1;
892 static void
893 free_utf_table(void)
895 int i;
897 /* Cast away const. */
898 for (i = 128; i < 256; i++)
899 mem_free((unsigned char *) utf_table[i].u.str);
902 static struct conv_table *
903 get_translation_table_to_utf8(int from)
905 int i;
906 static int lfr = -1;
908 if (from == -1) return NULL;
909 from &= ~SYSTEM_CHARSET_FLAG;
910 if (from == lfr) return utf_table;
911 lfr = from;
912 if (utf_table_init) {
913 memset(utf_table, 0, sizeof(utf_table));
914 utf_table_init = 0;
915 } else
916 free_utf_table();
918 for (i = 0; i < 128; i++)
919 utf_table[i].u.str = strings[i];
921 if (is_cp_ptr_utf8(&codepages[from])) {
922 for (i = 128; i < 256; i++)
923 utf_table[i].u.str = stracpy(strings[i]);
924 return utf_table;
927 for (i = 128; i < 256; i++) {
928 unicode_val_T u = codepages[from].highhalf[i - 0x80];
930 if (u == 0xFFFF)
931 utf_table[i].u.str = NULL;
932 else
933 utf_table[i].u.str = stracpy(encode_utf8(u));
936 for (i = 0; codepages[from].table[i].c; i++) {
937 unicode_val_T u = codepages[from].table[i].u;
939 if (!utf_table[codepages[from].table[i].c].u.str)
940 utf_table[codepages[from].table[i].c].u.str =
941 stracpy(encode_utf8(u));
944 for (i = 128; i < 256; i++)
945 if (!utf_table[i].u.str)
946 utf_table[i].u.str = stracpy(no_str);
948 return utf_table;
951 /* A conversion table between two charsets, where the target is not UTF-8. */
952 static struct conv_table table[256];
953 static int first = 1;
955 void
956 free_conv_table(void)
958 if (!utf_table_init) free_utf_table();
959 if (first) {
960 memset(table, 0, sizeof(table));
961 first = 0;
963 new_translation_table(table);
964 #ifdef HAVE_ICONV
965 if (iconv_cd != (iconv_t)-1) {
966 iconv_close(iconv_cd);
967 iconv_cd = (iconv_t)-1;
969 #endif
973 struct conv_table *
974 get_translation_table(int from, int to)
976 static int lfr = -1;
977 static int lto = -1;
979 from &= ~SYSTEM_CHARSET_FLAG;
980 to &= ~SYSTEM_CHARSET_FLAG;
981 if (first) {
982 memset(table, 0, sizeof(table));
983 first = 0;
986 if (codepages[from].iconv) {
987 struct conv_table *table2 = get_translation_table_to_utf8(34);
989 if (table2) table2->iconv_cp = from;
990 return table2;
993 if (/*from == to ||*/ from == -1 || to == -1)
994 return NULL;
995 if (is_cp_ptr_utf8(&codepages[to])) {
996 struct conv_table *table2 = get_translation_table_to_utf8(from);
998 if (table2) table2->iconv_cp = -1;
999 return table2;
1001 if (from == lfr && to == lto)
1002 return table;
1003 lfr = from;
1004 lto = to;
1005 new_translation_table(table);
1007 if (is_cp_ptr_utf8(&codepages[from])) {
1008 int i;
1010 /* Map U+00A0 and U+00AD the same way as u2cp() would. */
1011 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1012 add_utf8(table, UCS_SOFT_HYPHEN, "");
1014 for (i = 0x80; i <= 0xFF; i++)
1015 if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1016 add_utf8(table,
1017 codepages[to].highhalf[i - 0x80],
1018 strings[i]);
1020 for (i = 0; codepages[to].table[i].c; i++)
1021 add_utf8(table, codepages[to].table[i].u,
1022 strings[codepages[to].table[i].c]);
1024 for (i = 0; unicode_7b[i].x != -1; i++)
1025 if (unicode_7b[i].x >= 0x80)
1026 add_utf8(table, unicode_7b[i].x,
1027 unicode_7b[i].s);
1029 } else {
1030 int i;
1032 for (i = 128; i < 256; i++) {
1033 if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1034 const unsigned char *u;
1036 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1037 if (u) table[i].u.str = u;
1042 return table;
/* Compare the first @l2 bytes of @s1 with the NUL-terminated @s2.
 * Returns 1 or -1 at the first differing byte, depending on which one
 * is larger.  When the @l2 bytes are all equal, returns 0 if @s2 ends
 * right there and -1 if @s2 goes on. */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 == *s2)
			continue;

		return (*s1 > *s2) ? 1 : -1;
	}

	return (*s2 == '\0') ? 0 : -1;
}
1059 /* Entity cache debugging.  Change the "#if 0" below to "#if 1" to define
 * DEBUG_ENTITY_CACHE, which makes get_entity_string() report cache hits
 * and misses on stderr. */
1060 #if 0
1061 #define DEBUG_ENTITY_CACHE
1062 #else
1063 #undef DEBUG_ENTITY_CACHE
1064 #endif
/* One remembered result of get_entity_string(). */
struct entity_cache {
	/* How often this entry has been found in the cache. */
	unsigned int hits;

	/* Length of the entity name kept in @str. */
	int strlen;

	/* The encoding the result was computed for. */
	int encoding;

	/* The string the entity converts to. */
	const unsigned char *result;

	/* The entity name. */
	unsigned char str[20]; /* Suffice in any case. */
};
1074 /* comparison function for qsort() */
1075 static int
1076 hits_cmp(const void *v1, const void *v2)
1078 const struct entity_cache *a = v1, *b = v2;
1080 if (a->hits == b->hits) return 0;
1081 if (a->hits > b->hits) return -1;
1082 else return 1;
1085 static int
1086 compare_entities(const void *key_, const void *element_)
1088 struct string *key = (struct string *) key_;
1089 struct entity *element = (struct entity *) element_;
1090 int length = key->length;
1091 unsigned char *first = key->source;
1092 unsigned char *second = element->s;
1094 return xxstrcmp(first, second, length);
1097 const unsigned char *
1098 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1100 #define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
1101 #define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1102 will go in [0] table */
1103 static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1104 static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1105 unsigned int slen = 0;
1106 const unsigned char *result = NULL;
1108 /* Note that an object of static storage duration is automatically
1109 * initialised to zero in C. */
1111 if (strlen <= 0) return NULL;
1113 #ifdef CONFIG_UTF8
1114 /* TODO: caching UTF-8 */
1115 encoding &= ~SYSTEM_CHARSET_FLAG;
1116 if (is_cp_ptr_utf8(&codepages[encoding]))
1117 goto skip;
1118 #endif /* CONFIG_UTF8 */
1120 /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1121 * + google + slashdot + websites that result from a search for test on google,
1122 * + various ones) show quite impressive improvment:
1123 * Top ten is:
1124 * 0: hits=2459 l=4 st='nbsp'
1125 * 1: hits=2152 l=6 st='eacute'
1126 * 2: hits=235 l=6 st='egrave'
1127 * 3: hits=136 l=6 st='agrave'
1128 * 4: hits=100 l=3 st='amp'
1129 * 5: hits=40 l=5 st='laquo'
1130 * 6: hits=8 l=4 st='copy'
1131 * 7: hits=5 l=2 st='gt'
1132 * 8: hits=2 l=2 st='lt'
1133 * 9: hits=1 l=6 st='middot'
1135 * Most of the time cache hit ratio is near 95%.
1137 * A long test shows: 15186 hits vs. 24 misses and mean iteration
1138 * count is kept < 2 (worst case 1.58). Not so bad ;)
1140 * --Zas */
1142 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1143 slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1145 if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1146 int i;
1148 for (i = 0; i < nb_entity_cache[slen]; i++) {
1149 if (entity_cache[slen][i].encoding == encoding
1150 && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1151 #ifdef DEBUG_ENTITY_CACHE
1152 static double total_iter = 0;
1153 static unsigned long hit_count = 0;
1155 total_iter += i + 1;
1156 hit_count++;
1157 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1158 #endif
1159 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1160 entity_cache[slen][i].hits++;
1161 return entity_cache[slen][i].result;
1164 #ifdef DEBUG_ENTITY_CACHE
1165 fprintf(stderr, "miss\n");
1166 #endif
1168 #ifdef CONFIG_UTF8
1169 skip:
1170 #endif /* CONFIG_UTF8 */
1171 if (*str == '#') { /* Numeric entity. */
1172 int l = (int) strlen;
1173 unsigned char *st = (unsigned char *) str;
1174 unicode_val_T n = 0;
1176 if (l == 1) goto end; /* &#; ? */
1177 st++, l--;
1178 if ((*st | 32) == 'x') { /* Hexadecimal */
1180 if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1181 st++, l--;
1182 do {
1183 unsigned char c = (*(st++) | 32);
1185 if (isdigit(c))
1186 n = (n << 4) | (c - '0');
1187 else if (isxdigit(c))
1188 n = (n << 4) | (c - 'a' + 10);
1189 else
1190 goto end; /* Bad char. */
1191 } while (--l);
1192 } else { /* Decimal */
1193 if (l > 10) goto end; /* 4294967295 max. */
1194 do {
1195 unsigned char c = *(st++);
1197 if (isdigit(c))
1198 n = n * 10 + c - '0';
1199 else
1200 goto end; /* Bad char. */
1201 /* Limit to 0xFFFFFFFF. */
1202 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1203 goto end;
1204 } while (--l);
1207 result = u2cp(n, encoding);
1209 #ifdef DEBUG_ENTITY_CACHE
1210 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1211 #endif
1212 } else { /* Text entity. */
1213 struct string key = INIT_STRING((unsigned char *) str, strlen);
1214 struct entity *element = bsearch((void *) &key, entities,
1215 N_ENTITIES,
1216 sizeof(*element),
1217 compare_entities);
1219 if (element) result = u2cp(element->c, encoding);
1222 #ifdef CONFIG_UTF8
1223 if (is_cp_ptr_utf8(&codepages[encoding])) {
1224 return result;
1226 #endif /* CONFIG_UTF8 */
1227 end:
1228 /* Take care of potential buffer overflow. */
1229 if (strlen < sizeof(entity_cache[slen][0].str)) {
1230 struct entity_cache *ece;
1232 /* Sort entries by hit order. */
1233 if (nb_entity_cache[slen] > 1)
1234 qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1235 sizeof(entity_cache[slen][0]), hits_cmp);
1237 /* Increment number of cache entries if possible.
1238 * Else, just replace the least used entry. */
1239 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1240 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1242 /* Copy new entry to cache. */
1243 ece->hits = 1;
1244 ece->strlen = strlen;
1245 ece->encoding = encoding;
1246 ece->result = result;
1247 memcpy(ece->str, str, strlen);
1248 ece->str[strlen] = '\0';
1251 #ifdef DEBUG_ENTITY_CACHE
1252 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1253 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1256 unsigned int i;
1258 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1259 for (i = 0; i < nb_entity_cache[slen] ; i++)
1260 fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1261 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1262 entity_cache[slen][i].str);
1263 fprintf(stderr, "-----------------\n");
1265 #endif /* DEBUG_ENTITY_CACHE */
1267 return result;
1270 unsigned char *
1271 convert_string(struct conv_table *convert_table,
1272 unsigned char *chars2, int charslen2, int cp,
1273 enum convert_string_mode mode, int *length,
1274 void (*callback)(void *data, unsigned char *buf, int buflen),
1275 void *callback_data)
1277 unsigned char *buffer;
1278 int bufferpos = 0;
1279 int charspos = 0;
1280 unsigned char *chars = chars2;
1281 int charslen = charslen2;
1283 #ifdef HAVE_ICONV
1284 static char iconv_input[256];
1285 static char iconv_output[256 * 8];
1286 static size_t iconv_offset;
1287 static int iconv_cp;
1288 static size_t iconv_inleft;
1289 size_t iconv_outleft = 256 * 8;
1290 int loop = 0;
1291 int is_iconv = 0;
1292 int chars_offset = 0;
1294 if (!convert_table && !memchr(chars, '&', charslen)) {
1295 if (callback) {
1296 if (charslen) callback(callback_data, chars, charslen);
1297 return NULL;
1298 } else {
1299 return memacpy(chars, charslen);
1303 if (cp >= 0) {
1304 if (convert_table && convert_table->iconv_cp > 0) {
1305 is_iconv = 1;
1306 cp = convert_table->iconv_cp;
1307 } else {
1308 is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1311 #endif
1313 /* Buffer allocation */
1315 buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1316 if (!buffer) return NULL;
1318 #ifdef HAVE_ICONV
1319 if (is_iconv) {
1320 int v;
1321 size_t before, to_copy;
1322 char *outp, *inp;
1324 if (iconv_cd >= 0) {
1325 if (cp != iconv_cp) {
1326 iconv_close(iconv_cd);
1327 iconv_cd = (iconv_t)-1;
1330 if (iconv_cd == (iconv_t)-1) {
1331 iconv_offset = 0;
1332 iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1333 if (iconv_cd == (iconv_t)-1) {
1334 mem_free(buffer);
1335 return NULL;
1337 iconv_cp = cp;
1339 repeat:
1340 to_copy = charslen2 - chars_offset;
1341 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1342 memcpy(iconv_input + iconv_offset, chars2 + chars_offset, to_copy);
1343 iconv_outleft = 256 * 8;
1344 iconv_inleft = iconv_offset + to_copy;
1345 inp = iconv_input;
1346 outp = iconv_output;
1347 before = iconv_inleft;
1349 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1350 chars_offset += before - iconv_inleft;
1351 charslen = 256 * 8 - iconv_outleft;
1353 chars = (unsigned char *)iconv_output;
1354 charspos = 0;
1356 if (v == -1) {
1357 switch (errno) {
1358 case EINVAL:
1359 memcpy(iconv_input, inp, iconv_inleft);
1360 iconv_offset = iconv_inleft;
1361 break;
1362 case EILSEQ:
1363 loop = 0;
1364 goto out;
1365 break;
1366 default:
1367 iconv_offset = 0;
1369 } else {
1370 iconv_offset = 0;
1373 loop = chars_offset < charslen2;
1375 #endif
1376 /* Iterate ;-) */
1378 out:
1379 while (charspos < charslen) {
1380 const unsigned char *translit;
1382 #define PUTC do { \
1383 buffer[bufferpos++] = chars[charspos++]; \
1384 translit = ""; \
1385 goto flush; \
1386 } while (0)
1388 if (chars[charspos] != '&') {
1389 struct conv_table *t;
1390 int i;
1392 if (chars[charspos] < 128 || !convert_table) PUTC;
1394 t = convert_table;
1395 i = charspos;
1397 while (t[chars[i]].t) {
1398 t = t[chars[i++]].u.tbl;
1399 if (i >= charslen) PUTC;
1402 translit = t[chars[i]].u.str;
1403 charspos = i + 1;
1405 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1406 PUTC;
1408 } else {
1409 int start = charspos + 1;
1410 int i = start;
1412 while (i < charslen
1413 && (isasciialpha(chars[i])
1414 || isdigit(chars[i])
1415 || (chars[i] == '#')))
1416 i++;
1418 /* This prevents bug 213: we were expanding "entities"
1419 * in URL query strings. */
1420 /* XXX: But this disables &nbsp&nbsp usage, which
1421 * appears to be relatively common! --pasky */
1422 if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1423 && i > start
1424 && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1425 translit = get_entity_string(&chars[start], i - start,
1426 cp);
1427 if (chars[i] != ';') {
1428 /* Eat &nbsp &nbsp<foo> happily, but
1429 * pull back from the character after
1430 * entity string if it is not the valid
1431 * terminator. */
1432 i--;
1435 if (!translit) PUTC;
1436 charspos = i + (i < charslen);
1437 } else PUTC;
1440 if (!translit[0]) continue;
1442 if (!translit[1]) {
1443 buffer[bufferpos++] = translit[0];
1444 translit = "";
1445 goto flush;
1448 while (*translit) {
1449 unsigned char *new_;
1451 buffer[bufferpos++] = *(translit++);
1452 flush:
1453 if (bufferpos & (ALLOC_GR - 1)) continue;
1455 if (callback) {
1456 buffer[bufferpos] = 0;
1457 callback(callback_data, buffer, bufferpos);
1458 bufferpos = 0;
1459 } else {
1460 new_ = mem_realloc(buffer, bufferpos + ALLOC_GR);
1461 if (!new_) {
1462 mem_free(buffer);
1463 return NULL;
1465 buffer = new_;
1468 #undef PUTC
1471 #ifdef HAVE_ICONV
1472 if (loop) goto repeat;
1473 #endif
1474 /* Say bye */
1476 buffer[bufferpos] = 0;
1477 if (length) *length = bufferpos;
1479 if (callback) {
1480 if (bufferpos) callback(callback_data, buffer, bufferpos);
1481 mem_free(buffer);
1482 return NULL;
1483 } else {
1484 return buffer;
1489 #ifndef USE_FASTFIND
1491 get_cp_index(const unsigned char *name)
1493 int i, a;
1494 int syscp = 0;
1496 if (!c_strcasecmp(name, "System")) {
1497 #if HAVE_LANGINFO_CODESET
1498 name = nl_langinfo(CODESET);
1499 syscp = SYSTEM_CHARSET_FLAG;
1500 #else
1501 name = "us-ascii";
1502 #endif
1505 for (i = 0; codepages[i].name; i++) {
1506 for (a = 0; codepages[i].aliases[a]; a++) {
1507 /* In the past, we looked for the longest substring
1508 * in all the names; it is way too expensive, though:
1510 * % cumulative self self total
1511 * time seconds seconds calls us/call us/call name
1512 * 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
1514 * Anything called from redraw_screen() is in fact
1515 * relatively expensive, even if it's called just
1516 * once. So we will do a simple strcasecmp() here.
1519 if (!c_strcasecmp(name, codepages[i].aliases[a]))
1520 return i | syscp;
1524 if (syscp) {
1525 return get_cp_index("us-ascii") | syscp;
1526 } else {
1527 return -1;
1531 #else
/* Enumeration cursor shared by charsets_list_reset() and
 * charsets_list_next(): i_name indexes codepages[], i_alias indexes the
 * aliases[] of that codepage. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration cursor to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
1544 /* Returns a pointer to a struct that contains current key and data pointers
1545 * and increment internal pointer. It returns NULL when key is NULL. */
1546 struct fastfind_key_value *
1547 charsets_list_next(void)
1549 static struct fastfind_key_value kv;
1551 if (!codepages[i_name].name) return NULL;
1553 kv.key = codepages[i_name].aliases[i_alias];
1554 kv.data = (void *) &codepages[i_name]; /* cast away const */
1556 if (codepages[i_name].aliases[i_alias + 1])
1557 i_alias++;
1558 else {
1559 i_name++;
1560 i_alias = 0;
1563 return &kv;
1566 static struct fastfind_index ff_charsets_index
1567 = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1569 /* It searchs for a charset named @name or one of its aliases and
1570 * returns index for it or -1 if not found. */
1572 get_cp_index(const unsigned char *name)
1574 const struct codepage_desc *codepage;
1575 int syscp = 0;
1577 if (!c_strcasecmp(name, "System")) {
1578 #if HAVE_LANGINFO_CODESET
1579 name = nl_langinfo(CODESET);
1580 syscp = SYSTEM_CHARSET_FLAG;
1581 #else
1582 name = "us-ascii";
1583 #endif
1586 codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1587 if (codepage) {
1588 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1589 return (codepage - codepages) | syscp;
1591 } else if (syscp) {
1592 return get_cp_index("us-ascii") | syscp;
1594 } else {
1595 return -1;
1599 #endif /* USE_FASTFIND */
/* Build the charset-name lookup index. Does nothing unless ELinks is
 * compiled with USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
/* Release the charset-name lookup index. Does nothing unless ELinks is
 * compiled with USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1617 /* Get the codepage's name for displaying to the user, or NULL if
1618 * @cp_index is one past the end. In the future, we might want to
1619 * localize these with gettext. So it may be best not to use this
1620 * function if the name will have to be converted back to an
1621 * index. */
1622 unsigned char *
1623 get_cp_name(int cp_index)
1625 if (cp_index < 0) return "none";
1626 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1628 return codepages[cp_index].name;
1631 /* Get the codepage's name for saving to a configuration file. These
1632 * names can be converted back to indexes, even in future versions of
1633 * ELinks. */
1634 unsigned char *
1635 get_cp_config_name(int cp_index)
1637 if (cp_index < 0) return "none";
1638 if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1639 if (!codepages[cp_index].aliases) return NULL;
1641 return codepages[cp_index].aliases[0];
1644 /* Get the codepage's name for sending to a library or server that
1645 * understands MIME charset names. This function irreversibly maps
1646 * the "System" codepage to the underlying charset. */
1647 unsigned char *
1648 get_cp_mime_name(int cp_index)
1650 if (cp_index < 0) return "none";
1651 cp_index &= ~SYSTEM_CHARSET_FLAG;
1652 if (!codepages[cp_index].aliases) return NULL;
1654 return codepages[cp_index].aliases[0];
1658 is_cp_utf8(int cp_index)
1660 cp_index &= ~SYSTEM_CHARSET_FLAG;
1661 return is_cp_ptr_utf8(&codepages[cp_index]);
1664 /* This function will be used by the xhtml parser. */
1665 const uint16_t *
1666 get_cp_highhalf(const unsigned char *name)
1668 int cp = get_cp_index(name);
1670 if (cp < 0) return NULL;
1671 cp &= ~SYSTEM_CHARSET_FLAG;
1672 return codepages[cp].highhalf;