src/intl/charsets.c

/* Charsets converter */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #include "elinks.h"
  22
  23 #include "document/options.h"
  24 #include "intl/charsets.h"
  25 #include "util/conv.h"
  26 #include "util/error.h"
  27 #include "util/fastfind.h"
  28 #include "util/memory.h"
  29 #include "util/string.h"
  30
  31
  32 /* Fix namespace clash on MacOS. */
  33 #define table table_elinks
  34
/* One additional (codepage byte, Unicode value) pairing.  Arrays of
 * these are referenced from struct codepage_desc.  u2cp_() scans such
 * an array until it meets an entry whose @c is 0, so 0 acts as the
 * terminator.  */
struct table_entry {
        /* The byte in the codepage.  */
        unsigned char c;
        /* This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  46
/* Description of one codepage: how it is called and how its bytes map
 * to Unicode.  The functions in this file reach these descriptors
 * through the codepages[] array.  */
struct codepage_desc {
        /* Name of the codepage.  */
        unsigned char *name;
        /* Alias list of the codepage.  is_cp_ptr_utf8() recognizes the
         * UTF-8 codepage by this pointer being equal to aliases_utf8.  */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table.  This table
         * is not used for translating from the codepage to Unicode.  */
        const struct table_entry *table;
};
  67
  68 #include "intl/codepage.inc"
  69 #include "intl/uni_7b.inc"
  70 #include "intl/entity.inc"
  71
  72
/* Ready-made one-byte, NUL-terminated strings indexed by byte value,
 * so that conversion functions can return a string for a single byte
 * without allocating anything.
 *
 * NOTE(review): entries 0x17 and 0x1F contain "\033" instead of
 * "\027" and "\037".  It is not visible from this file whether that
 * is deliberate; confirm before "fixing" it.  */
static const char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 107
 108 static void
 109 free_translation_table(struct conv_table *p)
 110 {
 111         int i;
 112
 113         for (i = 0; i < 256; i++)
 114                 if (p[i].t)
 115                         free_translation_table(p[i].u.tbl);
 116
 117         mem_free(p);
 118 }
 119
/* A string used in conversion tables when there is no correct
 * conversion.  This is compared by address and therefore should be a
 * named array rather than a pointer so that it won't share storage
 * with any other string literal that happens to have the same
 * characters.  new_translation_table() stores it in entries
 * 0x80...0xFF, and u2cp_() returns it when nothing matches.  */
static const unsigned char no_str[] = "*";
 126
 127 static void
 128 new_translation_table(struct conv_table *p)
 129 {
 130         int i;
 131
 132         for (i = 0; i < 256; i++)
 133                 if (p[i].t)
 134                         free_translation_table(p[i].u.tbl);
 135         for (i = 0; i < 128; i++) {
 136                 p[i].t = 0;
 137                 p[i].u.str = strings[i];
 138         }
 139         for (; i < 256; i++) {
 140                 p[i].t = 0;
 141                 p[i].u.str = no_str;
 142         }
 143 }
 144
 145 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 146 {                                                                                       \
 147         long _s = 0, _e = (entries) - 1;                                                \
 148                                                                                         \
 149         while (_s <= _e || !((result) = -1)) {                                          \
 150                 long _m = (_s + _e) / 2;                                                \
 151                                                                                         \
 152                 if ((table)[_m].entry == (key)) {                                       \
 153                         (result) = _m;                                                  \
 154                         break;                                                          \
 155                 }                                                                       \
 156                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 157                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 158         }                                                                               \
 159 }                                                                                       \
 160
/* Substitutes for the code points U+0080...U+009F, indexed by
 * (u - 0x80).  u2cp_() converts the substitute instead of the original
 * character; an entry of 0 means there is no substitute, in which case
 * u2cp_() returns NULL.
 *
 * NOTE(review): several entries (e.g. 0x20ac, 0x2026, 0x2122) look like
 * the Windows-1252 meanings of bytes 0x80...0x9F and others like ASCII
 * approximations of them; presumably this is for pages that use such
 * bytes as characters -- confirm.  */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 167
/* Bit that may be set in a codepage index.  The conversion functions
 * in this file clear it before using the index with codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if the codepage descriptor pointed to by @cp_ptr is UTF-8,
 * which is recognized by its alias list being aliases_utf8.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 170
 171 const unsigned char *
 172 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 173 {
 174         int j;
 175         int s;
 176
 177         if (u < 128) return strings[u];
 178
 179         to &= ~SYSTEM_CHARSET_FLAG;
 180
 181 #ifdef CONFIG_UTF8
 182         if (is_cp_ptr_utf8(&codepages[to]))
 183                 return encode_utf8(u);
 184 #endif /* CONFIG_UTF8 */
 185
 186         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 187          * special char NBSP_CHAR. */
 188         if (u == 0xa0) {
 189                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 190                 else /* NBSP_MODE_ASCII */ return " ";
 191         }
 192         if (u == 0xad) return "";
 193
 194         if (u < 0xa0) {
 195                 unicode_val_T strange = strange_chars[u - 0x80];
 196
 197                 if (!strange) return NULL;
 198                 return u2cp_(strange, to, nbsp_mode);
 199         }
 200
 201         if (u < 0xFFFF)
 202                 for (j = 0; j < 0x80; j++)
 203                         if (codepages[to].highhalf[j] == u)
 204                                 return strings[0x80 + j];
 205         for (j = 0; codepages[to].table[j].c; j++)
 206                 if (codepages[to].table[j].u == u)
 207                         return strings[codepages[to].table[j].c];
 208
 209         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 210         if (s != -1) return unicode_7b[s].s;
 211
 212         return no_str;
 213 }
 214
 215 static unsigned char utf_buffer[7];
 216
 217 #ifdef CONFIG_UTF8
 218 inline unsigned char *
 219 encode_utf8(unicode_val_T u)
 220 #else
 221 static unsigned char *
 222 encode_utf8(unicode_val_T u)
 223 #endif /* CONFIG_UTF8 */
 224 {
 225         memset(utf_buffer, 0, 7);
 226
 227         if (u < 0x80)
 228                 utf_buffer[0] = u;
 229         else if (u < 0x800)
 230                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 231                 utf_buffer[1] = 0x80 | (u & 0x3f);
 232         else if (u < 0x10000)
 233                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 234                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 235                 utf_buffer[2] = 0x80 | (u & 0x3f);
 236         else if (u < 0x200000)
 237                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 238                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 239                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 240                 utf_buffer[3] = 0x80 | (u & 0x3f);
 241         else if (u < 0x4000000)
 242                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 243                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 244                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 245                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 246                 utf_buffer[4] = 0x80 | (u & 0x3f);
 247         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 248                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 249                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 250                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 251                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 252                 utf_buffer[5] = 0x80 | (u & 0x3f);
 253
 254         return utf_buffer;
 255 }
 256
 257 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) are
 * given length 1 here; utf8_to_unicode() detects and rejects them
 * separately.  */
static const char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 270
 271 inline int utf8charlen(const unsigned char *p)
 272 {
 273         return p ? utf8char_len_tab[*p] : 0;
 274 }
 275
 276 inline int
 277 strlen_utf8(unsigned char **str)
 278 {
 279         unsigned char *s = *str;
 280         unsigned char *end = strchr(s, '\0');
 281         int x;
 282         int len;
 283
 284         for (x = 0;; x++, s += len) {
 285                 len = utf8charlen(s);
 286                 if (s + len > end) break;
 287         }
 288         *str = s;
 289         return x;
 290 }
 291
/* True if byte @p has its top bit clear, i.e. is below 0x80.  */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can start a character: it is either below 0x80 or
 * has its two top bits set (0xC0...0xFF).  False for 0x80...0xBF.
 * Note that @p may be evaluated twice.  */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 294
 295 /* Start from @current and move back to @pos char. This pointer return. The
 296  * most left pointer is @start. */
 297 inline unsigned char *
 298 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 299 {
 300         if (current == NULL || start == NULL || pos < 0)
 301                 return NULL;
 302         while (pos > 0 && current != start) {
 303                 current--;
 304                 if (utf8_islead(*current))
 305                         pos--;
 306         }
 307         return current;
 308 }
 309
 310 /* Count number of standard terminal cells needed for displaying UTF-8
 311  * character. */
 312 int
 313 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 314 {
 315         unicode_val_T u;
 316
 317         if (end == NULL)
 318                 end = strchr(utf8_char, '\0');
 319
 320         if(!utf8_char || !end)
 321                 return -1;
 322
 323         u = utf8_to_unicode(&utf8_char, end);
 324
 325         return unicode_to_cell(u);
 326 }
 327
 328 /* Count number of standard terminal cells needed for displaying string
 329  * with UTF-8 characters. */
 330 int
 331 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 332 {
 333         int charlen, cell, cells = 0;
 334
 335         if (end == NULL)
 336                 end = strchr(string, '\0');
 337
 338         if(!string || !end)
 339                 return -1;
 340
 341         do {
 342                 charlen = utf8charlen(string);
 343                 if (string + charlen > end)
 344                         break;
 345
 346                 cell = utf8_char2cells(string, end);
 347                 if  (cell < 0)
 348                         return -1;
 349
 350                 cells += cell;
 351                 string += charlen;
 352         } while (1);
 353
 354         return cells;
 355 }
 356
 357 /* Count number of characters in string. */
 358 int
 359 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 360 {
 361         int charlen, chars = 0;
 362
 363         if (end == NULL)
 364                 end = strchr(string, '\0');
 365
 366         if(!string || !end)
 367                 return -1;
 368
 369         do {
 370                 charlen = utf8charlen(string);
 371                 if (string + charlen > end)
 372                         break;
 373
 374                 chars++;
 375                 string += charlen;
 376         } while (1);
 377
 378         return chars;
 379 }
 380
 381 /*
 382  * Count number of bytes from begining of the string needed for displaying
 383  * specified number of cells.
 384  */
 385 int
 386 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 387 {
 388         unsigned int bytes = 0, cells = 0;
 389
 390         assert(max_cells>=0);
 391
 392         if (end == NULL)
 393                 end = strchr(string, '\0');
 394
 395         if(!string || !end)
 396                 return -1;
 397
 398         do {
 399                 int cell = utf8_char2cells(&string[bytes], end);
 400                 if (cell < 0)
 401                         return -1;
 402
 403                 cells += cell;
 404                 if (cells > max_cells)
 405                         break;
 406
 407                 bytes += utf8charlen(&string[bytes]);
 408
 409                 if (string + bytes > end) {
 410                         bytes = end - string;
 411                         break;
 412                 }
 413         } while(1);
 414
 415         return bytes;
 416 }
 417
 418 /* Take @max steps forward from @string in the specified @way, but
 419  * not going past @end.  Return the resulting address.  Store the
 420  * number of steps taken to *@count, unless @count is NULL.
 421  *
 422  * This assumes the text is valid UTF-8, and @string and @end point to
 423  * character boundaries.  If not, it doesn't crash but the results may
 424  * be inconsistent.
 425  *
 426  * This function can do some of the same jobs as utf8charlen(),
 427  * utf8_cells2bytes(), and strlen_utf8().  */
 428 unsigned char *
 429 utf8_step_forward(unsigned char *string, unsigned char *end,
 430                   int max, enum utf8_step way, int *count)
 431 {
 432         int steps = 0;
 433         unsigned char *current = string;
 434
 435         assert(string);
 436         assert(max >= 0);
 437         if_assert_failed goto invalid_arg;
 438         if (end == NULL)
 439                 end = strchr(string, '\0');
 440
 441         switch (way) {
 442         case UTF8_STEP_CHARACTERS:
 443                 while (steps < max && current < end) {
 444                         ++current;
 445                         if (utf8_islead(*current))
 446                                 ++steps;
 447                 }
 448                 break;
 449
 450         case UTF8_STEP_CELLS_FEWER:
 451         case UTF8_STEP_CELLS_MORE:
 452                 while (steps < max) {
 453                         unicode_val_T u;
 454                         unsigned char *prev = current;
 455                         int width;
 456
 457                         u = utf8_to_unicode(&current, end);
 458                         if (u == UCS_NO_CHAR) {
 459                                 /* Assume the incomplete sequence
 460                                  * costs one cell.  */
 461                                 current = end;
 462                                 ++steps;
 463                                 break;
 464                         }
 465
 466                         width = unicode_to_cell(u);
 467                         if (way == UTF8_STEP_CELLS_FEWER
 468                             && steps + width > max) {
 469                                 /* Back off.  */
 470                                 current = prev;
 471                                 break;
 472                         }
 473                         steps += width;
 474                 }
 475                 break;
 476
 477         default:
 478                 INTERNAL("impossible enum utf8_step");
 479         }
 480
 481 invalid_arg:
 482         if (count)
 483                 *count = steps;
 484         return current;
 485 }
 486
 487 /* Take @max steps backward from @string in the specified @way, but
 488  * not going past @start.  Return the resulting address.  Store the
 489  * number of steps taken to *@count, unless @count is NULL.
 490  *
 491  * This assumes the text is valid UTF-8, and @string and @start point
 492  * to character boundaries.  If not, it doesn't crash but the results
 493  * may be inconsistent.
 494  *
 495  * This function can do some of the same jobs as utf8_prevchar().  */
 496 unsigned char *
 497 utf8_step_backward(unsigned char *string, unsigned char *start,
 498                    int max, enum utf8_step way, int *count)
 499 {
 500         int steps = 0;
 501         unsigned char *current = string;
 502
 503         assert(string);
 504         assert(start);
 505         assert(max >= 0);
 506         if_assert_failed goto invalid_arg;
 507
 508         switch (way) {
 509         case UTF8_STEP_CHARACTERS:
 510                 while (steps < max && current > start) {
 511                         --current;
 512                         if (utf8_islead(*current))
 513                                 ++steps;
 514                 }
 515                 break;
 516
 517         case UTF8_STEP_CELLS_FEWER:
 518         case UTF8_STEP_CELLS_MORE:
 519                 while (steps < max) {
 520                         unsigned char *prev = current;
 521                         unsigned char *look;
 522                         unicode_val_T u;
 523                         int width;
 524
 525                         if (current <= start)
 526                                 break;
 527                         do {
 528                                 --current;
 529                         } while (current > start && !utf8_islead(*current));
 530
 531                         look = current;
 532                         u = utf8_to_unicode(&look, prev);
 533                         if (u == UCS_NO_CHAR) {
 534                                 /* Assume the incomplete sequence
 535                                  * costs one cell.  */
 536                                 width = 1;
 537                         } else
 538                                 width = unicode_to_cell(u);
 539
 540                         if (way == UTF8_STEP_CELLS_FEWER
 541                             && steps + width > max) {
 542                                 /* Back off.  */
 543                                 current = prev;
 544                                 break;
 545                         }
 546                         steps += width;
 547                 }
 548                 break;
 549
 550         default:
 551                 INTERNAL("impossible enum utf8_step");
 552         }
 553
 554 invalid_arg:
 555         if (count)
 556                 *count = steps;
 557         return current;
 558 }
 559
 560 /*
 * Find out the number of standard terminal columns needed for displaying
 * the symbol (glyph) which represents Unicode character c.
 563  *
 564  * TODO: Use wcwidth when it is available. This seems to require:
 565  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 566  * - Define _XOPEN_SOURCE and include <wchar.h>.
 567  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 568  *   matches ISO 10646 in all locales.)
 569  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 570  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 571  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 572  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 573  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 574  * character is apparently not supported in all locales.  Why is that?
 * - Perhaps there is standardese that requires supported characters
 *   to be convertible to multibyte form.  Then ELinks could just pick
 *   some UTF-8 locale for its wcwidth purposes.
 578  * - Perhaps wcwidth can even return different nonnegative values for
 579  *   the same ISO 10646 character in different locales.  Then ELinks
 580  *   would have to set LC_CTYPE to match at least the terminal's
 581  *   charset (which may differ from the LC_CTYPE environment variable,
 582  *   especially when the master process is serving a slave terminal).
 583  *   But there is no guarantee that the libc supports all the same
 584  *   charsets as ELinks does.
 585  * For now, it seems safest to avoid the potentially locale-dependent
 586  * libc version of wcwidth, and instead use a hardcoded mapping.
 587  *
 588  * @return      2 for double-width glyph, 1 for others.
 589  *              TODO: May be extended to return 0 for zero-width glyphs
 590  *              (like composing, maybe unprintable too).
 591  */
 592 inline int
 593 unicode_to_cell(unicode_val_T c)
 594 {
 595         if (c >= 0x1100
 596                 && (c <= 0x115f                 /* Hangul Jamo */
 597                 || c == 0x2329
 598                 || c == 0x232a
 599                 || (c >= 0x2e80 && c <= 0xa4cf
 600                         && c != 0x303f)         /* CJK ... Yi */
 601                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 602                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 603                                                                 Ideographs */
 604                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 605                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 606                 || (c >= 0xffe0 && c <= 0xffe6)
 607                 || (c >= 0x20000 && c <= 0x2fffd)
 608                 || (c >= 0x30000 && c <= 0x3fffd)))
 609                 return 2;
 610
 611         return 1;
 612 }
 613
 614 /* Fold the case of a Unicode character, so that hotkeys in labels can
 615  * be compared case-insensitively.  It is unspecified whether the
 616  * result will be in upper or lower case.  */
 617 unicode_val_T
 618 unicode_fold_label_case(unicode_val_T c)
 619 {
 620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 621         return towlower(c);
 622 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 623         /* For now, this supports only ASCII.  It would be possible to
 624          * use code generated from CaseFolding.txt of Unicode if the
 625          * acknowledgements required by http://www.unicode.org/copyright.html
 626          * were added to associated documentation of ELinks.  */
 627         if (c >= 0x41 && c <= 0x5A)
 628                 return c + 0x20;
 629         else
 630                 return c;
 631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 632 }
 633
 634 inline unicode_val_T
 635 utf8_to_unicode(unsigned char **string, unsigned char *end)
 636 {
 637         unsigned char *str = *string;
 638         unicode_val_T u;
 639         int length;
 640
 641         length = utf8char_len_tab[str[0]];
 642
 643         if (str + length > end) {
 644                 return UCS_NO_CHAR;
 645         }
 646
 647         switch (length) {
 648                 case 1:         /* U+0000 to U+007F */
 649                         if (str[0] >= 0x80) {
 650 invalid_utf8:
 651                                 ++*string;
 652                                 return UCS_REPLACEMENT_CHARACTER;
 653                         }
 654                         u = str[0];
 655                         break;
 656                 case 2:         /* U+0080 to U+07FF */
 657                         if ((str[1] & 0xc0) != 0x80)
 658                                 goto invalid_utf8;
 659                         u = (str[0] & 0x1f) << 6;
 660                         u += (str[1] & 0x3f);
 661                         if (u < 0x80)
 662                                 goto invalid_utf8;
 663                         break;
 664                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 665                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 666                                 goto invalid_utf8;
 667                         u = (str[0] & 0x0f) << 12;
 668                         u += ((str[1] & 0x3f) << 6);
 669                         u += (str[2] & 0x3f);
 670                         if (u < 0x800 || is_utf16_surrogate(u))
 671                                 goto invalid_utf8;
 672                         break;
 673                 case 4:         /* U+10000 to U+1FFFFF */
 674                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 675                             || (str[3] & 0xc0) != 0x80)
 676                                 goto invalid_utf8;
 677                         u = (str[0] & 0x0f) << 18;
 678                         u += ((str[1] & 0x3f) << 12);
 679                         u += ((str[2] & 0x3f) << 6);
 680                         u += (str[3] & 0x3f);
 681                         if (u < 0x10000)
 682                                 goto invalid_utf8;
 683                         break;
 684                 case 5:         /* U+200000 to U+3FFFFFF */
 685                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 686                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 687                                 goto invalid_utf8;
 688                         u = (str[0] & 0x0f) << 24;
 689                         u += ((str[1] & 0x3f) << 18);
 690                         u += ((str[2] & 0x3f) << 12);
 691                         u += ((str[3] & 0x3f) << 6);
 692                         u += (str[4] & 0x3f);
 693                         if (u < 0x200000)
 694                                 goto invalid_utf8;
 695                         break;
 696                 case 6:         /* U+4000000 to U+7FFFFFFF */
 697                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 698                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 699                             || (str[5] & 0xc0) != 0x80)
 700                                 goto invalid_utf8;
 701                         u = (str[0] & 0x01) << 30;
 702                         u += ((str[1] & 0x3f) << 24);
 703                         u += ((str[2] & 0x3f) << 18);
 704                         u += ((str[3] & 0x3f) << 12);
 705                         u += ((str[4] & 0x3f) << 6);
 706                         u += (str[5] & 0x3f);
 707                         if (u < 0x4000000)
 708                                 goto invalid_utf8;
 709                         break;
 710                 default:
 711                         INTERNAL("utf8char_len_tab out of range");
 712                         goto invalid_utf8;
 713         }
 714         *string = str + length;
 715         return u;
 716 }
 717 #endif /* CONFIG_UTF8 */
 718
 719 /* The common part of cp2u and cp2utf_8.  */
 720 static unicode_val_T
 721 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 722 {
 723         unicode_val_T u = from->highhalf[c - 0x80];
 724
 725         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 726         return u;
 727 }
 728
 729 /* Used for converting input from the terminal.  */
 730 unicode_val_T
 731 cp2u(int from, unsigned char c)
 732 {
 733         from &= ~SYSTEM_CHARSET_FLAG;
 734
 735         /* UTF-8 is a multibyte codepage and cannot be handled with
 736          * this function.  */
 737         assert(!is_cp_ptr_utf8(&codepages[from]));
 738         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 739
 740         if (c < 0x80) return c;
 741         else return cp2u_shared(&codepages[from], c);
 742 }
 743
 744 /* This slow and ugly code is used by the terminal utf_8_io */
 745 const unsigned char *
 746 cp2utf8(int from, int c)
 747 {
 748         from &= ~SYSTEM_CHARSET_FLAG;
 749
 750         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 751                 return strings[c];
 752
 753         return encode_utf8(cp2u_shared(&codepages[from], c));
 754 }
 755
 756 #ifdef CONFIG_UTF8
 757 unicode_val_T
 758 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 759 {
 760         unicode_val_T ret;
 761
 762         if (is_cp_utf8(codepage))
 763                 return utf8_to_unicode(string, end);
 764
 765         if (*string >= end)
 766                 return UCS_NO_CHAR;
 767
 768         ret = cp2u(codepage, **string);
 769         ++*string;
 770         return ret;
 771 }
 772 #endif  /* CONFIG_UTF8 */
 773
 774
 775 static void
 776 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 777 {
 778         unsigned char *p = encode_utf8(u);
 779
 780         while (p[1]) {
 781                 if (ct[*p].t) ct = ct[*p].u.tbl;
 782                 else {
 783                         struct conv_table *nct;
 784
 785                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 786                         if_assert_failed return;
 787
 788                         nct = mem_calloc(256, sizeof(*nct));
 789                         if (!nct) return;
 790                         new_translation_table(nct);
 791                         ct[*p].t = 1;
 792                         ct[*p].u.tbl = nct;
 793                         ct = nct;
 794                 }
 795                 p++;
 796         }
 797
 798         assertm(!ct[*p].t, "bad utf encoding #2");
 799         if_assert_failed return;
 800
 801         if (ct[*p].u.str == no_str)
 802                 ct[*p].u.str = str;
 803 }
 804
 805 /* A conversion table from some charset to UTF-8.
 806  * If it is from UTF-8 to UTF-8, it converts each byte separately.
 807  * Unlike in other translation tables, the strings in elements 0x80 to
 808  * 0xFF are allocated dynamically.  */
 809 struct conv_table utf_table[256];
 810 int utf_table_init = 1;
 811
 812 static void
 813 free_utf_table(void)
 814 {
 815         int i;
 816
 817         /* Cast away const.  */
 818         for (i = 128; i < 256; i++)
 819                 mem_free((unsigned char *) utf_table[i].u.str);
 820 }
 821
 822 static struct conv_table *
 823 get_translation_table_to_utf8(int from)
 824 {
 825         int i;
 826         static int lfr = -1;
 827
 828         if (from == -1) return NULL;
 829         from &= ~SYSTEM_CHARSET_FLAG;
 830         if (from == lfr) return utf_table;
 831         lfr = from;
 832         if (utf_table_init)
 833                 memset(utf_table, 0, sizeof(utf_table)),
 834                 utf_table_init = 0;
 835         else
 836                 free_utf_table();
 837
 838         for (i = 0; i < 128; i++)
 839                 utf_table[i].u.str = strings[i];
 840
 841         if (is_cp_ptr_utf8(&codepages[from])) {
 842                 for (i = 128; i < 256; i++)
 843                         utf_table[i].u.str = stracpy(strings[i]);
 844                 return utf_table;
 845         }
 846
 847         for (i = 128; i < 256; i++) {
 848                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 849
 850                 if (u == 0xFFFF)
 851                         utf_table[i].u.str = NULL;
 852                 else
 853                         utf_table[i].u.str = stracpy(encode_utf8(u));
 854         }
 855
 856         for (i = 0; codepages[from].table[i].c; i++) {
 857                 unicode_val_T u = codepages[from].table[i].u;
 858
 859                 if (!utf_table[codepages[from].table[i].c].u.str)
 860                         utf_table[codepages[from].table[i].c].u.str =
 861                                 stracpy(encode_utf8(u));
 862         }
 863
 864         for (i = 128; i < 256; i++)
 865                 if (!utf_table[i].u.str)
 866                         utf_table[i].u.str = stracpy(no_str);
 867
 868         return utf_table;
 869 }
 870
 871 /* A conversion table between two charsets, where the target is not UTF-8.  */
 872 static struct conv_table table[256];
 873 static int first = 1;
 874
 875 void
 876 free_conv_table(void)
 877 {
 878         if (!utf_table_init) free_utf_table();
 879         if (first) {
 880                 memset(table, 0, sizeof(table));
 881                 first = 0;
 882         }
 883         new_translation_table(table);
 884 }
 885
 886
 887 struct conv_table *
 888 get_translation_table(int from, int to)
 889 {
 890         static int lfr = -1;
 891         static int lto = -1;
 892
 893         from &= ~SYSTEM_CHARSET_FLAG;
 894         to &= ~SYSTEM_CHARSET_FLAG;
 895         if (first) {
 896                 memset(table, 0, sizeof(table));
 897                 first = 0;
 898         }
 899         if (/*from == to ||*/ from == -1 || to == -1)
 900                 return NULL;
 901         if (is_cp_ptr_utf8(&codepages[to]))
 902                 return get_translation_table_to_utf8(from);
 903         if (from == lfr && to == lto)
 904                 return table;
 905         lfr = from;
 906         lto = to;
 907         new_translation_table(table);
 908
 909         if (is_cp_ptr_utf8(&codepages[from])) {
 910                 int i;
 911
 912                 for (i = 0x80; i <= 0xFF; i++)
 913                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
 914                                 add_utf8(table,
 915                                          codepages[to].highhalf[i - 0x80],
 916                                          strings[i]);
 917
 918                 for (i = 0; codepages[to].table[i].c; i++)
 919                         add_utf8(table, codepages[to].table[i].u,
 920                                  strings[codepages[to].table[i].c]);
 921
 922                 for (i = 0; unicode_7b[i].x != -1; i++)
 923                         if (unicode_7b[i].x >= 0x80)
 924                                 add_utf8(table, unicode_7b[i].x,
 925                                          unicode_7b[i].s);
 926
 927         } else {
 928                 int i;
 929
 930                 for (i = 128; i < 256; i++) {
 931                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
 932                                 const unsigned char *u;
 933
 934                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
 935                                 if (u) table[i].u.str = u;
 936                         }
 937                 }
 938         }
 939
 940         return table;
 941 }
 942
 943 static inline int
 944 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 945 {
 946         while (l2) {
 947                 if (*s1 > *s2) return 1;
 948                 if (*s1 < *s2) return -1;
 949                 s1++;
 950                 s2++;
 951                 l2--;
 952         }
 953
 954         return *s2 ? -1 : 0;
 955 }
 956
 957 /* Entity cache debugging purpose. */
 958 #if 0
 959 #define DEBUG_ENTITY_CACHE
 960 #else
 961 #undef DEBUG_ENTITY_CACHE
 962 #endif
 963
 964 struct entity_cache {
 965         unsigned int hits;
 966         int strlen;
 967         int encoding;
 968         const unsigned char *result;
 969         unsigned char str[20]; /* Suffice in any case. */
 970 };
 971
 972 static int
 973 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 974 {
 975         if (a->hits == b->hits) return 0;
 976         if (a->hits > b->hits) return -1;
 977         else return 1;
 978 }
 979
 980 static int
 981 compare_entities(const void *key_, const void *element_)
 982 {
 983         struct string *key = (struct string *) key_;
 984         struct entity *element = (struct entity *) element_;
 985         int length = key->length;
 986         unsigned char *first = key->source;
 987         unsigned char *second = element->s;
 988
 989         return xxstrcmp(first, second, length);
 990 }
 991
 992 const unsigned char *
 993 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 994 {
 995 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 996 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 997                                    will go in [0] table */
 998         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 999         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1000         static int first_time = 1;
1001         unsigned int slen = 0;
1002         const unsigned char *result = NULL;
1003
1004         if (strlen <= 0) return NULL;
1005
1006 #ifdef CONFIG_UTF8
1007         /* TODO: caching UTF-8 */
1008         encoding &= ~SYSTEM_CHARSET_FLAG;
1009         if (is_cp_ptr_utf8(&codepages[encoding]))
1010                 goto skip;
1011 #endif /* CONFIG_UTF8 */
1012
1013         if (first_time) {
1014                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1015                 first_time = 0;
1016         }
1017
1018         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1019          * + google + slashdot + websites that result from a search for test on google,
1020          * + various ones) show a quite impressive improvment:
1021          * Top ten is:
1022          * 0: hits=2459 l=4 st='nbsp'
1023          * 1: hits=2152 l=6 st='eacute'
1024          * 2: hits=235 l=6 st='egrave'
1025          * 3: hits=136 l=6 st='agrave'
1026          * 4: hits=100 l=3 st='amp'
1027          * 5: hits=40 l=5 st='laquo'
1028          * 6: hits=8 l=4 st='copy'
1029          * 7: hits=5 l=2 st='gt'
1030          * 8: hits=2 l=2 st='lt'
1031          * 9: hits=1 l=6 st='middot'
1032          *
1033          * Most of the time cache hit ratio is near 95%.
1034          *
1035          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1036          * count is kept < 2 (worst case 1.58). Not so bad ;)
1037          *
1038          * --Zas */
1039
1040         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1041         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1042
1043         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1044                 int i;
1045
1046                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1047                         if (entity_cache[slen][i].encoding == encoding
1048                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1049 #ifdef DEBUG_ENTITY_CACHE
1050                                 static double total_iter = 0;
1051                                 static unsigned long hit_count = 0;
1052
1053                                 total_iter += i + 1;
1054                                 hit_count++;
1055                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1056 #endif
1057                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1058                                         entity_cache[slen][i].hits++;
1059                                 return entity_cache[slen][i].result;
1060                         }
1061                 }
1062 #ifdef DEBUG_ENTITY_CACHE
1063                 fprintf(stderr, "miss\n");
1064 #endif
1065         }
1066 #ifdef CONFIG_UTF8
1067 skip:
1068 #endif /* CONFIG_UTF8 */
1069         if (*str == '#') { /* Numeric entity. */
1070                 int l = (int) strlen;
1071                 unsigned char *st = (unsigned char *) str;
1072                 unicode_val_T n = 0;
1073
1074                 if (l == 1) goto end; /* &#; ? */
1075                 st++, l--;
1076                 if ((*st | 32) == 'x') { /* Hexadecimal */
1077
1078                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1079                         st++, l--;
1080                         do {
1081                                 unsigned char c = (*(st++) | 32);
1082
1083                                 if (isdigit(c))
1084                                         n = (n << 4) | (c - '0');
1085                                 else if (isxdigit(c))
1086                                         n = (n << 4) | (c - 'a' + 10);
1087                                 else
1088                                         goto end; /* Bad char. */
1089                         } while (--l);
1090                 } else { /* Decimal */
1091                         if (l > 10) goto end; /* 4294967295 max. */
1092                         do {
1093                                 unsigned char c = *(st++);
1094
1095                                 if (isdigit(c))
1096                                         n = n * 10 + c - '0';
1097                                 else
1098                                         goto end; /* Bad char. */
1099                                 /* Limit to 0xFFFFFFFF. */
1100                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1101                                         goto end;
1102                         } while (--l);
1103                 }
1104
1105                 result = u2cp(n, encoding);
1106
1107 #ifdef DEBUG_ENTITY_CACHE
1108                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1109 #endif
1110         } else { /* Text entity. */
1111                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1112                 struct entity *element = bsearch((void *) &key, entities,
1113                                                  N_ENTITIES,
1114                                                  sizeof(*element),
1115                                                  compare_entities);
1116
1117                 if (element) result = u2cp(element->c, encoding);
1118         }
1119
1120 #ifdef CONFIG_UTF8
1121         if (is_cp_ptr_utf8(&codepages[encoding])) {
1122                 return result;
1123         }
1124 #endif /* CONFIG_UTF8 */
1125 end:
1126         /* Take care of potential buffer overflow. */
1127         if (strlen < sizeof(entity_cache[slen][0].str)) {
1128                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1129
1130                 /* Copy new entry to cache. */
1131                 ece->hits = 1;
1132                 ece->strlen = strlen;
1133                 ece->encoding = encoding;
1134                 ece->result = result;
1135                 memcpy(ece->str, str, strlen);
1136                 ece->str[strlen] = '\0';
1137
1138                 /* Increment number of cache entries if possible. */
1139                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1140
1141 #ifdef DEBUG_ENTITY_CACHE
1142                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1143                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1144
1145 #endif
1146
1147                 /* Sort entries by hit order. */
1148                 if (nb_entity_cache[slen] > 1)
1149                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1150                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1151
1152 #ifdef DEBUG_ENTITY_CACHE
1153         {
1154                 unsigned int i;
1155
1156                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1157                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1158                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1159                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1160                                 entity_cache[slen][i].str);
1161                 fprintf(stderr, "-----------------\n");
1162         }
1163 #endif
1164         }
1165         return result;
1166 }
1167
1168 unsigned char *
1169 convert_string(struct conv_table *convert_table,
1170                unsigned char *chars, int charslen, int cp,
1171                enum convert_string_mode mode, int *length,
1172                void (*callback)(void *data, unsigned char *buf, int buflen),
1173                void *callback_data)
1174 {
1175         unsigned char *buffer;
1176         int bufferpos = 0;
1177         int charspos = 0;
1178
1179         if (!convert_table && !memchr(chars, '&', charslen)) {
1180                 if (callback) {
1181                         if (charslen) callback(callback_data, chars, charslen);
1182                         return NULL;
1183                 } else {
1184                         return memacpy(chars, charslen);
1185                 }
1186         }
1187
1188         /* Buffer allocation */
1189
1190         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1191         if (!buffer) return NULL;
1192
1193         /* Iterate ;-) */
1194
1195         while (charspos < charslen) {
1196                 const unsigned char *translit;
1197
1198 #define PUTC do { \
1199                 buffer[bufferpos++] = chars[charspos++]; \
1200                 translit = ""; \
1201                 goto flush; \
1202         } while (0)
1203
1204                 if (chars[charspos] != '&') {
1205                         struct conv_table *t;
1206                         int i;
1207
1208                         if (chars[charspos] < 128 || !convert_table) PUTC;
1209
1210                         t = convert_table;
1211                         i = charspos;
1212
1213                         while (t[chars[i]].t) {
1214                                 t = t[chars[i++]].u.tbl;
1215                                 if (i >= charslen) PUTC;
1216                         }
1217
1218                         translit = t[chars[i]].u.str;
1219                         charspos = i + 1;
1220
1221                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1222                         PUTC;
1223
1224                 } else {
1225                         int start = charspos + 1;
1226                         int i = start;
1227
1228                         while (i < charslen
1229                                && (isasciialpha(chars[i])
1230                                    || isdigit(chars[i])
1231                                    || (chars[i] == '#')))
1232                                 i++;
1233
1234                         /* This prevents bug 213: we were expanding "entities"
1235                          * in URL query strings. */
1236                         /* XXX: But this disables &nbsp&nbsp usage, which
1237                          * appears to be relatively common! --pasky */
1238                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1239                             && i > start
1240                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1241                                 translit = get_entity_string(&chars[start], i - start,
1242                                                       cp);
1243                                 if (chars[i] != ';') {
1244                                         /* Eat &nbsp &nbsp<foo> happily, but
1245                                          * pull back from the character after
1246                                          * entity string if it is not the valid
1247                                          * terminator. */
1248                                         i--;
1249                                 }
1250
1251                                 if (!translit) PUTC;
1252                                 charspos = i + (i < charslen);
1253                         } else PUTC;
1254                 }
1255
1256                 if (!translit[0]) continue;
1257
1258                 if (!translit[1]) {
1259                         buffer[bufferpos++] = translit[0];
1260                         translit = "";
1261                         goto flush;
1262                 }
1263
1264                 while (*translit) {
1265                         unsigned char *new;
1266
1267                         buffer[bufferpos++] = *(translit++);
1268 flush:
1269                         if (bufferpos & (ALLOC_GR - 1)) continue;
1270
1271                         if (callback) {
1272                                 buffer[bufferpos] = 0;
1273                                 callback(callback_data, buffer, bufferpos);
1274                                 bufferpos = 0;
1275                         } else {
1276                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1277                                 if (!new) {
1278                                         mem_free(buffer);
1279                                         return NULL;
1280                                 }
1281                                 buffer = new;
1282                         }
1283                 }
1284 #undef PUTC
1285         }
1286
1287         /* Say bye */
1288
1289         buffer[bufferpos] = 0;
1290         if (length) *length = bufferpos;
1291
1292         if (callback) {
1293                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1294                 mem_free(buffer);
1295                 return NULL;
1296         } else {
1297                 return buffer;
1298         }
1299 }
1300
1301
1302 #ifndef USE_FASTFIND
1303 int
1304 get_cp_index(unsigned char *name)
1305 {
1306         int i, a;
1307         int syscp = 0;
1308
1309         if (!strcasecmp(name, "System")) {
1310 #if HAVE_LANGINFO_CODESET
1311                 name = nl_langinfo(CODESET);
1312                 syscp = SYSTEM_CHARSET_FLAG;
1313 #else
1314                 name = "us-ascii";
1315 #endif
1316         }
1317
1318         for (i = 0; codepages[i].name; i++) {
1319                 for (a = 0; codepages[i].aliases[a]; a++) {
1320                         /* In the past, we looked for the longest substring
1321                          * in all the names; it is way too expensive, though:
1322                          *
1323                          *   %   cumulative   self              self     total
1324                          *  time   seconds   seconds    calls  us/call  us/call  name
1325                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1326                          *
1327                          * Anything called from redraw_screen() is in fact
1328                          * relatively expensive, even if it's called just
1329                          * once. So we will do a simple strcasecmp() here.
1330                          */
1331
1332                         if (!strcasecmp(name, codepages[i].aliases[a]))
1333                                 return i | syscp;
1334                 }
1335         }
1336
1337         if (syscp) {
1338                 return get_cp_index("us-ascii") | syscp;
1339         } else {
1340                 return -1;
1341         }
1342 }
1343
1344 #else
1345
/* Cursor of the charset alias enumeration: index of the current codepage in
 * codepages[] and of the current alias within that codepage.  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the alias enumeration to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
        i_name = i_alias = 0;
}
1356
1357 /* Returns a pointer to a struct that contains current key and data pointers
1358  * and increment internal pointer.  It returns NULL when key is NULL. */
1359 struct fastfind_key_value *
1360 charsets_list_next(void)
1361 {
1362         static struct fastfind_key_value kv;
1363
1364         if (!codepages[i_name].name) return NULL;
1365
1366         kv.key = codepages[i_name].aliases[i_alias];
1367         kv.data = (void *) &codepages[i_name]; /* cast away const */
1368
1369         if (codepages[i_name].aliases[i_alias + 1])
1370                 i_alias++;
1371         else {
1372                 i_name++;
1373                 i_alias = 0;
1374         }
1375
1376         return &kv;
1377 }
1378
1379 static struct fastfind_index ff_charsets_index
1380         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1381
1382 /* It searchs for a charset named @name or one of its aliases and
1383  * returns index for it or -1 if not found. */
1384 int
1385 get_cp_index(unsigned char *name)
1386 {
1387         const struct codepage_desc *codepage;
1388         int syscp = 0;
1389
1390         if (!strcasecmp(name, "System")) {
1391 #if HAVE_LANGINFO_CODESET
1392                 name = nl_langinfo(CODESET);
1393                 syscp = SYSTEM_CHARSET_FLAG;
1394 #else
1395                 name = "us-ascii";
1396 #endif
1397         }
1398
1399         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1400         if (codepage) {
1401                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1402                 return (codepage - codepages) | syscp;
1403
1404         } else if (syscp) {
1405                 return get_cp_index("us-ascii") | syscp;
1406
1407         } else {
1408                 return -1;
1409         }
1410 }
1411
1412 #endif /* USE_FASTFIND */
1413
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND.  */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1421
/* Release the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND.  */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1429
1430 unsigned char *
1431 get_cp_name(int cp_index)
1432 {
1433         if (cp_index < 0) return "none";
1434         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1435
1436         return codepages[cp_index].name;
1437 }
1438
1439 unsigned char *
1440 get_cp_mime_name(int cp_index)
1441 {
1442         if (cp_index < 0) return "none";
1443         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1444         if (!codepages[cp_index].aliases) return NULL;
1445
1446         return codepages[cp_index].aliases[0];
1447 }
1448
1449 int
1450 is_cp_utf8(int cp_index)
1451 {
1452         cp_index &= ~SYSTEM_CHARSET_FLAG;
1453         return is_cp_ptr_utf8(&codepages[cp_index]);
1454 }