src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #include "elinks.h"
  22
  23 #include "document/options.h"
  24 #include "intl/charsets.h"
  25 #include "util/conv.h"
  26 #include "util/error.h"
  27 #include "util/fastfind.h"
  28 #include "util/memory.h"
  29 #include "util/string.h"
  30
  31
  32 /* Fix namespace clash on MacOS. */
  33 #define table table_elinks
  34
/* One extra (codepage byte, Unicode character) pair for a codepage.
 * Arrays of these are scanned linearly by u2cp_(), which stops at the
 * first entry whose @c is 0.  */
struct table_entry {
        /* The byte in the codepage.  */
        unsigned char c;
        /* The Unicode character that is converted to @c.
         *
         * This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  46
/* Description of one codepage known to the converter.  */
struct codepage_desc {
        /* Presumably the primary, human-readable name of the
         * codepage -- TODO confirm against codepage.inc.  */
        unsigned char *name;
        /* Alias list of the codepage.  The UTF-8 codepage is
         * recognised by this pointer being equal to aliases_utf8
         * (see is_cp_ptr_utf8()).  */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table.  This table
         * is not used for translating from the codepage to Unicode.  */
        const struct table_entry *table;
};
  67
  68 #include "intl/codepage.inc"
  69 #include "intl/uni_7b.inc"
  70 #include "intl/entity.inc"
  71
  72
/* One-byte, NUL-terminated strings indexed by byte value, so that a
 * single byte can be handed out as a string without allocating.
 *
 * NOTE(review): entries 0x17 and 0x1F contain "\033" (ESC) instead of
 * "\027" and "\037"; every other entry i holds the byte i.  Confirm
 * whether this substitution is deliberate before relying on
 * strings[i][0] == i for those two indexes.  */
static const char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 107
 108 static void
 109 free_translation_table(struct conv_table *p)
 110 {
 111         int i;
 112
 113         for (i = 0; i < 256; i++)
 114                 if (p[i].t)
 115                         free_translation_table(p[i].u.tbl);
 116
 117         mem_free(p);
 118 }
 119
 120 /* A string used in conversion tables when there is no correct
 121  * conversion.  This is compared by address and therefore should be a
 122  * named array rather than a pointer so that it won't share storage
 123  * with any other string literal that happens to have the same
 124  * characters.  */
 125 static const unsigned char no_str[] = "*";
 126
 127 static void
 128 new_translation_table(struct conv_table *p)
 129 {
 130         int i;
 131
 132         for (i = 0; i < 256; i++)
 133                 if (p[i].t)
 134                         free_translation_table(p[i].u.tbl);
 135         for (i = 0; i < 128; i++) {
 136                 p[i].t = 0;
 137                 p[i].u.str = strings[i];
 138         }
 139         for (; i < 256; i++) {
 140                 p[i].t = 0;
 141                 p[i].u.str = no_str;
 142         }
 143 }
 144
 145 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 146 {                                                                                       \
 147         long _s = 0, _e = (entries) - 1;                                                \
 148                                                                                         \
 149         while (_s <= _e || !((result) = -1)) {                                          \
 150                 long _m = (_s + _e) / 2;                                                \
 151                                                                                         \
 152                 if ((table)[_m].entry == (key)) {                                       \
 153                         (result) = _m;                                                  \
 154                         break;                                                          \
 155                 }                                                                       \
 156                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 157                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 158         }                                                                               \
 159 }                                                                                       \
 160
/* Substitutes for the code points 0x80...0x9F, indexed by (u - 0x80).
 * u2cp_() retries the conversion with the substitute; an entry of 0
 * means there is no substitute and u2cp_() returns NULL.
 * NOTE(review): the values look like the Windows-1252 meanings of
 * these bytes, partly replaced with ASCII look-alikes -- confirm.  */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 167
/* Flag bit that may be set in a codepage number; the functions in
 * this file clear it before using the number as an index into
 * codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if @cp_ptr (a pointer to struct codepage_desc) describes the
 * UTF-8 codepage, which is identified by its alias list.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 170
 171 const unsigned char *
 172 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 173 {
 174         int j;
 175         int s;
 176
 177         if (u < 128) return strings[u];
 178
 179         to &= ~SYSTEM_CHARSET_FLAG;
 180
 181 #ifdef CONFIG_UTF8
 182         if (is_cp_ptr_utf8(&codepages[to]))
 183                 return encode_utf8(u);
 184 #endif /* CONFIG_UTF8 */
 185
 186         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 187          * special char NBSP_CHAR. */
 188         if (u == UCS_NO_BREAK_SPACE) {
 189                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 190                 else /* NBSP_MODE_ASCII */ return " ";
 191         }
 192         if (u == UCS_SOFT_HYPHEN) return "";
 193
 194         if (u < 0xa0) {
 195                 unicode_val_T strange = strange_chars[u - 0x80];
 196
 197                 if (!strange) return NULL;
 198                 return u2cp_(strange, to, nbsp_mode);
 199         }
 200
 201         if (u < 0xFFFF)
 202                 for (j = 0; j < 0x80; j++)
 203                         if (codepages[to].highhalf[j] == u)
 204                                 return strings[0x80 + j];
 205         for (j = 0; codepages[to].table[j].c; j++)
 206                 if (codepages[to].table[j].u == u)
 207                         return strings[codepages[to].table[j].c];
 208
 209         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 210         if (s != -1) return unicode_7b[s].s;
 211
 212         return no_str;
 213 }
 214
 215 static unsigned char utf_buffer[7];
 216
 217 #ifdef CONFIG_UTF8
 218 inline unsigned char *
 219 encode_utf8(unicode_val_T u)
 220 #else
 221 static unsigned char *
 222 encode_utf8(unicode_val_T u)
 223 #endif /* CONFIG_UTF8 */
 224 {
 225         memset(utf_buffer, 0, 7);
 226
 227         if (u < 0x80)
 228                 utf_buffer[0] = u;
 229         else if (u < 0x800)
 230                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 231                 utf_buffer[1] = 0x80 | (u & 0x3f);
 232         else if (u < 0x10000)
 233                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 234                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 235                 utf_buffer[2] = 0x80 | (u & 0x3f);
 236         else if (u < 0x200000)
 237                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 238                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 239                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 240                 utf_buffer[3] = 0x80 | (u & 0x3f);
 241         else if (u < 0x4000000)
 242                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 243                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 244                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 245                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 246                 utf_buffer[4] = 0x80 | (u & 0x3f);
 247         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 248                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 249                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 250                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 251                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 252                 utf_buffer[5] = 0x80 | (u & 0x3f);
 253
 254         return utf_buffer;
 255 }
 256
 257 #ifdef CONFIG_UTF8
/* Number of bytes in a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80...0xBF, 0xFE and 0xFF)
 * have length 1 here; utf8_to_unicode() detects and rejects them
 * separately.  */
static const char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 270
 271 inline int utf8charlen(const unsigned char *p)
 272 {
 273         return p ? utf8char_len_tab[*p] : 0;
 274 }
 275
 276 inline int
 277 strlen_utf8(unsigned char **str)
 278 {
 279         unsigned char *s = *str;
 280         unsigned char *end = strchr(s, '\0');
 281         int x;
 282         int len;
 283
 284         for (x = 0;; x++, s += len) {
 285                 len = utf8charlen(s);
 286                 if (s + len > end) break;
 287         }
 288         *str = s;
 289         return x;
 290 }
 291
/* True if byte @p has its top bit clear, i.e. is a one-byte (ASCII)
 * character.  */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can begin a character: it is either a one-byte
 * character or has its two top bits set.  @p is evaluated twice.  */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 294
 295 /* Start from @current and move back to @pos char. This pointer return. The
 296  * most left pointer is @start. */
 297 inline unsigned char *
 298 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 299 {
 300         if (current == NULL || start == NULL || pos < 0)
 301                 return NULL;
 302         while (pos > 0 && current != start) {
 303                 current--;
 304                 if (utf8_islead(*current))
 305                         pos--;
 306         }
 307         return current;
 308 }
 309
 310 /* Count number of standard terminal cells needed for displaying UTF-8
 311  * character. */
 312 int
 313 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 314 {
 315         unicode_val_T u;
 316
 317         if (end == NULL)
 318                 end = strchr(utf8_char, '\0');
 319
 320         if(!utf8_char || !end)
 321                 return -1;
 322
 323         u = utf8_to_unicode(&utf8_char, end);
 324
 325         return unicode_to_cell(u);
 326 }
 327
 328 /* Count number of standard terminal cells needed for displaying string
 329  * with UTF-8 characters. */
 330 int
 331 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 332 {
 333         int charlen, cell, cells = 0;
 334
 335         if (end == NULL)
 336                 end = strchr(string, '\0');
 337
 338         if(!string || !end)
 339                 return -1;
 340
 341         do {
 342                 charlen = utf8charlen(string);
 343                 if (string + charlen > end)
 344                         break;
 345
 346                 cell = utf8_char2cells(string, end);
 347                 if  (cell < 0)
 348                         return -1;
 349
 350                 cells += cell;
 351                 string += charlen;
 352         } while (1);
 353
 354         return cells;
 355 }
 356
 357 /* Count number of characters in string. */
 358 int
 359 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 360 {
 361         int charlen, chars = 0;
 362
 363         if (end == NULL)
 364                 end = strchr(string, '\0');
 365
 366         if(!string || !end)
 367                 return -1;
 368
 369         do {
 370                 charlen = utf8charlen(string);
 371                 if (string + charlen > end)
 372                         break;
 373
 374                 chars++;
 375                 string += charlen;
 376         } while (1);
 377
 378         return chars;
 379 }
 380
 381 /*
 382  * Count number of bytes from begining of the string needed for displaying
 383  * specified number of cells.
 384  */
 385 int
 386 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 387 {
 388         unsigned int bytes = 0, cells = 0;
 389
 390         assert(max_cells>=0);
 391
 392         if (end == NULL)
 393                 end = strchr(string, '\0');
 394
 395         if(!string || !end)
 396                 return -1;
 397
 398         do {
 399                 int cell = utf8_char2cells(&string[bytes], end);
 400                 if (cell < 0)
 401                         return -1;
 402
 403                 cells += cell;
 404                 if (cells > max_cells)
 405                         break;
 406
 407                 bytes += utf8charlen(&string[bytes]);
 408
 409                 if (string + bytes > end) {
 410                         bytes = end - string;
 411                         break;
 412                 }
 413         } while(1);
 414
 415         return bytes;
 416 }
 417
 418 /* Take @max steps forward from @string in the specified @way, but
 419  * not going past @end.  Return the resulting address.  Store the
 420  * number of steps taken to *@count, unless @count is NULL.
 421  *
 422  * This assumes the text is valid UTF-8, and @string and @end point to
 423  * character boundaries.  If not, it doesn't crash but the results may
 424  * be inconsistent.
 425  *
 426  * This function can do some of the same jobs as utf8charlen(),
 427  * utf8_cells2bytes(), and strlen_utf8().  */
 428 unsigned char *
 429 utf8_step_forward(unsigned char *string, unsigned char *end,
 430                   int max, enum utf8_step way, int *count)
 431 {
 432         int steps = 0;
 433         unsigned char *current = string;
 434
 435         assert(string);
 436         assert(max >= 0);
 437         if_assert_failed goto invalid_arg;
 438         if (end == NULL)
 439                 end = strchr(string, '\0');
 440
 441         switch (way) {
 442         case UTF8_STEP_CHARACTERS:
 443                 while (steps < max && current < end) {
 444                         ++current;
 445                         if (utf8_islead(*current))
 446                                 ++steps;
 447                 }
 448                 break;
 449
 450         case UTF8_STEP_CELLS_FEWER:
 451         case UTF8_STEP_CELLS_MORE:
 452                 while (steps < max) {
 453                         unicode_val_T u;
 454                         unsigned char *prev = current;
 455                         int width;
 456
 457                         u = utf8_to_unicode(&current, end);
 458                         if (u == UCS_NO_CHAR) {
 459                                 /* Assume the incomplete sequence
 460                                  * costs one cell.  */
 461                                 current = end;
 462                                 ++steps;
 463                                 break;
 464                         }
 465
 466                         width = unicode_to_cell(u);
 467                         if (way == UTF8_STEP_CELLS_FEWER
 468                             && steps + width > max) {
 469                                 /* Back off.  */
 470                                 current = prev;
 471                                 break;
 472                         }
 473                         steps += width;
 474                 }
 475                 break;
 476
 477         default:
 478                 INTERNAL("impossible enum utf8_step");
 479         }
 480
 481 invalid_arg:
 482         if (count)
 483                 *count = steps;
 484         return current;
 485 }
 486
 487 /* Take @max steps backward from @string in the specified @way, but
 488  * not going past @start.  Return the resulting address.  Store the
 489  * number of steps taken to *@count, unless @count is NULL.
 490  *
 491  * This assumes the text is valid UTF-8, and @string and @start point
 492  * to character boundaries.  If not, it doesn't crash but the results
 493  * may be inconsistent.
 494  *
 495  * This function can do some of the same jobs as utf8_prevchar().  */
 496 unsigned char *
 497 utf8_step_backward(unsigned char *string, unsigned char *start,
 498                    int max, enum utf8_step way, int *count)
 499 {
 500         int steps = 0;
 501         unsigned char *current = string;
 502
 503         assert(string);
 504         assert(start);
 505         assert(max >= 0);
 506         if_assert_failed goto invalid_arg;
 507
 508         switch (way) {
 509         case UTF8_STEP_CHARACTERS:
 510                 while (steps < max && current > start) {
 511                         --current;
 512                         if (utf8_islead(*current))
 513                                 ++steps;
 514                 }
 515                 break;
 516
 517         case UTF8_STEP_CELLS_FEWER:
 518         case UTF8_STEP_CELLS_MORE:
 519                 while (steps < max) {
 520                         unsigned char *prev = current;
 521                         unsigned char *look;
 522                         unicode_val_T u;
 523                         int width;
 524
 525                         if (current <= start)
 526                                 break;
 527                         do {
 528                                 --current;
 529                         } while (current > start && !utf8_islead(*current));
 530
 531                         look = current;
 532                         u = utf8_to_unicode(&look, prev);
 533                         if (u == UCS_NO_CHAR) {
 534                                 /* Assume the incomplete sequence
 535                                  * costs one cell.  */
 536                                 width = 1;
 537                         } else
 538                                 width = unicode_to_cell(u);
 539
 540                         if (way == UTF8_STEP_CELLS_FEWER
 541                             && steps + width > max) {
 542                                 /* Back off.  */
 543                                 current = prev;
 544                                 break;
 545                         }
 546                         steps += width;
 547                 }
 548                 break;
 549
 550         default:
 551                 INTERNAL("impossible enum utf8_step");
 552         }
 553
 554 invalid_arg:
 555         if (count)
 556                 *count = steps;
 557         return current;
 558 }
 559
 560 /*
 561  * Find out number of standard terminal collumns needed for displaying symbol
 562  * (glyph) which represents Unicode character c.
 563  *
 564  * TODO: Use wcwidth when it is available. This seems to require:
 565  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 566  * - Define _XOPEN_SOURCE and include <wchar.h>.
 567  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 568  *   matches ISO 10646 in all locales.)
 569  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 570  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 571  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 572  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 573  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 574  * character is apparently not supported in all locales.  Why is that?
 575  * - Perhaps there is standardese that requires supported characters
 576  *   to be convertable to multibyte form.  Then ELinks could just pick
 577  *   some UTF-8 locale for its wcwidth purposes.
 578  * - Perhaps wcwidth can even return different nonnegative values for
 579  *   the same ISO 10646 character in different locales.  Then ELinks
 580  *   would have to set LC_CTYPE to match at least the terminal's
 581  *   charset (which may differ from the LC_CTYPE environment variable,
 582  *   especially when the master process is serving a slave terminal).
 583  *   But there is no guarantee that the libc supports all the same
 584  *   charsets as ELinks does.
 585  * For now, it seems safest to avoid the potentially locale-dependent
 586  * libc version of wcwidth, and instead use a hardcoded mapping.
 587  *
 588  * @return      2 for double-width glyph, 1 for others.
 589  *              TODO: May be extended to return 0 for zero-width glyphs
 590  *              (like composing, maybe unprintable too).
 591  */
 592 inline int
 593 unicode_to_cell(unicode_val_T c)
 594 {
 595         if (c >= 0x1100
 596                 && (c <= 0x115f                 /* Hangul Jamo */
 597                 || c == 0x2329
 598                 || c == 0x232a
 599                 || (c >= 0x2e80 && c <= 0xa4cf
 600                         && c != 0x303f)         /* CJK ... Yi */
 601                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 602                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 603                                                                 Ideographs */
 604                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 605                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 606                 || (c >= 0xffe0 && c <= 0xffe6)
 607                 || (c >= 0x20000 && c <= 0x2fffd)
 608                 || (c >= 0x30000 && c <= 0x3fffd)))
 609                 return 2;
 610
 611         return 1;
 612 }
 613
 614 /* Fold the case of a Unicode character, so that hotkeys in labels can
 615  * be compared case-insensitively.  It is unspecified whether the
 616  * result will be in upper or lower case.  */
 617 unicode_val_T
 618 unicode_fold_label_case(unicode_val_T c)
 619 {
 620 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 621         return towlower(c);
 622 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 623         /* For now, this supports only ASCII.  It would be possible to
 624          * use code generated from CaseFolding.txt of Unicode if the
 625          * acknowledgements required by http://www.unicode.org/copyright.html
 626          * were added to associated documentation of ELinks.  */
 627         if (c >= 0x41 && c <= 0x5A)
 628                 return c + 0x20;
 629         else
 630                 return c;
 631 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 632 }
 633
 634 inline unicode_val_T
 635 utf8_to_unicode(unsigned char **string, const unsigned char *end)
 636 {
 637         unsigned char *str = *string;
 638         unicode_val_T u;
 639         int length;
 640
 641         length = utf8char_len_tab[str[0]];
 642
 643         if (str + length > end) {
 644                 return UCS_NO_CHAR;
 645         }
 646
 647         switch (length) {
 648                 case 1:         /* U+0000 to U+007F */
 649                         if (str[0] >= 0x80) {
 650 invalid_utf8:
 651                                 ++*string;
 652                                 return UCS_REPLACEMENT_CHARACTER;
 653                         }
 654                         u = str[0];
 655                         break;
 656                 case 2:         /* U+0080 to U+07FF */
 657                         if ((str[1] & 0xc0) != 0x80)
 658                                 goto invalid_utf8;
 659                         u = (str[0] & 0x1f) << 6;
 660                         u += (str[1] & 0x3f);
 661                         if (u < 0x80)
 662                                 goto invalid_utf8;
 663                         break;
 664                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 665                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 666                                 goto invalid_utf8;
 667                         u = (str[0] & 0x0f) << 12;
 668                         u += ((str[1] & 0x3f) << 6);
 669                         u += (str[2] & 0x3f);
 670                         if (u < 0x800 || is_utf16_surrogate(u))
 671                                 goto invalid_utf8;
 672                         break;
 673                 case 4:         /* U+10000 to U+1FFFFF */
 674                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 675                             || (str[3] & 0xc0) != 0x80)
 676                                 goto invalid_utf8;
 677                         u = (str[0] & 0x0f) << 18;
 678                         u += ((str[1] & 0x3f) << 12);
 679                         u += ((str[2] & 0x3f) << 6);
 680                         u += (str[3] & 0x3f);
 681                         if (u < 0x10000)
 682                                 goto invalid_utf8;
 683                         break;
 684                 case 5:         /* U+200000 to U+3FFFFFF */
 685                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 686                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 687                                 goto invalid_utf8;
 688                         u = (str[0] & 0x0f) << 24;
 689                         u += ((str[1] & 0x3f) << 18);
 690                         u += ((str[2] & 0x3f) << 12);
 691                         u += ((str[3] & 0x3f) << 6);
 692                         u += (str[4] & 0x3f);
 693                         if (u < 0x200000)
 694                                 goto invalid_utf8;
 695                         break;
 696                 case 6:         /* U+4000000 to U+7FFFFFFF */
 697                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 698                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 699                             || (str[5] & 0xc0) != 0x80)
 700                                 goto invalid_utf8;
 701                         u = (str[0] & 0x01) << 30;
 702                         u += ((str[1] & 0x3f) << 24);
 703                         u += ((str[2] & 0x3f) << 18);
 704                         u += ((str[3] & 0x3f) << 12);
 705                         u += ((str[4] & 0x3f) << 6);
 706                         u += (str[5] & 0x3f);
 707                         if (u < 0x4000000)
 708                                 goto invalid_utf8;
 709                         break;
 710                 default:
 711                         INTERNAL("utf8char_len_tab out of range");
 712                         goto invalid_utf8;
 713         }
 714         *string = str + length;
 715         return u;
 716 }
 717 #endif /* CONFIG_UTF8 */
 718
 719 /* The common part of cp2u and cp2utf_8.  */
 720 static unicode_val_T
 721 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 722 {
 723         unicode_val_T u = from->highhalf[c - 0x80];
 724
 725         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 726         return u;
 727 }
 728
 729 /* Used for converting input from the terminal.  */
 730 unicode_val_T
 731 cp2u(int from, unsigned char c)
 732 {
 733         from &= ~SYSTEM_CHARSET_FLAG;
 734
 735         /* UTF-8 is a multibyte codepage and cannot be handled with
 736          * this function.  */
 737         assert(!is_cp_ptr_utf8(&codepages[from]));
 738         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 739
 740         if (c < 0x80) return c;
 741         else return cp2u_shared(&codepages[from], c);
 742 }
 743
 744 /* This slow and ugly code is used by the terminal utf_8_io */
 745 const unsigned char *
 746 cp2utf8(int from, int c)
 747 {
 748         from &= ~SYSTEM_CHARSET_FLAG;
 749
 750         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 751                 return strings[c];
 752
 753         return encode_utf8(cp2u_shared(&codepages[from], c));
 754 }
 755
 756 #ifdef CONFIG_UTF8
 757 unicode_val_T
 758 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 759 {
 760         unicode_val_T ret;
 761
 762         if (is_cp_utf8(codepage))
 763                 return utf8_to_unicode(string, end);
 764
 765         if (*string >= end)
 766                 return UCS_NO_CHAR;
 767
 768         ret = cp2u(codepage, **string);
 769         ++*string;
 770         return ret;
 771 }
 772 #endif  /* CONFIG_UTF8 */
 773
 774
 775 static void
 776 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 777 {
 778         unsigned char *p = encode_utf8(u);
 779
 780         while (p[1]) {
 781                 if (ct[*p].t) ct = ct[*p].u.tbl;
 782                 else {
 783                         struct conv_table *nct;
 784
 785                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 786                         if_assert_failed return;
 787
 788                         nct = mem_calloc(256, sizeof(*nct));
 789                         if (!nct) return;
 790                         new_translation_table(nct);
 791                         ct[*p].t = 1;
 792                         ct[*p].u.tbl = nct;
 793                         ct = nct;
 794                 }
 795                 p++;
 796         }
 797
 798         assertm(!ct[*p].t, "bad utf encoding #2");
 799         if_assert_failed return;
 800
 801         if (ct[*p].u.str == no_str)
 802                 ct[*p].u.str = str;
 803 }
 804
/* A conversion table from some charset to UTF-8.
 * If it is from UTF-8 to UTF-8, it converts each byte separately.
 * Unlike in other translation tables, the strings in elements 0x80 to
 * 0xFF are allocated dynamically.  */
struct conv_table utf_table[256];
/* Nonzero until get_translation_table_to_utf8() has filled utf_table
 * for the first time.  While it is set there are no allocated strings
 * in utf_table, so free_utf_table() must not be called.  */
int utf_table_init = 1;
 811
 812 static void
 813 free_utf_table(void)
 814 {
 815         int i;
 816
 817         /* Cast away const.  */
 818         for (i = 128; i < 256; i++)
 819                 mem_free((unsigned char *) utf_table[i].u.str);
 820 }
 821
 822 static struct conv_table *
 823 get_translation_table_to_utf8(int from)
 824 {
 825         int i;
 826         static int lfr = -1;
 827
 828         if (from == -1) return NULL;
 829         from &= ~SYSTEM_CHARSET_FLAG;
 830         if (from == lfr) return utf_table;
 831         lfr = from;
 832         if (utf_table_init) {
 833                 memset(utf_table, 0, sizeof(utf_table));
 834                 utf_table_init = 0;
 835         } else
 836                 free_utf_table();
 837
 838         for (i = 0; i < 128; i++)
 839                 utf_table[i].u.str = strings[i];
 840
 841         if (is_cp_ptr_utf8(&codepages[from])) {
 842                 for (i = 128; i < 256; i++)
 843                         utf_table[i].u.str = stracpy(strings[i]);
 844                 return utf_table;
 845         }
 846
 847         for (i = 128; i < 256; i++) {
 848                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 849
 850                 if (u == 0xFFFF)
 851                         utf_table[i].u.str = NULL;
 852                 else
 853                         utf_table[i].u.str = stracpy(encode_utf8(u));
 854         }
 855
 856         for (i = 0; codepages[from].table[i].c; i++) {
 857                 unicode_val_T u = codepages[from].table[i].u;
 858
 859                 if (!utf_table[codepages[from].table[i].c].u.str)
 860                         utf_table[codepages[from].table[i].c].u.str =
 861                                 stracpy(encode_utf8(u));
 862         }
 863
 864         for (i = 128; i < 256; i++)
 865                 if (!utf_table[i].u.str)
 866                         utf_table[i].u.str = stracpy(no_str);
 867
 868         return utf_table;
 869 }
 870
/* A conversion table between two charsets, where the target is not UTF-8.  */
static struct conv_table table[256];
/* Nonzero until table has been zero-filled once.  Both
 * get_translation_table() and free_conv_table() perform that one-time
 * memset before they touch table.  */
static int first = 1;
 874
 875 void
 876 free_conv_table(void)
 877 {
 878         if (!utf_table_init) free_utf_table();
 879         if (first) {
 880                 memset(table, 0, sizeof(table));
 881                 first = 0;
 882         }
 883         new_translation_table(table);
 884 }
 885
 886
 887 struct conv_table *
 888 get_translation_table(int from, int to)
 889 {
 890         static int lfr = -1;
 891         static int lto = -1;
 892
 893         from &= ~SYSTEM_CHARSET_FLAG;
 894         to &= ~SYSTEM_CHARSET_FLAG;
 895         if (first) {
 896                 memset(table, 0, sizeof(table));
 897                 first = 0;
 898         }
 899         if (/*from == to ||*/ from == -1 || to == -1)
 900                 return NULL;
 901         if (is_cp_ptr_utf8(&codepages[to]))
 902                 return get_translation_table_to_utf8(from);
 903         if (from == lfr && to == lto)
 904                 return table;
 905         lfr = from;
 906         lto = to;
 907         new_translation_table(table);
 908
 909         if (is_cp_ptr_utf8(&codepages[from])) {
 910                 int i;
 911
 912                 /* Map U+00A0 and U+00AD the same way as u2cp() would.  */
 913                 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
 914                 add_utf8(table, UCS_SOFT_HYPHEN, "");
 915
 916                 for (i = 0x80; i <= 0xFF; i++)
 917                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
 918                                 add_utf8(table,
 919                                          codepages[to].highhalf[i - 0x80],
 920                                          strings[i]);
 921
 922                 for (i = 0; codepages[to].table[i].c; i++)
 923                         add_utf8(table, codepages[to].table[i].u,
 924                                  strings[codepages[to].table[i].c]);
 925
 926                 for (i = 0; unicode_7b[i].x != -1; i++)
 927                         if (unicode_7b[i].x >= 0x80)
 928                                 add_utf8(table, unicode_7b[i].x,
 929                                          unicode_7b[i].s);
 930
 931         } else {
 932                 int i;
 933
 934                 for (i = 128; i < 256; i++) {
 935                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
 936                                 const unsigned char *u;
 937
 938                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
 939                                 if (u) table[i].u.str = u;
 940                         }
 941                 }
 942         }
 943
 944         return table;
 945 }
 946
 947 static inline int
 948 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 949 {
 950         while (l2) {
 951                 if (*s1 > *s2) return 1;
 952                 if (*s1 < *s2) return -1;
 953                 s1++;
 954                 s2++;
 955                 l2--;
 956         }
 957
 958         return *s2 ? -1 : 0;
 959 }
 960
/* Define DEBUG_ENTITY_CACHE to trace entity cache activity on stderr. */
 962 #if 0
 963 #define DEBUG_ENTITY_CACHE
 964 #else
 965 #undef DEBUG_ENTITY_CACHE
 966 #endif
 967
/* One slot of the cache kept by get_entity_string().  */
struct entity_cache {
        /* Number of times this slot was stored or returned; saturates
         * instead of wrapping.  Slots are sorted by this value.  */
        unsigned int hits;
        /* Length of the entity text stored in @str.  */
        int strlen;
        /* Codepage index the cached @result is valid for.  */
        int encoding;
        /* Cached conversion result; NULL if the entity could not be
         * resolved.  */
        const unsigned char *result;
        /* NUL-terminated copy of the entity text.  */
        unsigned char str[20]; /* Suffice in any case. */
};
 975
 976 /* comparison function for qsort() */
 977 static int
 978 hits_cmp(const void *v1, const void *v2)
 979 {
 980         const struct entity_cache *a = v1, *b = v2;
 981
 982         if (a->hits == b->hits) return 0;
 983         if (a->hits > b->hits) return -1;
 984         else return 1;
 985 }
 986
 987 static int
 988 compare_entities(const void *key_, const void *element_)
 989 {
 990         struct string *key = (struct string *) key_;
 991         struct entity *element = (struct entity *) element_;
 992         int length = key->length;
 993         unsigned char *first = key->source;
 994         unsigned char *second = element->s;
 995
 996         return xxstrcmp(first, second, length);
 997 }
 998
 999 const unsigned char *
1000 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1001 {
1002 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
1003 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1004                                    will go in [0] table */
1005         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1006         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1007         static int first_time = 1;
1008         unsigned int slen = 0;
1009         const unsigned char *result = NULL;
1010
1011         if (strlen <= 0) return NULL;
1012
1013 #ifdef CONFIG_UTF8
1014         /* TODO: caching UTF-8 */
1015         encoding &= ~SYSTEM_CHARSET_FLAG;
1016         if (is_cp_ptr_utf8(&codepages[encoding]))
1017                 goto skip;
1018 #endif /* CONFIG_UTF8 */
1019
1020         if (first_time) {
1021                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1022                 first_time = 0;
1023         }
1024
1025         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1026          * + google + slashdot + websites that result from a search for test on google,
1027          * + various ones) show quite impressive improvment:
1028          * Top ten is:
1029          * 0: hits=2459 l=4 st='nbsp'
1030          * 1: hits=2152 l=6 st='eacute'
1031          * 2: hits=235 l=6 st='egrave'
1032          * 3: hits=136 l=6 st='agrave'
1033          * 4: hits=100 l=3 st='amp'
1034          * 5: hits=40 l=5 st='laquo'
1035          * 6: hits=8 l=4 st='copy'
1036          * 7: hits=5 l=2 st='gt'
1037          * 8: hits=2 l=2 st='lt'
1038          * 9: hits=1 l=6 st='middot'
1039          *
1040          * Most of the time cache hit ratio is near 95%.
1041          *
1042          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1043          * count is kept < 2 (worst case 1.58). Not so bad ;)
1044          *
1045          * --Zas */
1046
1047         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1048         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1049
1050         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1051                 int i;
1052
1053                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1054                         if (entity_cache[slen][i].encoding == encoding
1055                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1056 #ifdef DEBUG_ENTITY_CACHE
1057                                 static double total_iter = 0;
1058                                 static unsigned long hit_count = 0;
1059
1060                                 total_iter += i + 1;
1061                                 hit_count++;
1062                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1063 #endif
1064                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1065                                         entity_cache[slen][i].hits++;
1066                                 return entity_cache[slen][i].result;
1067                         }
1068                 }
1069 #ifdef DEBUG_ENTITY_CACHE
1070                 fprintf(stderr, "miss\n");
1071 #endif
1072         }
1073 #ifdef CONFIG_UTF8
1074 skip:
1075 #endif /* CONFIG_UTF8 */
1076         if (*str == '#') { /* Numeric entity. */
1077                 int l = (int) strlen;
1078                 unsigned char *st = (unsigned char *) str;
1079                 unicode_val_T n = 0;
1080
1081                 if (l == 1) goto end; /* &#; ? */
1082                 st++, l--;
1083                 if ((*st | 32) == 'x') { /* Hexadecimal */
1084
1085                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1086                         st++, l--;
1087                         do {
1088                                 unsigned char c = (*(st++) | 32);
1089
1090                                 if (isdigit(c))
1091                                         n = (n << 4) | (c - '0');
1092                                 else if (isxdigit(c))
1093                                         n = (n << 4) | (c - 'a' + 10);
1094                                 else
1095                                         goto end; /* Bad char. */
1096                         } while (--l);
1097                 } else { /* Decimal */
1098                         if (l > 10) goto end; /* 4294967295 max. */
1099                         do {
1100                                 unsigned char c = *(st++);
1101
1102                                 if (isdigit(c))
1103                                         n = n * 10 + c - '0';
1104                                 else
1105                                         goto end; /* Bad char. */
1106                                 /* Limit to 0xFFFFFFFF. */
1107                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1108                                         goto end;
1109                         } while (--l);
1110                 }
1111
1112                 result = u2cp(n, encoding);
1113
1114 #ifdef DEBUG_ENTITY_CACHE
1115                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1116 #endif
1117         } else { /* Text entity. */
1118                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1119                 struct entity *element = bsearch((void *) &key, entities,
1120                                                  N_ENTITIES,
1121                                                  sizeof(*element),
1122                                                  compare_entities);
1123
1124                 if (element) result = u2cp(element->c, encoding);
1125         }
1126
1127 #ifdef CONFIG_UTF8
1128         if (is_cp_ptr_utf8(&codepages[encoding])) {
1129                 return result;
1130         }
1131 #endif /* CONFIG_UTF8 */
1132 end:
1133         /* Take care of potential buffer overflow. */
1134         if (strlen < sizeof(entity_cache[slen][0].str)) {
1135                 struct entity_cache *ece;
1136
1137                 /* Sort entries by hit order. */
1138                 if (nb_entity_cache[slen] > 1)
1139                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1140                               sizeof(entity_cache[slen][0]), hits_cmp);
1141
1142                 /* Increment number of cache entries if possible.
1143                  * Else, just replace the least used entry.  */
1144                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1145                 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1146
1147                 /* Copy new entry to cache. */
1148                 ece->hits = 1;
1149                 ece->strlen = strlen;
1150                 ece->encoding = encoding;
1151                 ece->result = result;
1152                 memcpy(ece->str, str, strlen);
1153                 ece->str[strlen] = '\0';
1154
1155
1156 #ifdef DEBUG_ENTITY_CACHE
1157                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1158                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1159
1160         {
1161                 unsigned int i;
1162
1163                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1164                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1165                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1166                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1167                                 entity_cache[slen][i].str);
1168                 fprintf(stderr, "-----------------\n");
1169         }
1170 #endif  /* DEBUG_ENTITY_CACHE */
1171         }
1172         return result;
1173 }
1174
1175 unsigned char *
1176 convert_string(struct conv_table *convert_table,
1177                unsigned char *chars, int charslen, int cp,
1178                enum convert_string_mode mode, int *length,
1179                void (*callback)(void *data, unsigned char *buf, int buflen),
1180                void *callback_data)
1181 {
1182         unsigned char *buffer;
1183         int bufferpos = 0;
1184         int charspos = 0;
1185
1186         if (!convert_table && !memchr(chars, '&', charslen)) {
1187                 if (callback) {
1188                         if (charslen) callback(callback_data, chars, charslen);
1189                         return NULL;
1190                 } else {
1191                         return memacpy(chars, charslen);
1192                 }
1193         }
1194
1195         /* Buffer allocation */
1196
1197         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1198         if (!buffer) return NULL;
1199
1200         /* Iterate ;-) */
1201
1202         while (charspos < charslen) {
1203                 const unsigned char *translit;
1204
1205 #define PUTC do { \
1206                 buffer[bufferpos++] = chars[charspos++]; \
1207                 translit = ""; \
1208                 goto flush; \
1209         } while (0)
1210
1211                 if (chars[charspos] != '&') {
1212                         struct conv_table *t;
1213                         int i;
1214
1215                         if (chars[charspos] < 128 || !convert_table) PUTC;
1216
1217                         t = convert_table;
1218                         i = charspos;
1219
1220                         while (t[chars[i]].t) {
1221                                 t = t[chars[i++]].u.tbl;
1222                                 if (i >= charslen) PUTC;
1223                         }
1224
1225                         translit = t[chars[i]].u.str;
1226                         charspos = i + 1;
1227
1228                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1229                         PUTC;
1230
1231                 } else {
1232                         int start = charspos + 1;
1233                         int i = start;
1234
1235                         while (i < charslen
1236                                && (isasciialpha(chars[i])
1237                                    || isdigit(chars[i])
1238                                    || (chars[i] == '#')))
1239                                 i++;
1240
1241                         /* This prevents bug 213: we were expanding "entities"
1242                          * in URL query strings. */
1243                         /* XXX: But this disables &nbsp&nbsp usage, which
1244                          * appears to be relatively common! --pasky */
1245                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1246                             && i > start
1247                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1248                                 translit = get_entity_string(&chars[start], i - start,
1249                                                       cp);
1250                                 if (chars[i] != ';') {
1251                                         /* Eat &nbsp &nbsp<foo> happily, but
1252                                          * pull back from the character after
1253                                          * entity string if it is not the valid
1254                                          * terminator. */
1255                                         i--;
1256                                 }
1257
1258                                 if (!translit) PUTC;
1259                                 charspos = i + (i < charslen);
1260                         } else PUTC;
1261                 }
1262
1263                 if (!translit[0]) continue;
1264
1265                 if (!translit[1]) {
1266                         buffer[bufferpos++] = translit[0];
1267                         translit = "";
1268                         goto flush;
1269                 }
1270
1271                 while (*translit) {
1272                         unsigned char *new;
1273
1274                         buffer[bufferpos++] = *(translit++);
1275 flush:
1276                         if (bufferpos & (ALLOC_GR - 1)) continue;
1277
1278                         if (callback) {
1279                                 buffer[bufferpos] = 0;
1280                                 callback(callback_data, buffer, bufferpos);
1281                                 bufferpos = 0;
1282                         } else {
1283                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1284                                 if (!new) {
1285                                         mem_free(buffer);
1286                                         return NULL;
1287                                 }
1288                                 buffer = new;
1289                         }
1290                 }
1291 #undef PUTC
1292         }
1293
1294         /* Say bye */
1295
1296         buffer[bufferpos] = 0;
1297         if (length) *length = bufferpos;
1298
1299         if (callback) {
1300                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1301                 mem_free(buffer);
1302                 return NULL;
1303         } else {
1304                 return buffer;
1305         }
1306 }
1307
1308
1309 #ifndef USE_FASTFIND
1310 int
1311 get_cp_index(const unsigned char *name)
1312 {
1313         int i, a;
1314         int syscp = 0;
1315
1316         if (!strcasecmp(name, "System")) {
1317 #if HAVE_LANGINFO_CODESET
1318                 name = nl_langinfo(CODESET);
1319                 syscp = SYSTEM_CHARSET_FLAG;
1320 #else
1321                 name = "us-ascii";
1322 #endif
1323         }
1324
1325         for (i = 0; codepages[i].name; i++) {
1326                 for (a = 0; codepages[i].aliases[a]; a++) {
1327                         /* In the past, we looked for the longest substring
1328                          * in all the names; it is way too expensive, though:
1329                          *
1330                          *   %   cumulative   self              self     total
1331                          *  time   seconds   seconds    calls  us/call  us/call  name
1332                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1333                          *
1334                          * Anything called from redraw_screen() is in fact
1335                          * relatively expensive, even if it's called just
1336                          * once. So we will do a simple strcasecmp() here.
1337                          */
1338
1339                         if (!strcasecmp(name, codepages[i].aliases[a]))
1340                                 return i | syscp;
1341                 }
1342         }
1343
1344         if (syscp) {
1345                 return get_cp_index("us-ascii") | syscp;
1346         } else {
1347                 return -1;
1348         }
1349 }
1350
1351 #else
1352
/* Iteration cursor shared by charsets_list_reset() and
 * charsets_list_next(): i_name indexes codepages[], i_alias indexes
 * the aliases[] of that codepage.  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;
1355
1356 /* Reset internal list pointer */
1357 void
1358 charsets_list_reset(void)
1359 {
1360         i_name = 0;
1361         i_alias = 0;
1362 }
1363
1364 /* Returns a pointer to a struct that contains current key and data pointers
1365  * and increment internal pointer.  It returns NULL when key is NULL. */
1366 struct fastfind_key_value *
1367 charsets_list_next(void)
1368 {
1369         static struct fastfind_key_value kv;
1370
1371         if (!codepages[i_name].name) return NULL;
1372
1373         kv.key = codepages[i_name].aliases[i_alias];
1374         kv.data = (void *) &codepages[i_name]; /* cast away const */
1375
1376         if (codepages[i_name].aliases[i_alias + 1])
1377                 i_alias++;
1378         else {
1379                 i_name++;
1380                 i_alias = 0;
1381         }
1382
1383         return &kv;
1384 }
1385
/* Fastfind index over the codepage aliases.  Its keys are supplied by
 * charsets_list_reset() and charsets_list_next(); get_cp_index()
 * searches it.  */
static struct fastfind_index ff_charsets_index
        = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1388
1389 /* It searchs for a charset named @name or one of its aliases and
1390  * returns index for it or -1 if not found. */
1391 int
1392 get_cp_index(const unsigned char *name)
1393 {
1394         const struct codepage_desc *codepage;
1395         int syscp = 0;
1396
1397         if (!strcasecmp(name, "System")) {
1398 #if HAVE_LANGINFO_CODESET
1399                 name = nl_langinfo(CODESET);
1400                 syscp = SYSTEM_CHARSET_FLAG;
1401 #else
1402                 name = "us-ascii";
1403 #endif
1404         }
1405
1406         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1407         if (codepage) {
1408                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1409                 return (codepage - codepages) | syscp;
1410
1411         } else if (syscp) {
1412                 return get_cp_index("us-ascii") | syscp;
1413
1414         } else {
1415                 return -1;
1416         }
1417 }
1418
1419 #endif /* USE_FASTFIND */
1420
/* Prepare charset lookup by name.  With USE_FASTFIND this passes
 * ff_charsets_index to fastfind_index() with FF_COMPRESS; without it
 * the function does nothing.  */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
1428
/* Counterpart of init_charsets_lookup().  With USE_FASTFIND this
 * passes ff_charsets_index to fastfind_done(); without it the function
 * does nothing.  */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_done(&ff_charsets_index);
#endif
}
1436
1437 /* Get the codepage's name for displaying to the user, or NULL if
1438  * @cp_index is one past the end.  In the future, we might want to
1439  * localize these with gettext.  So it may be best not to use this
1440  * function if the name will have to be converted back to an
1441  * index.  */
1442 unsigned char *
1443 get_cp_name(int cp_index)
1444 {
1445         if (cp_index < 0) return "none";
1446         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1447
1448         return codepages[cp_index].name;
1449 }
1450
1451 /* Get the codepage's name for saving to a configuration file.  These
1452  * names can be converted back to indexes, even in future versions of
1453  * ELinks.  */
1454 unsigned char *
1455 get_cp_config_name(int cp_index)
1456 {
1457         if (cp_index < 0) return "none";
1458         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1459         if (!codepages[cp_index].aliases) return NULL;
1460
1461         return codepages[cp_index].aliases[0];
1462 }
1463
1464 /* Get the codepage's name for sending to a library or server that
1465  * understands MIME charset names.  This function irreversibly maps
1466  * the "System" codepage to the underlying charset.  */
1467 unsigned char *
1468 get_cp_mime_name(int cp_index)
1469 {
1470         if (cp_index < 0) return "none";
1471         cp_index &= ~SYSTEM_CHARSET_FLAG;
1472         if (!codepages[cp_index].aliases) return NULL;
1473
1474         return codepages[cp_index].aliases[0];
1475 }
1476
1477 int
1478 is_cp_utf8(int cp_index)
1479 {
1480         cp_index &= ~SYSTEM_CHARSET_FLAG;
1481         return is_cp_ptr_utf8(&codepages[cp_index]);
1482 }