src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #ifdef HAVE_ICONV
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #endif
  25
  26 #include "elinks.h"
  27
  28 #include "document/options.h"
  29 #include "intl/charsets.h"
  30 #include "util/conv.h"
  31 #include "util/error.h"
  32 #include "util/fastfind.h"
  33 #include "util/hash.h"
  34 #include "util/memory.h"
  35 #include "util/string.h"
  36
  37
  38 /* Fix namespace clash on MacOS. */
  39 #define table table_elinks
  40
/* One extra Unicode-to-codepage mapping: Unicode character @u is written
 * as codepage byte @c.  u2cp_() scans arrays of these until it reaches
 * an entry whose @c is 0, so 0 acts as the list terminator.  */
struct table_entry {
        /* The codepage byte; 0 terminates the array.  */
        unsigned char c;
        /* This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  52
/* Description of one codepage.  The code in this file looks these up
 * as codepages[index]; the array itself is not defined in this part of
 * the file (presumably it comes from intl/codepage.inc).  */
struct codepage_desc {
        unsigned char *name;
        /* Compared by address against aliases_utf8 (see is_cp_ptr_utf8)
         * to recognize the UTF-8 codepage.  */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table.  This table
         * is not used for translating from the codepage to Unicode.  */
        const struct table_entry *table;

        /* Whether use iconv for translation */
        unsigned int iconv:1;
};
  76
  77 #include "intl/codepage.inc"
  78 #include "intl/uni_7b.inc"
  79 #include "intl/entity.inc"
  80
  81 /* Declare the external-linkage inline functions defined in this file.
  82  * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
  83  * called.  The functions are not declared inline in charsets.h
  84  * because C99 6.7.4p6 says that every external-linkage function
  85  * declared inline shall be defined in the same translation unit.
  86  * The non-inline declarations in charsets.h also make sure that the
  87  * compiler emits global definitions for the symbols so that the
  88  * functions can be called from other translation units.  */
  89 NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
  90 NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
  91 NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
  92 NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
  93                                                const unsigned char *end);
  94
/* One-byte, NUL-terminated strings indexed by byte value.  The
 * conversion functions in this file return pointers into this table
 * instead of building single-character strings at run time.
 *
 * NOTE(review): entries 0x17 and 0x1F contain "\033" (ESC) rather than
 * "\027" and "\037".  Whether that is deliberate cannot be told from
 * this file alone -- confirm before "fixing" it.  */
static const char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 129
#ifdef HAVE_ICONV
/* iconv conversion descriptor.  It starts out as (iconv_t)-1, the value
 * iconv_open() returns on failure, presumably meaning "not open"; the
 * code that opens and uses it is not in this part of the file.  */
static iconv_t iconv_cd = (iconv_t)-1;
#endif
 133
 134 static void
 135 free_translation_table(struct conv_table *p)
 136 {
 137         int i;
 138
 139         for (i = 0; i < 256; i++)
 140                 if (p[i].t)
 141                         free_translation_table(p[i].u.tbl);
 142
 143         mem_free(p);
 144 }
 145
 146 /* A string used in conversion tables when there is no correct
 147  * conversion.  This is compared by address and therefore should be a
 148  * named array rather than a pointer so that it won't share storage
 149  * with any other string literal that happens to have the same
 150  * characters.  */
 151 static const unsigned char no_str[] = "*";
 152
 153 static void
 154 new_translation_table(struct conv_table *p)
 155 {
 156         int i;
 157
 158         for (i = 0; i < 256; i++)
 159                 if (p[i].t)
 160                         free_translation_table(p[i].u.tbl);
 161         for (i = 0; i < 128; i++) {
 162                 p[i].t = 0;
 163                 p[i].u.str = strings[i];
 164         }
 165         for (; i < 256; i++) {
 166                 p[i].t = 0;
 167                 p[i].u.str = no_str;
 168         }
 169         p->iconv_cp = -1;
 170 }
 171
/* Binary search in @table, an array of @entries structures sorted in
 * ascending order of their member @entry, for an element whose @entry
 * equals @key.  On success @result becomes the index of that element.
 *
 * The loop condition doubles as the "not found" path: once _s > _e, the
 * right-hand side of || assigns -1 to @result, and because !(-1) is 0
 * the loop then terminates.
 *
 * The arguments are evaluated several times, so they must not have
 * side effects.  */
#define BIN_SEARCH(table, entry, entries, key, result)                                  \
{                                                                                       \
        long _s = 0, _e = (entries) - 1;                                                \
                                                                                        \
        while (_s <= _e || !((result) = -1)) {                                          \
                long _m = (_s + _e) / 2;                                                \
                                                                                        \
                if ((table)[_m].entry == (key)) {                                       \
                        (result) = _m;                                                  \
                        break;                                                          \
                }                                                                       \
                if ((table)[_m].entry > (key)) _e = _m - 1;                             \
                if ((table)[_m].entry < (key)) _s = _m + 1;                             \
        }                                                                               \
}                                                                                       \
 187
/* Substitutes for the code points 0x80...0x9F, indexed by (u - 0x80).
 * u2cp_() converts the substitute instead of the original value; a 0
 * entry means there is no substitute and u2cp_() returns NULL.
 * NOTE(review): the non-zero values look like the Windows-1252
 * interpretation of these bytes, partly approximated in ASCII --
 * confirm.  */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 194
/* Flag bit that may be set in a codepage number.  The functions in this
 * file clear it before using the number as an index into codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if @cp_ptr (a struct codepage_desc *) is the UTF-8 codepage,
 * recognized by its alias list being aliases_utf8 itself.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 197
 198 const unsigned char *
 199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 200 {
 201         int j;
 202         int s;
 203
 204         if (u < 128) return strings[u];
 205
 206         to &= ~SYSTEM_CHARSET_FLAG;
 207
 208         if (is_cp_ptr_utf8(&codepages[to]))
 209                 return encode_utf8(u);
 210
 211         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 212          * special char NBSP_CHAR. */
 213         if (u == UCS_NO_BREAK_SPACE) {
 214                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 215                 else /* NBSP_MODE_ASCII */ return " ";
 216         }
 217         if (u == UCS_SOFT_HYPHEN) return "";
 218
 219         if (u < 0xa0) {
 220                 unicode_val_T strange = strange_chars[u - 0x80];
 221
 222                 if (!strange) return NULL;
 223                 return u2cp_(strange, to, nbsp_mode);
 224         }
 225
 226         if (u < 0xFFFF)
 227                 for (j = 0; j < 0x80; j++)
 228                         if (codepages[to].highhalf[j] == u)
 229                                 return strings[0x80 + j];
 230         for (j = 0; codepages[to].table[j].c; j++)
 231                 if (codepages[to].table[j].u == u)
 232                         return strings[codepages[to].table[j].c];
 233
 234         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 235         if (s != -1) return unicode_7b[s].s;
 236
 237         return no_str;
 238 }
 239
 240 static unsigned char utf_buffer[7];
 241
 242 NONSTATIC_INLINE unsigned char *
 243 encode_utf8(unicode_val_T u)
 244 {
 245         memset(utf_buffer, 0, 7);
 246
 247         if (u < 0x80)
 248                 utf_buffer[0] = u;
 249         else if (u < 0x800)
 250                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 251                 utf_buffer[1] = 0x80 | (u & 0x3f);
 252         else if (u < 0x10000)
 253                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 254                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 255                 utf_buffer[2] = 0x80 | (u & 0x3f);
 256         else if (u < 0x200000)
 257                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 258                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 259                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 260                 utf_buffer[3] = 0x80 | (u & 0x3f);
 261         else if (u < 0x4000000)
 262                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 263                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 264                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 265                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 266                 utf_buffer[4] = 0x80 | (u & 0x3f);
 267         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 268                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 269                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 270                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 271                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 272                 utf_buffer[5] = 0x80 | (u & 0x3f);
 273
 274         return utf_buffer;
 275 }
 276
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (continuation bytes 0x80...0xBF,
 * and 0xFE, 0xFF) are listed with length 1 so that scanning always
 * advances; utf8_to_unicode() rejects them separately.  */
static const char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 289
 290 #ifdef CONFIG_UTF8
 291 NONSTATIC_INLINE int
 292 utf8charlen(const unsigned char *p)
 293 {
 294         return p ? utf8char_len_tab[*p] : 0;
 295 }
 296
 297 int
 298 strlen_utf8(unsigned char **str)
 299 {
 300         unsigned char *s = *str;
 301         unsigned char *end = strchr(s, '\0');
 302         int x;
 303         int len;
 304
 305         for (x = 0;; x++, s += len) {
 306                 len = utf8charlen(s);
 307                 if (s + len > end) break;
 308         }
 309         *str = s;
 310         return x;
 311 }
 312
 313 #define utf8_issingle(p) (((p) & 0x80) == 0)
 314 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 315
 316 /* Start from @current and move back to @pos char. This pointer return. The
 317  * most left pointer is @start. */
 318 unsigned char *
 319 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 320 {
 321         if (current == NULL || start == NULL || pos < 0)
 322                 return NULL;
 323         while (pos > 0 && current != start) {
 324                 current--;
 325                 if (utf8_islead(*current))
 326                         pos--;
 327         }
 328         return current;
 329 }
 330
 331 /* Count number of standard terminal cells needed for displaying UTF-8
 332  * character. */
 333 int
 334 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 335 {
 336         unicode_val_T u;
 337
 338         if (end == NULL)
 339                 end = strchr(utf8_char, '\0');
 340
 341         if(!utf8_char || !end)
 342                 return -1;
 343
 344         u = utf8_to_unicode(&utf8_char, end);
 345
 346         return unicode_to_cell(u);
 347 }
 348
 349 /* Count number of standard terminal cells needed for displaying string
 350  * with UTF-8 characters. */
 351 int
 352 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 353 {
 354         int charlen, cell, cells = 0;
 355
 356         if (end == NULL)
 357                 end = strchr(string, '\0');
 358
 359         if(!string || !end)
 360                 return -1;
 361
 362         do {
 363                 charlen = utf8charlen(string);
 364                 if (string + charlen > end)
 365                         break;
 366
 367                 cell = utf8_char2cells(string, end);
 368                 if  (cell < 0)
 369                         return -1;
 370
 371                 cells += cell;
 372                 string += charlen;
 373         } while (1);
 374
 375         return cells;
 376 }
 377
 378 /* Count number of characters in string. */
 379 int
 380 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 381 {
 382         int charlen, chars = 0;
 383
 384         if (end == NULL)
 385                 end = strchr(string, '\0');
 386
 387         if(!string || !end)
 388                 return -1;
 389
 390         do {
 391                 charlen = utf8charlen(string);
 392                 if (string + charlen > end)
 393                         break;
 394
 395                 chars++;
 396                 string += charlen;
 397         } while (1);
 398
 399         return chars;
 400 }
 401
 402 /*
 403  * Count number of bytes from begining of the string needed for displaying
 404  * specified number of cells.
 405  */
 406 int
 407 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 408 {
 409         unsigned int bytes = 0, cells = 0;
 410
 411         assert(max_cells>=0);
 412
 413         if (end == NULL)
 414                 end = strchr(string, '\0');
 415
 416         if(!string || !end)
 417                 return -1;
 418
 419         do {
 420                 int cell = utf8_char2cells(&string[bytes], end);
 421                 if (cell < 0)
 422                         return -1;
 423
 424                 cells += cell;
 425                 if (cells > max_cells)
 426                         break;
 427
 428                 bytes += utf8charlen(&string[bytes]);
 429
 430                 if (string + bytes > end) {
 431                         bytes = end - string;
 432                         break;
 433                 }
 434         } while(1);
 435
 436         return bytes;
 437 }
 438
 439 /* Take @max steps forward from @string in the specified @way, but
 440  * not going past @end.  Return the resulting address.  Store the
 441  * number of steps taken to *@count, unless @count is NULL.
 442  *
 443  * This assumes the text is valid UTF-8, and @string and @end point to
 444  * character boundaries.  If not, it doesn't crash but the results may
 445  * be inconsistent.
 446  *
 447  * This function can do some of the same jobs as utf8charlen(),
 448  * utf8_cells2bytes(), and strlen_utf8().  */
 449 unsigned char *
 450 utf8_step_forward(unsigned char *string, unsigned char *end,
 451                   int max, enum utf8_step way, int *count)
 452 {
 453         int steps = 0;
 454         unsigned char *current = string;
 455
 456         assert(string);
 457         assert(max >= 0);
 458         if_assert_failed goto invalid_arg;
 459         if (end == NULL)
 460                 end = strchr(string, '\0');
 461
 462         switch (way) {
 463         case UTF8_STEP_CHARACTERS:
 464                 while (steps < max && current < end) {
 465                         ++current;
 466                         if (utf8_islead(*current))
 467                                 ++steps;
 468                 }
 469                 break;
 470
 471         case UTF8_STEP_CELLS_FEWER:
 472         case UTF8_STEP_CELLS_MORE:
 473                 while (steps < max && current < end) {
 474                         unicode_val_T u;
 475                         unsigned char *prev = current;
 476                         int width;
 477
 478                         u = utf8_to_unicode(&current, end);
 479                         if (u == UCS_NO_CHAR) {
 480                                 /* Assume the incomplete sequence
 481                                  * costs one cell.  */
 482                                 current = end;
 483                                 ++steps;
 484                                 break;
 485                         }
 486
 487                         width = unicode_to_cell(u);
 488                         if (way == UTF8_STEP_CELLS_FEWER
 489                             && steps + width > max) {
 490                                 /* Back off.  */
 491                                 current = prev;
 492                                 break;
 493                         }
 494                         steps += width;
 495                 }
 496                 break;
 497
 498         default:
 499                 INTERNAL("impossible enum utf8_step");
 500         }
 501
 502 invalid_arg:
 503         if (count)
 504                 *count = steps;
 505         return current;
 506 }
 507
 508 /* Take @max steps backward from @string in the specified @way, but
 509  * not going past @start.  Return the resulting address.  Store the
 510  * number of steps taken to *@count, unless @count is NULL.
 511  *
 512  * This assumes the text is valid UTF-8, and @string and @start point
 513  * to character boundaries.  If not, it doesn't crash but the results
 514  * may be inconsistent.
 515  *
 516  * This function can do some of the same jobs as utf8_prevchar().  */
 517 unsigned char *
 518 utf8_step_backward(unsigned char *string, unsigned char *start,
 519                    int max, enum utf8_step way, int *count)
 520 {
 521         int steps = 0;
 522         unsigned char *current = string;
 523
 524         assert(string);
 525         assert(start);
 526         assert(max >= 0);
 527         if_assert_failed goto invalid_arg;
 528
 529         switch (way) {
 530         case UTF8_STEP_CHARACTERS:
 531                 while (steps < max && current > start) {
 532                         --current;
 533                         if (utf8_islead(*current))
 534                                 ++steps;
 535                 }
 536                 break;
 537
 538         case UTF8_STEP_CELLS_FEWER:
 539         case UTF8_STEP_CELLS_MORE:
 540                 while (steps < max) {
 541                         unsigned char *prev = current;
 542                         unsigned char *look;
 543                         unicode_val_T u;
 544                         int width;
 545
 546                         if (current <= start)
 547                                 break;
 548                         do {
 549                                 --current;
 550                         } while (current > start && !utf8_islead(*current));
 551
 552                         look = current;
 553                         u = utf8_to_unicode(&look, prev);
 554                         if (u == UCS_NO_CHAR) {
 555                                 /* Assume the incomplete sequence
 556                                  * costs one cell.  */
 557                                 width = 1;
 558                         } else
 559                                 width = unicode_to_cell(u);
 560
 561                         if (way == UTF8_STEP_CELLS_FEWER
 562                             && steps + width > max) {
 563                                 /* Back off.  */
 564                                 current = prev;
 565                                 break;
 566                         }
 567                         steps += width;
 568                 }
 569                 break;
 570
 571         default:
 572                 INTERNAL("impossible enum utf8_step");
 573         }
 574
 575 invalid_arg:
 576         if (count)
 577                 *count = steps;
 578         return current;
 579 }
 580
 581 /*
 * Find out number of standard terminal columns needed for displaying symbol
 583  * (glyph) which represents Unicode character c.
 584  *
 585  * TODO: Use wcwidth when it is available. This seems to require:
 586  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 587  * - Define _XOPEN_SOURCE and include <wchar.h>.
 588  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 589  *   matches ISO 10646 in all locales.)
 590  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 591  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 592  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 593  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 594  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 595  * character is apparently not supported in all locales.  Why is that?
 596  * - Perhaps there is standardese that requires supported characters
 *   to be convertible to multibyte form.  Then ELinks could just pick
 598  *   some UTF-8 locale for its wcwidth purposes.
 599  * - Perhaps wcwidth can even return different nonnegative values for
 600  *   the same ISO 10646 character in different locales.  Then ELinks
 601  *   would have to set LC_CTYPE to match at least the terminal's
 602  *   charset (which may differ from the LC_CTYPE environment variable,
 603  *   especially when the master process is serving a slave terminal).
 604  *   But there is no guarantee that the libc supports all the same
 605  *   charsets as ELinks does.
 606  * For now, it seems safest to avoid the potentially locale-dependent
 607  * libc version of wcwidth, and instead use a hardcoded mapping.
 608  *
 609  * @return      2 for double-width glyph, 1 for others.
 610  *              TODO: May be extended to return 0 for zero-width glyphs
 611  *              (like composing, maybe unprintable too).
 612  */
 613 NONSTATIC_INLINE int
 614 unicode_to_cell(unicode_val_T c)
 615 {
 616         if (c >= 0x1100
 617                 && (c <= 0x115f                 /* Hangul Jamo */
 618                 || c == 0x2329
 619                 || c == 0x232a
 620                 || (c >= 0x2e80 && c <= 0xa4cf
 621                         && c != 0x303f)         /* CJK ... Yi */
 622                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 623                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 624                                                                 Ideographs */
 625                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 626                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 627                 || (c >= 0xffe0 && c <= 0xffe6)
 628                 || (c >= 0x20000 && c <= 0x2fffd)
 629                 || (c >= 0x30000 && c <= 0x3fffd)))
 630                 return 2;
 631
 632         return 1;
 633 }
 634
 635 /* Fold the case of a Unicode character, so that hotkeys in labels can
 636  * be compared case-insensitively.  It is unspecified whether the
 637  * result will be in upper or lower case.  */
 638 unicode_val_T
 639 unicode_fold_label_case(unicode_val_T c)
 640 {
 641 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 642         return towlower(c);
 643 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 644         /* For now, this supports only ASCII.  It would be possible to
 645          * use code generated from CaseFolding.txt of Unicode if the
 646          * acknowledgements required by http://www.unicode.org/copyright.html
 647          * were added to associated documentation of ELinks.  */
 648         if (c >= 0x41 && c <= 0x5A)
 649                 return c + 0x20;
 650         else
 651                 return c;
 652 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 653 }
 654 #endif /* CONFIG_UTF8 */
 655
 656 NONSTATIC_INLINE unicode_val_T
 657 utf8_to_unicode(unsigned char **string, const unsigned char *end)
 658 {
 659         unsigned char *str = *string;
 660         unicode_val_T u;
 661         int length;
 662
 663         length = utf8char_len_tab[str[0]];
 664
 665         if (str + length > end) {
 666                 return UCS_NO_CHAR;
 667         }
 668
 669         switch (length) {
 670                 case 1:         /* U+0000 to U+007F */
 671                         if (str[0] >= 0x80) {
 672 invalid_utf8:
 673                                 ++*string;
 674                                 return UCS_REPLACEMENT_CHARACTER;
 675                         }
 676                         u = str[0];
 677                         break;
 678                 case 2:         /* U+0080 to U+07FF */
 679                         if ((str[1] & 0xc0) != 0x80)
 680                                 goto invalid_utf8;
 681                         u = (str[0] & 0x1f) << 6;
 682                         u += (str[1] & 0x3f);
 683                         if (u < 0x80)
 684                                 goto invalid_utf8;
 685                         break;
 686                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 687                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 688                                 goto invalid_utf8;
 689                         u = (str[0] & 0x0f) << 12;
 690                         u += ((str[1] & 0x3f) << 6);
 691                         u += (str[2] & 0x3f);
 692                         if (u < 0x800 || is_utf16_surrogate(u))
 693                                 goto invalid_utf8;
 694                         break;
 695                 case 4:         /* U+10000 to U+1FFFFF */
 696                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 697                             || (str[3] & 0xc0) != 0x80)
 698                                 goto invalid_utf8;
 699                         u = (str[0] & 0x0f) << 18;
 700                         u += ((str[1] & 0x3f) << 12);
 701                         u += ((str[2] & 0x3f) << 6);
 702                         u += (str[3] & 0x3f);
 703                         if (u < 0x10000)
 704                                 goto invalid_utf8;
 705                         break;
 706                 case 5:         /* U+200000 to U+3FFFFFF */
 707                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 708                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 709                                 goto invalid_utf8;
 710                         u = (str[0] & 0x0f) << 24;
 711                         u += ((str[1] & 0x3f) << 18);
 712                         u += ((str[2] & 0x3f) << 12);
 713                         u += ((str[3] & 0x3f) << 6);
 714                         u += (str[4] & 0x3f);
 715                         if (u < 0x200000)
 716                                 goto invalid_utf8;
 717                         break;
 718                 case 6:         /* U+4000000 to U+7FFFFFFF */
 719                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 720                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 721                             || (str[5] & 0xc0) != 0x80)
 722                                 goto invalid_utf8;
 723                         u = (str[0] & 0x01) << 30;
 724                         u += ((str[1] & 0x3f) << 24);
 725                         u += ((str[2] & 0x3f) << 18);
 726                         u += ((str[3] & 0x3f) << 12);
 727                         u += ((str[4] & 0x3f) << 6);
 728                         u += (str[5] & 0x3f);
 729                         if (u < 0x4000000)
 730                                 goto invalid_utf8;
 731                         break;
 732                 default:
 733                         INTERNAL("utf8char_len_tab out of range");
 734                         goto invalid_utf8;
 735         }
 736         *string = str + length;
 737         return u;
 738 }
 739
 740 /* The common part of cp2u and cp2utf_8.  */
 741 static unicode_val_T
 742 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 743 {
 744         unicode_val_T u = from->highhalf[c - 0x80];
 745
 746         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 747         return u;
 748 }
 749
 750 /* Used for converting input from the terminal.  */
 751 unicode_val_T
 752 cp2u(int from, unsigned char c)
 753 {
 754         from &= ~SYSTEM_CHARSET_FLAG;
 755
 756         /* UTF-8 is a multibyte codepage and cannot be handled with
 757          * this function.  */
 758         assert(!is_cp_ptr_utf8(&codepages[from]));
 759         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 760
 761         if (c < 0x80) return c;
 762         else return cp2u_shared(&codepages[from], c);
 763 }
 764
 765 /* This slow and ugly code is used by the terminal utf_8_io */
 766 const unsigned char *
 767 cp2utf8(int from, int c)
 768 {
 769         from &= ~SYSTEM_CHARSET_FLAG;
 770
 771         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 772                 return strings[c];
 773
 774         return encode_utf8(cp2u_shared(&codepages[from], c));
 775 }
 776
 777 unicode_val_T
 778 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
 779 {
 780         unicode_val_T ret;
 781
 782         if (is_cp_utf8(codepage))
 783                 return utf8_to_unicode(string, end);
 784
 785         if (*string >= end)
 786                 return UCS_NO_CHAR;
 787
 788         ret = cp2u(codepage, **string);
 789         ++*string;
 790         return ret;
 791 }
 792
 793
 794 #ifdef CONFIG_COMBINE
 795 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
 796 unicode_val_T **combined;
 797 struct hash *combined_hash;
 798
 799 unicode_val_T
 800 get_combined(unicode_val_T *data, int length)
 801 {
 802         struct hash_item *item;
 803         unicode_val_T *key;
 804         int i, indeks;
 805
 806         assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
 807         if_assert_failed return UCS_NO_CHAR;
 808
 809         if (!combined_hash) combined_hash = init_hash8();
 810         if (!combined_hash) return UCS_NO_CHAR;
 811         item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
 812
 813         if (item) return (unicode_val_T)(long)item->value;
 814         if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
 815
 816         key = mem_alloc((length + 1) * sizeof(*key));
 817         if (!key) return UCS_NO_CHAR;
 818         for (i = 0; i < length; i++)
 819                 key[i] = data[i];
 820         key[i] = UCS_END_COMBINED;
 821
 822         last_combined++;
 823         indeks = last_combined - UCS_BEGIN_COMBINED;
 824
 825         combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
 826         if (!combined) {
 827                 mem_free(key);
 828                 last_combined--;
 829                 return UCS_NO_CHAR;
 830         }
 831         combined[indeks] = key;
 832         item = add_hash_item(combined_hash, (unsigned char *)key,
 833                              length * sizeof(*data), (void *)(long)(last_combined));
 834         if (!item) {
 835                 last_combined--;
 836                 mem_free(key);
 837                 return UCS_NO_CHAR;
 838         }
 839         return last_combined;
 840 }
 841
 842 void
 843 free_combined()
 844 {
 845         int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
 846
 847         if (combined_hash)
 848                 free_hash(&combined_hash);
 849         for (i = 0; i < end; i++)
 850                 mem_free(combined[i]);
 851         mem_free_if(combined);
 852 }
 853 #endif /* CONFIG_COMBINE */
 854
 855
 856 static void
 857 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 858 {
 859         unsigned char *p = encode_utf8(u);
 860
 861         while (p[1]) {
 862                 if (ct[*p].t) ct = ct[*p].u.tbl;
 863                 else {
 864                         struct conv_table *nct;
 865
 866                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 867                         if_assert_failed return;
 868
 869                         nct = mem_calloc(256, sizeof(*nct));
 870                         if (!nct) return;
 871                         new_translation_table(nct);
 872                         ct[*p].t = 1;
 873                         ct[*p].u.tbl = nct;
 874                         ct = nct;
 875                 }
 876                 p++;
 877         }
 878
 879         assertm(!ct[*p].t, "bad utf encoding #2");
 880         if_assert_failed return;
 881
 882         if (ct[*p].u.str == no_str)
 883                 ct[*p].u.str = str;
 884 }
 885
 886 /* A conversion table from some charset to UTF-8.
 887  * If it is from UTF-8 to UTF-8, it converts each byte separately.
 888  * Unlike in other translation tables, the strings in elements 0x80 to
 889  * 0xFF are allocated dynamically.  */
 890 struct conv_table utf_table[256];
 891 int utf_table_init = 1;
 892
 893 static void
 894 free_utf_table(void)
 895 {
 896         int i;
 897
 898         /* Cast away const.  */
 899         for (i = 128; i < 256; i++)
 900                 mem_free((unsigned char *) utf_table[i].u.str);
 901 }
 902
 903 static struct conv_table *
 904 get_translation_table_to_utf8(int from)
 905 {
 906         int i;
 907         static int lfr = -1;
 908
 909         if (from == -1) return NULL;
 910         from &= ~SYSTEM_CHARSET_FLAG;
 911         if (from == lfr) return utf_table;
 912         lfr = from;
 913         if (utf_table_init) {
 914                 memset(utf_table, 0, sizeof(utf_table));
 915                 utf_table_init = 0;
 916         } else
 917                 free_utf_table();
 918
 919         for (i = 0; i < 128; i++)
 920                 utf_table[i].u.str = strings[i];
 921
 922         if (is_cp_ptr_utf8(&codepages[from])) {
 923                 for (i = 128; i < 256; i++)
 924                         utf_table[i].u.str = stracpy(strings[i]);
 925                 return utf_table;
 926         }
 927
 928         for (i = 128; i < 256; i++) {
 929                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 930
 931                 if (u == 0xFFFF)
 932                         utf_table[i].u.str = NULL;
 933                 else
 934                         utf_table[i].u.str = stracpy(encode_utf8(u));
 935         }
 936
 937         for (i = 0; codepages[from].table[i].c; i++) {
 938                 unicode_val_T u = codepages[from].table[i].u;
 939
 940                 if (!utf_table[codepages[from].table[i].c].u.str)
 941                         utf_table[codepages[from].table[i].c].u.str =
 942                                 stracpy(encode_utf8(u));
 943         }
 944
 945         for (i = 128; i < 256; i++)
 946                 if (!utf_table[i].u.str)
 947                         utf_table[i].u.str = stracpy(no_str);
 948
 949         return utf_table;
 950 }
 951
 952 /* A conversion table between two charsets, where the target is not UTF-8.  */
 953 static struct conv_table table[256];
 954 static int first = 1;
 955
 956 void
 957 free_conv_table(void)
 958 {
 959         if (!utf_table_init) free_utf_table();
 960         if (first) {
 961                 memset(table, 0, sizeof(table));
 962                 first = 0;
 963         }
 964         new_translation_table(table);
 965 #ifdef HAVE_ICONV
 966         if (iconv_cd != (iconv_t)-1) {
 967                 iconv_close(iconv_cd);
 968                 iconv_cd = (iconv_t)-1;
 969         }
 970 #endif
 971 }
 972
 973
 974 struct conv_table *
 975 get_translation_table(int from, int to)
 976 {
 977         static int lfr = -1;
 978         static int lto = -1;
 979
 980         from &= ~SYSTEM_CHARSET_FLAG;
 981         to &= ~SYSTEM_CHARSET_FLAG;
 982         if (first) {
 983                 memset(table, 0, sizeof(table));
 984                 first = 0;
 985         }
 986
 987         if (codepages[from].iconv) {
 988                 struct conv_table *table2 = get_translation_table_to_utf8(34);
 989
 990                 if (table2) table2->iconv_cp = from;
 991                 return table2;
 992         }
 993
 994         if (/*from == to ||*/ from == -1 || to == -1)
 995                 return NULL;
 996         if (is_cp_ptr_utf8(&codepages[to])) {
 997                 struct conv_table *table2 = get_translation_table_to_utf8(from);
 998
 999                 if (table2) table2->iconv_cp = -1;
1000                 return table2;
1001         }
1002         if (from == lfr && to == lto)
1003                 return table;
1004         lfr = from;
1005         lto = to;
1006         new_translation_table(table);
1007
1008         if (is_cp_ptr_utf8(&codepages[from])) {
1009                 int i;
1010
1011                 /* Map U+00A0 and U+00AD the same way as u2cp() would.  */
1012                 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1013                 add_utf8(table, UCS_SOFT_HYPHEN, "");
1014
1015                 for (i = 0x80; i <= 0xFF; i++)
1016                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1017                                 add_utf8(table,
1018                                          codepages[to].highhalf[i - 0x80],
1019                                          strings[i]);
1020
1021                 for (i = 0; codepages[to].table[i].c; i++)
1022                         add_utf8(table, codepages[to].table[i].u,
1023                                  strings[codepages[to].table[i].c]);
1024
1025                 for (i = 0; unicode_7b[i].x != -1; i++)
1026                         if (unicode_7b[i].x >= 0x80)
1027                                 add_utf8(table, unicode_7b[i].x,
1028                                          unicode_7b[i].s);
1029
1030         } else {
1031                 int i;
1032
1033                 for (i = 128; i < 256; i++) {
1034                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1035                                 const unsigned char *u;
1036
1037                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1038                                 if (u) table[i].u.str = u;
1039                         }
1040                 }
1041         }
1042
1043         return table;
1044 }
1045
/* Compare the @l2 bytes at @s1 with the NUL-terminated string @s2.
 * Returns 1 or -1 at the first byte that differs, according to whether
 * the byte of @s1 is greater or smaller.  If all @l2 bytes match, the
 * result is 0 when @s2 ends there too and -1 when @s2 is longer.  */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
        for (; l2; l2--, s1++, s2++) {
                if (*s1 != *s2)
                        return (*s1 > *s2) ? 1 : -1;
        }

        return *s2 ? -1 : 0;
}
1059
/* Change the condition below to 1 to enable entity cache debugging output. */
1061 #if 0
1062 #define DEBUG_ENTITY_CACHE
1063 #else
1064 #undef DEBUG_ENTITY_CACHE
1065 #endif
1066
/* One remembered result of get_entity_string().  */
struct entity_cache {
        unsigned int hits;              /* how often this entry was used */
        int strlen;                     /* number of bytes used in @str */
        int encoding;                   /* codepage the result is for */
        const unsigned char *result;    /* the translation; may be NULL */
        unsigned char str[20]; /* Suffice in any case. */
};

/* Comparison function for qsort(): orders entity_cache entries by
 * descending hit count, so the most used entries come first.  */
static int
hits_cmp(const void *v1, const void *v2)
{
        const struct entity_cache *left = v1;
        const struct entity_cache *right = v2;

        if (left->hits > right->hits) return -1;
        return (left->hits < right->hits) ? 1 : 0;
}
1085
1086 static int
1087 compare_entities(const void *key_, const void *element_)
1088 {
1089         struct string *key = (struct string *) key_;
1090         struct entity *element = (struct entity *) element_;
1091         int length = key->length;
1092         unsigned char *first = key->source;
1093         unsigned char *second = element->s;
1094
1095         return xxstrcmp(first, second, length);
1096 }
1097
1098 const unsigned char *
1099 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1100 {
1101 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
1102 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1103                                    will go in [0] table */
1104         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1105         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1106         unsigned int slen = 0;
1107         const unsigned char *result = NULL;
1108
1109         /* Note that an object of static storage duration is automatically
1110          * initialised to zero in C.  */
1111
1112         if (strlen <= 0) return NULL;
1113
1114 #ifdef CONFIG_UTF8
1115         /* TODO: caching UTF-8 */
1116         encoding &= ~SYSTEM_CHARSET_FLAG;
1117         if (is_cp_ptr_utf8(&codepages[encoding]))
1118                 goto skip;
1119 #endif /* CONFIG_UTF8 */
1120
1121         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1122          * + google + slashdot + websites that result from a search for test on google,
1123          * + various ones) show quite impressive improvment:
1124          * Top ten is:
1125          * 0: hits=2459 l=4 st='nbsp'
1126          * 1: hits=2152 l=6 st='eacute'
1127          * 2: hits=235 l=6 st='egrave'
1128          * 3: hits=136 l=6 st='agrave'
1129          * 4: hits=100 l=3 st='amp'
1130          * 5: hits=40 l=5 st='laquo'
1131          * 6: hits=8 l=4 st='copy'
1132          * 7: hits=5 l=2 st='gt'
1133          * 8: hits=2 l=2 st='lt'
1134          * 9: hits=1 l=6 st='middot'
1135          *
1136          * Most of the time cache hit ratio is near 95%.
1137          *
1138          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1139          * count is kept < 2 (worst case 1.58). Not so bad ;)
1140          *
1141          * --Zas */
1142
1143         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1144         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1145
1146         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1147                 int i;
1148
1149                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1150                         if (entity_cache[slen][i].encoding == encoding
1151                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1152 #ifdef DEBUG_ENTITY_CACHE
1153                                 static double total_iter = 0;
1154                                 static unsigned long hit_count = 0;
1155
1156                                 total_iter += i + 1;
1157                                 hit_count++;
1158                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1159 #endif
1160                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1161                                         entity_cache[slen][i].hits++;
1162                                 return entity_cache[slen][i].result;
1163                         }
1164                 }
1165 #ifdef DEBUG_ENTITY_CACHE
1166                 fprintf(stderr, "miss\n");
1167 #endif
1168         }
1169 #ifdef CONFIG_UTF8
1170 skip:
1171 #endif /* CONFIG_UTF8 */
1172         if (*str == '#') { /* Numeric entity. */
1173                 int l = (int) strlen;
1174                 unsigned char *st = (unsigned char *) str;
1175                 unicode_val_T n = 0;
1176
1177                 if (l == 1) goto end; /* &#; ? */
1178                 st++, l--;
1179                 if ((*st | 32) == 'x') { /* Hexadecimal */
1180
1181                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1182                         st++, l--;
1183                         do {
1184                                 unsigned char c = (*(st++) | 32);
1185
1186                                 if (isdigit(c))
1187                                         n = (n << 4) | (c - '0');
1188                                 else if (isxdigit(c))
1189                                         n = (n << 4) | (c - 'a' + 10);
1190                                 else
1191                                         goto end; /* Bad char. */
1192                         } while (--l);
1193                 } else { /* Decimal */
1194                         if (l > 10) goto end; /* 4294967295 max. */
1195                         do {
1196                                 unsigned char c = *(st++);
1197
1198                                 if (isdigit(c))
1199                                         n = n * 10 + c - '0';
1200                                 else
1201                                         goto end; /* Bad char. */
1202                                 /* Limit to 0xFFFFFFFF. */
1203                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1204                                         goto end;
1205                         } while (--l);
1206                 }
1207
1208                 result = u2cp(n, encoding);
1209
1210 #ifdef DEBUG_ENTITY_CACHE
1211                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1212 #endif
1213         } else { /* Text entity. */
1214                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1215                 struct entity *element = bsearch((void *) &key, entities,
1216                                                  N_ENTITIES,
1217                                                  sizeof(*element),
1218                                                  compare_entities);
1219
1220                 if (element) result = u2cp(element->c, encoding);
1221         }
1222
1223 #ifdef CONFIG_UTF8
1224         if (is_cp_ptr_utf8(&codepages[encoding])) {
1225                 return result;
1226         }
1227 #endif /* CONFIG_UTF8 */
1228 end:
1229         /* Take care of potential buffer overflow. */
1230         if (strlen < sizeof(entity_cache[slen][0].str)) {
1231                 struct entity_cache *ece;
1232
1233                 /* Sort entries by hit order. */
1234                 if (nb_entity_cache[slen] > 1)
1235                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1236                               sizeof(entity_cache[slen][0]), hits_cmp);
1237
1238                 /* Increment number of cache entries if possible.
1239                  * Else, just replace the least used entry.  */
1240                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1241                 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1242
1243                 /* Copy new entry to cache. */
1244                 ece->hits = 1;
1245                 ece->strlen = strlen;
1246                 ece->encoding = encoding;
1247                 ece->result = result;
1248                 memcpy(ece->str, str, strlen);
1249                 ece->str[strlen] = '\0';
1250
1251
1252 #ifdef DEBUG_ENTITY_CACHE
1253                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1254                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1255
1256         {
1257                 unsigned int i;
1258
1259                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1260                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1261                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1262                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1263                                 entity_cache[slen][i].str);
1264                 fprintf(stderr, "-----------------\n");
1265         }
1266 #endif  /* DEBUG_ENTITY_CACHE */
1267         }
1268         return result;
1269 }
1270
1271 unsigned char *
1272 convert_string(struct conv_table *convert_table,
1273                unsigned char *chars2, int charslen2, int cp,
1274                enum convert_string_mode mode, int *length,
1275                void (*callback)(void *data, unsigned char *buf, int buflen),
1276                void *callback_data)
1277 {
1278         unsigned char *buffer;
1279         int bufferpos = 0;
1280         int charspos = 0;
1281         unsigned char *chars = chars2;
1282         int charslen = charslen2;
1283
1284 #ifdef HAVE_ICONV
1285         static char iconv_input[256];
1286         static char iconv_output[256 * 8];
1287         static size_t iconv_offset;
1288         static int iconv_cp;
1289         static size_t iconv_inleft;
1290         size_t iconv_outleft = 256 * 8;
1291         int loop = 0;
1292         int is_iconv = 0;
1293         int chars_offset = 0;
1294
1295         if (!convert_table && !memchr(chars, '&', charslen)) {
1296                 if (callback) {
1297                         if (charslen) callback(callback_data, chars, charslen);
1298                         return NULL;
1299                 } else {
1300                         return memacpy(chars, charslen);
1301                 }
1302         }
1303
1304         if (cp >= 0) {
1305                 if (convert_table && convert_table->iconv_cp > 0) {
1306                         is_iconv = 1;
1307                         cp = convert_table->iconv_cp;
1308                 } else {
1309                         is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1310                 }
1311         }
1312 #endif
1313
1314         /* Buffer allocation */
1315
1316         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1317         if (!buffer) return NULL;
1318
1319 #ifdef HAVE_ICONV
1320         if (is_iconv) {
1321                 int v;
1322                 size_t before, to_copy;
1323                 char *outp, *inp;
1324
1325                 if (iconv_cd >= 0) {
1326                         if (cp != iconv_cp) {
1327                                 iconv_close(iconv_cd);
1328                                 iconv_cd = (iconv_t)-1;
1329                         }
1330                 }
1331                 if (iconv_cd == (iconv_t)-1) {
1332                         iconv_offset = 0;
1333                         iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1334                         if (iconv_cd == (iconv_t)-1) {
1335                                 mem_free(buffer);
1336                                 return NULL;
1337                         }
1338                         iconv_cp = cp;
1339                 }
1340 repeat:
1341                 to_copy = charslen2 - chars_offset;
1342                 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1343                 memcpy(iconv_input + iconv_offset, chars + chars_offset, to_copy);
1344                 iconv_outleft = 256 * 8;
1345                 iconv_inleft = iconv_offset + to_copy;
1346                 inp = iconv_input;
1347                 outp = iconv_output;
1348                 before = iconv_inleft;
1349 again:
1350                 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1351                 chars_offset += before - iconv_inleft;
1352                 charslen = 256 * 8 - iconv_outleft;
1353
1354                 chars = (unsigned char *)iconv_output;
1355                 charspos = 0;
1356
1357                 if (v == -1) {
1358                         switch (errno) {
1359                         case EINVAL:
1360                                 memcpy(iconv_input, inp, iconv_inleft);
1361                                 iconv_offset = iconv_inleft;
1362                                 break;
1363                         case EILSEQ:
1364                                 chars_offset++;
1365                                 iconv_inleft--;
1366                                 inp++;
1367                                 goto again;
1368                                 break;
1369                         default:
1370                                 iconv_offset = 0;
1371                         }
1372                 } else {
1373                         iconv_offset = 0;
1374                 }
1375
1376                 loop = chars_offset < charslen2;
1377         }
1378 #endif
1379         /* Iterate ;-) */
1380
1381         while (charspos < charslen) {
1382                 const unsigned char *translit;
1383
1384 #define PUTC do { \
1385                 buffer[bufferpos++] = chars[charspos++]; \
1386                 translit = ""; \
1387                 goto flush; \
1388         } while (0)
1389
1390                 if (chars[charspos] != '&') {
1391                         struct conv_table *t;
1392                         int i;
1393
1394                         if (chars[charspos] < 128 || !convert_table) PUTC;
1395
1396                         t = convert_table;
1397                         i = charspos;
1398
1399                         while (t[chars[i]].t) {
1400                                 t = t[chars[i++]].u.tbl;
1401                                 if (i >= charslen) PUTC;
1402                         }
1403
1404                         translit = t[chars[i]].u.str;
1405                         charspos = i + 1;
1406
1407                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1408                         PUTC;
1409
1410                 } else {
1411                         int start = charspos + 1;
1412                         int i = start;
1413
1414                         while (i < charslen
1415                                && (isasciialpha(chars[i])
1416                                    || isdigit(chars[i])
1417                                    || (chars[i] == '#')))
1418                                 i++;
1419
1420                         /* This prevents bug 213: we were expanding "entities"
1421                          * in URL query strings. */
1422                         /* XXX: But this disables &nbsp&nbsp usage, which
1423                          * appears to be relatively common! --pasky */
1424                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1425                             && i > start
1426                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1427                                 translit = get_entity_string(&chars[start], i - start,
1428                                                       cp);
1429                                 if (chars[i] != ';') {
1430                                         /* Eat &nbsp &nbsp<foo> happily, but
1431                                          * pull back from the character after
1432                                          * entity string if it is not the valid
1433                                          * terminator. */
1434                                         i--;
1435                                 }
1436
1437                                 if (!translit) PUTC;
1438                                 charspos = i + (i < charslen);
1439                         } else PUTC;
1440                 }
1441
1442                 if (!translit[0]) continue;
1443
1444                 if (!translit[1]) {
1445                         buffer[bufferpos++] = translit[0];
1446                         translit = "";
1447                         goto flush;
1448                 }
1449
1450                 while (*translit) {
1451                         unsigned char *new;
1452
1453                         buffer[bufferpos++] = *(translit++);
1454 flush:
1455                         if (bufferpos & (ALLOC_GR - 1)) continue;
1456
1457                         if (callback) {
1458                                 buffer[bufferpos] = 0;
1459                                 callback(callback_data, buffer, bufferpos);
1460                                 bufferpos = 0;
1461                         } else {
1462                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1463                                 if (!new) {
1464                                         mem_free(buffer);
1465                                         return NULL;
1466                                 }
1467                                 buffer = new;
1468                         }
1469                 }
1470 #undef PUTC
1471         }
1472
1473 #ifdef HAVE_ICONV
1474         if (loop) goto repeat;
1475 #endif
1476         /* Say bye */
1477
1478         buffer[bufferpos] = 0;
1479         if (length) *length = bufferpos;
1480
1481         if (callback) {
1482                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1483                 mem_free(buffer);
1484                 return NULL;
1485         } else {
1486                 return buffer;
1487         }
1488 }
1489
1490
1491 #ifndef USE_FASTFIND
1492 int
1493 get_cp_index(const unsigned char *name)
1494 {
1495         int i, a;
1496         int syscp = 0;
1497
1498         if (!c_strcasecmp(name, "System")) {
1499 #if HAVE_LANGINFO_CODESET
1500                 name = nl_langinfo(CODESET);
1501                 syscp = SYSTEM_CHARSET_FLAG;
1502 #else
1503                 name = "us-ascii";
1504 #endif
1505         }
1506
1507         for (i = 0; codepages[i].name; i++) {
1508                 for (a = 0; codepages[i].aliases[a]; a++) {
1509                         /* In the past, we looked for the longest substring
1510                          * in all the names; it is way too expensive, though:
1511                          *
1512                          *   %   cumulative   self              self     total
1513                          *  time   seconds   seconds    calls  us/call  us/call  name
1514                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1515                          *
1516                          * Anything called from redraw_screen() is in fact
1517                          * relatively expensive, even if it's called just
1518                          * once. So we will do a simple strcasecmp() here.
1519                          */
1520
1521                         if (!c_strcasecmp(name, codepages[i].aliases[a]))
1522                                 return i | syscp;
1523                 }
1524         }
1525
1526         if (syscp) {
1527                 return get_cp_index("us-ascii") | syscp;
1528         } else {
1529                 return -1;
1530         }
1531 }
1532
1533 #else
1534
/* Position of charsets_list_next() in codepages[]: the codepage, and
 * the alias within that codepage.  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the internal list position to the first alias of the first
 * codepage.  */
void
charsets_list_reset(void)
{
        i_name = i_alias = 0;
}
1545
1546 /* Returns a pointer to a struct that contains current key and data pointers
1547  * and increment internal pointer.  It returns NULL when key is NULL. */
1548 struct fastfind_key_value *
1549 charsets_list_next(void)
1550 {
1551         static struct fastfind_key_value kv;
1552
1553         if (!codepages[i_name].name) return NULL;
1554
1555         kv.key = codepages[i_name].aliases[i_alias];
1556         kv.data = (void *) &codepages[i_name]; /* cast away const */
1557
1558         if (codepages[i_name].aliases[i_alias + 1])
1559                 i_alias++;
1560         else {
1561                 i_name++;
1562                 i_alias = 0;
1563         }
1564
1565         return &kv;
1566 }
1567
1568 static struct fastfind_index ff_charsets_index
1569         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1570
1571 /* It searchs for a charset named @name or one of its aliases and
1572  * returns index for it or -1 if not found. */
1573 int
1574 get_cp_index(const unsigned char *name)
1575 {
1576         const struct codepage_desc *codepage;
1577         int syscp = 0;
1578
1579         if (!c_strcasecmp(name, "System")) {
1580 #if HAVE_LANGINFO_CODESET
1581                 name = nl_langinfo(CODESET);
1582                 syscp = SYSTEM_CHARSET_FLAG;
1583 #else
1584                 name = "us-ascii";
1585 #endif
1586         }
1587
1588         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1589         if (codepage) {
1590                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1591                 return (codepage - codepages) | syscp;
1592
1593         } else if (syscp) {
1594                 return get_cp_index("us-ascii") | syscp;
1595
1596         } else {
1597                 return -1;
1598         }
1599 }
1600
1601 #endif /* USE_FASTFIND */
1602
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without fastfind.  */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
1610
/* Release the fastfind index used by get_cp_index().  Does nothing
 * when ELinks is built without fastfind.  */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif
}
1618
1619 /* Get the codepage's name for displaying to the user, or NULL if
1620  * @cp_index is one past the end.  In the future, we might want to
1621  * localize these with gettext.  So it may be best not to use this
1622  * function if the name will have to be converted back to an
1623  * index.  */
1624 unsigned char *
1625 get_cp_name(int cp_index)
1626 {
1627         if (cp_index < 0) return "none";
1628         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1629
1630         return codepages[cp_index].name;
1631 }
1632
1633 /* Get the codepage's name for saving to a configuration file.  These
1634  * names can be converted back to indexes, even in future versions of
1635  * ELinks.  */
1636 unsigned char *
1637 get_cp_config_name(int cp_index)
1638 {
1639         if (cp_index < 0) return "none";
1640         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1641         if (!codepages[cp_index].aliases) return NULL;
1642
1643         return codepages[cp_index].aliases[0];
1644 }
1645
1646 /* Get the codepage's name for sending to a library or server that
1647  * understands MIME charset names.  This function irreversibly maps
1648  * the "System" codepage to the underlying charset.  */
1649 unsigned char *
1650 get_cp_mime_name(int cp_index)
1651 {
1652         if (cp_index < 0) return "none";
1653         cp_index &= ~SYSTEM_CHARSET_FLAG;
1654         if (!codepages[cp_index].aliases) return NULL;
1655
1656         return codepages[cp_index].aliases[0];
1657 }
1658
1659 int
1660 is_cp_utf8(int cp_index)
1661 {
1662         cp_index &= ~SYSTEM_CHARSET_FLAG;
1663         return is_cp_ptr_utf8(&codepages[cp_index]);
1664 }
1665
1666 /* This function will be used by the xhtml parser. */
1667 const uint16_t *
1668 get_cp_highhalf(const unsigned char *name)
1669 {
1670         int cp = get_cp_index(name);
1671
1672         if (cp < 0) return NULL;
1673         cp &= ~SYSTEM_CHARSET_FLAG;
1674         return codepages[cp].highhalf;
1675 }