src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #ifdef HAVE_ICONV
  22 #include <errno.h>
  23 #include <iconv.h>
  24 #endif
  25
  26 #include "elinks.h"
  27
  28 #include "document/options.h"
  29 #include "intl/charsets.h"
  30 #include "util/conv.h"
  31 #include "util/error.h"
  32 #include "util/fastfind.h"
  33 #include "util/hash.h"
  34 #include "util/memory.h"
  35 #include "util/string.h"
  36
  37
/* Fix namespace clash on MacOS.  From here on the preprocessor renames
 * every use of the identifier `table' in this file (for instance the
 * @table member of struct codepage_desc) to `table_elinks'.  */
#define table table_elinks
  40
/* One additional (codepage byte, Unicode character) pair.  u2cp_()
 * walks arrays of these until it reaches an entry whose @c is 0.  */
struct table_entry {
	/* The codepage byte; 0 marks the end of the array.  */
	unsigned char c;
	/* This should in principle be unicode_val_T, but because all
	 * the values currently in codepage.inc fit in 16 bits, we can
	 * as well use uint16_t and halve sizeof(struct table_entry)
	 * from 8 bytes to 4.  Should other characters ever be needed,
	 * unicode_val_T u : 24 might be a possibility, although it
	 * seems a little unportable as bitfields are in principle
	 * restricted to int, which may be 16-bit.  */
	uint16_t u;
};
  52
/* Description of one codepage.  The elements of codepages[] that the
 * conversion functions in this file index are of this type.  */
struct codepage_desc {
	/* NOTE(review): presumably the primary name of the codepage;
	 * it is not used in the part of the file reviewed here.  */
	unsigned char *name;
	/* Alias list.  is_cp_ptr_utf8() recognises the UTF-8 codepage
	 * by comparing this pointer with aliases_utf8.  */
	unsigned char *const *aliases;

	/* The Unicode mappings of codepage bytes 0x80...0xFF.
	 * (0x00...0x7F are assumed to be ASCII in all codepages.)
	 * Because all current values fit in 16 bits, we store them as
	 * uint16_t rather than unicode_val_T.  If the codepage does
	 * not use some byte, then @highhalf maps that byte to 0xFFFF,
	 * which C code converts to UCS_REPLACEMENT_CHARACTER where
	 * appropriate.  (U+FFFF is reserved and will never be
	 * assigned as a character.)  */
	const uint16_t *highhalf;

	/* If some byte in the codepage corresponds to multiple Unicode
	 * characters, then the preferred character is in @highhalf
	 * above, and the rest are listed here in @table.  This table
	 * is not used for translating from the codepage to Unicode.  */
	const struct table_entry *table;

	/* Whether to use iconv for translation.  */
	unsigned int iconv:1;
};
  76
  77 #include "intl/codepage.inc"
  78 #include "intl/uni_7b.inc"
  79 #include "intl/entity.inc"
  80
/* Declare the external-linkage inline functions defined in this file.
 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
 * called.  The functions are not declared inline in charsets.h
 * because C99 6.7.4p6 says that every external-linkage function
 * declared inline shall be defined in the same translation unit.
 * The non-inline declarations in charsets.h also make sure that the
 * compiler emits global definitions for the symbols so that the
 * functions can be called from other translation units.
 *
 * Note that utf8charlen() and unicode_to_cell() are defined only when
 * CONFIG_UTF8 is set, whereas encode_utf8() and utf8_to_unicode() are
 * defined unconditionally.  */
NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
					       const unsigned char *end);
  94
  95 static const char strings[256][2] = {
  96         "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
  97         "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
  98         "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
  99         "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
 100         "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
 101         "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
 102         "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
 103         "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
 104         "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
 105         "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
 106         "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
 107         "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
 108         "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
 109         "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
 110         "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
 111         "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
 112         "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
 113         "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
 114         "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
 115         "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
 116         "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
 117         "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
 118         "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
 119         "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
 120         "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
 121         "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
 122         "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
 123         "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
 124         "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
 125         "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
 126         "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
 127         "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
 128 };
 129
#ifdef HAVE_ICONV
/* iconv conversion descriptor, present only when HAVE_ICONV is
 * defined.  It starts out as (iconv_t)-1, which is the value
 * iconv_open() returns on failure, so it does not denote an open
 * descriptor.  NOTE(review): the code that opens, uses and closes it
 * is outside the part of the file reviewed here -- confirm there.  */
static iconv_t iconv_cd = (iconv_t)-1;
#endif
 133
 134 static void
 135 free_translation_table(struct conv_table *p)
 136 {
 137         int i;
 138
 139         for (i = 0; i < 256; i++)
 140                 if (p[i].t)
 141                         free_translation_table(p[i].u.tbl);
 142
 143         mem_free(p);
 144 }
 145
 146 /* A string used in conversion tables when there is no correct
 147  * conversion.  This is compared by address and therefore should be a
 148  * named array rather than a pointer so that it won't share storage
 149  * with any other string literal that happens to have the same
 150  * characters.  */
 151 static const unsigned char no_str[] = "*";
 152
 153 static void
 154 new_translation_table(struct conv_table *p)
 155 {
 156         int i;
 157
 158         for (i = 0; i < 256; i++)
 159                 if (p[i].t)
 160                         free_translation_table(p[i].u.tbl);
 161         for (i = 0; i < 128; i++) {
 162                 p[i].t = 0;
 163                 p[i].u.str = strings[i];
 164         }
 165         for (; i < 256; i++) {
 166                 p[i].t = 0;
 167                 p[i].u.str = no_str;
 168         }
 169         p->iconv_cp = -1;
 170 }
 171
 172 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 173 {                                                                                       \
 174         long _s = 0, _e = (entries) - 1;                                                \
 175                                                                                         \
 176         while (_s <= _e || !((result) = -1)) {                                          \
 177                 long _m = (_s + _e) / 2;                                                \
 178                                                                                         \
 179                 if ((table)[_m].entry == (key)) {                                       \
 180                         (result) = _m;                                                  \
 181                         break;                                                          \
 182                 }                                                                       \
 183                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 184                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 185         }                                                                               \
 186 }                                                                                       \
 187
 188 static const unicode_val_T strange_chars[32] = {
 189 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
 190 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
 191 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
 192 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
 193 };
 194
/* Bit that may be set in a codepage index.  u2cp_(), cp2u() and
 * cp2utf8() clear it before using the index with codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if @cp_ptr (a pointer to a struct codepage_desc) describes
 * UTF-8, which is recognised by its alias list being aliases_utf8.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 197
 198 const unsigned char *
 199 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 200 {
 201         int j;
 202         int s;
 203
 204         if (u < 128) return strings[u];
 205
 206         if (u < 0xa0) {
 207                 u = strange_chars[u - 0x80];
 208                 if (!u) return NULL;
 209         }
 210
 211         to &= ~SYSTEM_CHARSET_FLAG;
 212
 213         if (is_cp_ptr_utf8(&codepages[to]))
 214                 return encode_utf8(u);
 215
 216         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 217          * special char NBSP_CHAR. */
 218         if (u == UCS_NO_BREAK_SPACE) {
 219                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 220                 else /* NBSP_MODE_ASCII */ return " ";
 221         }
 222         if (u == UCS_SOFT_HYPHEN) return "";
 223
 224         if (u < 0xFFFF)
 225                 for (j = 0; j < 0x80; j++)
 226                         if (codepages[to].highhalf[j] == u)
 227                                 return strings[0x80 + j];
 228         for (j = 0; codepages[to].table[j].c; j++)
 229                 if (codepages[to].table[j].u == u)
 230                         return strings[codepages[to].table[j].c];
 231
 232         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 233         if (s != -1) return unicode_7b[s].s;
 234
 235         return no_str;
 236 }
 237
 238 static unsigned char utf_buffer[7];
 239
 240 NONSTATIC_INLINE unsigned char *
 241 encode_utf8(unicode_val_T u)
 242 {
 243         memset(utf_buffer, 0, 7);
 244
 245         if (u < 0x80)
 246                 utf_buffer[0] = u;
 247         else if (u < 0x800)
 248                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 249                 utf_buffer[1] = 0x80 | (u & 0x3f);
 250         else if (u < 0x10000)
 251                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 252                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 253                 utf_buffer[2] = 0x80 | (u & 0x3f);
 254         else if (u < 0x200000)
 255                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 256                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 257                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 258                 utf_buffer[3] = 0x80 | (u & 0x3f);
 259         else if (u < 0x4000000)
 260                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 261                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 262                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 263                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 264                 utf_buffer[4] = 0x80 | (u & 0x3f);
 265         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 266                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 267                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 268                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 269                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 270                 utf_buffer[5] = 0x80 | (u & 0x3f);
 271
 272         return utf_buffer;
 273 }
 274
 275 /* Number of bytes utf8 character indexed by first byte. Illegal bytes are
 276  * equal ones and handled different. */
 277 static const char utf8char_len_tab[256] = {
 278         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 279         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 280         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 281         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 282         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 283         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 284         2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
 285         3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
 286 };
 287
 288 #ifdef CONFIG_UTF8
 289 NONSTATIC_INLINE int
 290 utf8charlen(const unsigned char *p)
 291 {
 292         return p ? utf8char_len_tab[*p] : 0;
 293 }
 294
 295 int
 296 strlen_utf8(unsigned char **str)
 297 {
 298         unsigned char *s = *str;
 299         unsigned char *end = strchr((const char *)s, '\0');
 300         int x;
 301         int len;
 302
 303         for (x = 0;; x++, s += len) {
 304                 len = utf8charlen(s);
 305                 if (s + len > end) break;
 306         }
 307         *str = s;
 308         return x;
 309 }
 310
/* True if byte @p has bit 7 clear, i.e. is a one-byte (ASCII)
 * character.  */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can begin a character: it is either a one-byte
 * character or has its two top bits set.  Continuation bytes
 * (10xxxxxx) yield false.  */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 313
 314 /* Start from @current and move back to @pos char. This pointer return. The
 315  * most left pointer is @start. */
 316 unsigned char *
 317 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 318 {
 319         if (current == NULL || start == NULL || pos < 0)
 320                 return NULL;
 321         while (pos > 0 && current != start) {
 322                 current--;
 323                 if (utf8_islead(*current))
 324                         pos--;
 325         }
 326         return current;
 327 }
 328
 329 /* Count number of standard terminal cells needed for displaying UTF-8
 330  * character. */
 331 int
 332 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 333 {
 334         unicode_val_T u;
 335
 336         if (end == NULL)
 337                 end = strchr((const char *)utf8_char, '\0');
 338
 339         if(!utf8_char || !end)
 340                 return -1;
 341
 342         u = utf8_to_unicode(&utf8_char, end);
 343
 344         return unicode_to_cell(u);
 345 }
 346
 347 /* Count number of standard terminal cells needed for displaying string
 348  * with UTF-8 characters. */
 349 int
 350 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 351 {
 352         int charlen, cell, cells = 0;
 353
 354         if (end == NULL)
 355                 end = strchr((const char *)string, '\0');
 356
 357         if(!string || !end)
 358                 return -1;
 359
 360         do {
 361                 charlen = utf8charlen(string);
 362                 if (string + charlen > end)
 363                         break;
 364
 365                 cell = utf8_char2cells(string, end);
 366                 if  (cell < 0)
 367                         return -1;
 368
 369                 cells += cell;
 370                 string += charlen;
 371         } while (1);
 372
 373         return cells;
 374 }
 375
 376 /* Count number of characters in string. */
 377 int
 378 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 379 {
 380         int charlen, chars = 0;
 381
 382         if (end == NULL)
 383                 end = strchr((const char *)string, '\0');
 384
 385         if(!string || !end)
 386                 return -1;
 387
 388         do {
 389                 charlen = utf8charlen(string);
 390                 if (string + charlen > end)
 391                         break;
 392
 393                 chars++;
 394                 string += charlen;
 395         } while (1);
 396
 397         return chars;
 398 }
 399
 400 /*
 401  * Count number of bytes from begining of the string needed for displaying
 402  * specified number of cells.
 403  */
 404 int
 405 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 406 {
 407         unsigned int bytes = 0, cells = 0;
 408
 409         assert(max_cells>=0);
 410
 411         if (end == NULL)
 412                 end = strchr((const char *)string, '\0');
 413
 414         if(!string || !end)
 415                 return -1;
 416
 417         do {
 418                 int cell = utf8_char2cells(&string[bytes], end);
 419                 if (cell < 0)
 420                         return -1;
 421
 422                 cells += cell;
 423                 if (cells > max_cells)
 424                         break;
 425
 426                 bytes += utf8charlen(&string[bytes]);
 427
 428                 if (string + bytes > end) {
 429                         bytes = end - string;
 430                         break;
 431                 }
 432         } while(1);
 433
 434         return bytes;
 435 }
 436
 437 /* Take @max steps forward from @string in the specified @way, but
 438  * not going past @end.  Return the resulting address.  Store the
 439  * number of steps taken to *@count, unless @count is NULL.
 440  *
 441  * This assumes the text is valid UTF-8, and @string and @end point to
 442  * character boundaries.  If not, it doesn't crash but the results may
 443  * be inconsistent.
 444  *
 445  * This function can do some of the same jobs as utf8charlen(),
 446  * utf8_cells2bytes(), and strlen_utf8().  */
 447 unsigned char *
 448 utf8_step_forward(unsigned char *string, unsigned char *end,
 449                   int max, enum utf8_step way, int *count)
 450 {
 451         int steps = 0;
 452         unsigned char *current = string;
 453
 454         assert(string);
 455         assert(max >= 0);
 456         if_assert_failed goto invalid_arg;
 457         if (end == NULL)
 458                 end = strchr((const char *)string, '\0');
 459
 460         switch (way) {
 461         case UTF8_STEP_CHARACTERS:
 462                 while (steps < max && current < end) {
 463                         ++current;
 464                         if (utf8_islead(*current))
 465                                 ++steps;
 466                 }
 467                 break;
 468
 469         case UTF8_STEP_CELLS_FEWER:
 470         case UTF8_STEP_CELLS_MORE:
 471                 while (steps < max && current < end) {
 472                         unicode_val_T u;
 473                         unsigned char *prev = current;
 474                         int width;
 475
 476                         u = utf8_to_unicode(&current, end);
 477                         if (u == UCS_NO_CHAR) {
 478                                 /* Assume the incomplete sequence
 479                                  * costs one cell.  */
 480                                 current = end;
 481                                 ++steps;
 482                                 break;
 483                         }
 484
 485                         width = unicode_to_cell(u);
 486                         if (way == UTF8_STEP_CELLS_FEWER
 487                             && steps + width > max) {
 488                                 /* Back off.  */
 489                                 current = prev;
 490                                 break;
 491                         }
 492                         steps += width;
 493                 }
 494                 break;
 495
 496         default:
 497                 INTERNAL("impossible enum utf8_step");
 498         }
 499
 500 invalid_arg:
 501         if (count)
 502                 *count = steps;
 503         return current;
 504 }
 505
 506 /* Take @max steps backward from @string in the specified @way, but
 507  * not going past @start.  Return the resulting address.  Store the
 508  * number of steps taken to *@count, unless @count is NULL.
 509  *
 510  * This assumes the text is valid UTF-8, and @string and @start point
 511  * to character boundaries.  If not, it doesn't crash but the results
 512  * may be inconsistent.
 513  *
 514  * This function can do some of the same jobs as utf8_prevchar().  */
 515 unsigned char *
 516 utf8_step_backward(unsigned char *string, unsigned char *start,
 517                    int max, enum utf8_step way, int *count)
 518 {
 519         int steps = 0;
 520         unsigned char *current = string;
 521
 522         assert(string);
 523         assert(start);
 524         assert(max >= 0);
 525         if_assert_failed goto invalid_arg;
 526
 527         switch (way) {
 528         case UTF8_STEP_CHARACTERS:
 529                 while (steps < max && current > start) {
 530                         --current;
 531                         if (utf8_islead(*current))
 532                                 ++steps;
 533                 }
 534                 break;
 535
 536         case UTF8_STEP_CELLS_FEWER:
 537         case UTF8_STEP_CELLS_MORE:
 538                 while (steps < max) {
 539                         unsigned char *prev = current;
 540                         unsigned char *look;
 541                         unicode_val_T u;
 542                         int width;
 543
 544                         if (current <= start)
 545                                 break;
 546                         do {
 547                                 --current;
 548                         } while (current > start && !utf8_islead(*current));
 549
 550                         look = current;
 551                         u = utf8_to_unicode(&look, prev);
 552                         if (u == UCS_NO_CHAR) {
 553                                 /* Assume the incomplete sequence
 554                                  * costs one cell.  */
 555                                 width = 1;
 556                         } else
 557                                 width = unicode_to_cell(u);
 558
 559                         if (way == UTF8_STEP_CELLS_FEWER
 560                             && steps + width > max) {
 561                                 /* Back off.  */
 562                                 current = prev;
 563                                 break;
 564                         }
 565                         steps += width;
 566                 }
 567                 break;
 568
 569         default:
 570                 INTERNAL("impossible enum utf8_step");
 571         }
 572
 573 invalid_arg:
 574         if (count)
 575                 *count = steps;
 576         return current;
 577 }
 578
 579 /*
 580  * Find out number of standard terminal collumns needed for displaying symbol
 581  * (glyph) which represents Unicode character c.
 582  *
 583  * TODO: Use wcwidth when it is available. This seems to require:
 584  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 585  * - Define _XOPEN_SOURCE and include <wchar.h>.
 586  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 587  *   matches ISO 10646 in all locales.)
 588  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 589  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 590  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 591  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 592  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 593  * character is apparently not supported in all locales.  Why is that?
 594  * - Perhaps there is standardese that requires supported characters
 595  *   to be convertable to multibyte form.  Then ELinks could just pick
 596  *   some UTF-8 locale for its wcwidth purposes.
 597  * - Perhaps wcwidth can even return different nonnegative values for
 598  *   the same ISO 10646 character in different locales.  Then ELinks
 599  *   would have to set LC_CTYPE to match at least the terminal's
 600  *   charset (which may differ from the LC_CTYPE environment variable,
 601  *   especially when the master process is serving a slave terminal).
 602  *   But there is no guarantee that the libc supports all the same
 603  *   charsets as ELinks does.
 604  * For now, it seems safest to avoid the potentially locale-dependent
 605  * libc version of wcwidth, and instead use a hardcoded mapping.
 606  *
 607  * @return      2 for double-width glyph, 1 for others.
 608  *              0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
 609  */
 610 NONSTATIC_INLINE int
 611 unicode_to_cell(unicode_val_T c)
 612 {
 613         if (c == 0x200e || c == 0x200f)
 614                 return 0;
 615         if (c >= 0x1100
 616                 && (c <= 0x115f                 /* Hangul Jamo */
 617                 || c == 0x2329
 618                 || c == 0x232a
 619                 || (c >= 0x2e80 && c <= 0xa4cf
 620                         && c != 0x303f)         /* CJK ... Yi */
 621                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 622                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 623                                                                 Ideographs */
 624                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 625                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 626                 || (c >= 0xffe0 && c <= 0xffe6)
 627                 || (c >= 0x20000 && c <= 0x2fffd)
 628                 || (c >= 0x30000 && c <= 0x3fffd)))
 629                 return 2;
 630
 631         return 1;
 632 }
 633
 634 /* Fold the case of a Unicode character, so that hotkeys in labels can
 635  * be compared case-insensitively.  It is unspecified whether the
 636  * result will be in upper or lower case.  */
 637 unicode_val_T
 638 unicode_fold_label_case(unicode_val_T c)
 639 {
 640 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 641         return towlower(c);
 642 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 643         /* For now, this supports only ASCII.  It would be possible to
 644          * use code generated from CaseFolding.txt of Unicode if the
 645          * acknowledgements required by http://www.unicode.org/copyright.html
 646          * were added to associated documentation of ELinks.  */
 647         if (c >= 0x41 && c <= 0x5A)
 648                 return c + 0x20;
 649         else
 650                 return c;
 651 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 652 }
 653 #endif /* CONFIG_UTF8 */
 654
 655 NONSTATIC_INLINE unicode_val_T
 656 utf8_to_unicode(unsigned char **string, const unsigned char *end)
 657 {
 658         unsigned char *str = *string;
 659         unicode_val_T u;
 660         int length;
 661
 662         length = utf8char_len_tab[str[0]];
 663
 664         if (str + length > end) {
 665                 return UCS_NO_CHAR;
 666         }
 667
 668         switch (length) {
 669                 case 1:         /* U+0000 to U+007F */
 670                         if (str[0] >= 0x80) {
 671 invalid_utf8:
 672                                 ++*string;
 673                                 return UCS_REPLACEMENT_CHARACTER;
 674                         }
 675                         u = str[0];
 676                         break;
 677                 case 2:         /* U+0080 to U+07FF */
 678                         if ((str[1] & 0xc0) != 0x80)
 679                                 goto invalid_utf8;
 680                         u = (str[0] & 0x1f) << 6;
 681                         u += (str[1] & 0x3f);
 682                         if (u < 0x80)
 683                                 goto invalid_utf8;
 684                         break;
 685                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 686                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 687                                 goto invalid_utf8;
 688                         u = (str[0] & 0x0f) << 12;
 689                         u += ((str[1] & 0x3f) << 6);
 690                         u += (str[2] & 0x3f);
 691                         if (u < 0x800 || is_utf16_surrogate(u))
 692                                 goto invalid_utf8;
 693                         break;
 694                 case 4:         /* U+10000 to U+1FFFFF */
 695                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 696                             || (str[3] & 0xc0) != 0x80)
 697                                 goto invalid_utf8;
 698                         u = (str[0] & 0x0f) << 18;
 699                         u += ((str[1] & 0x3f) << 12);
 700                         u += ((str[2] & 0x3f) << 6);
 701                         u += (str[3] & 0x3f);
 702                         if (u < 0x10000)
 703                                 goto invalid_utf8;
 704                         break;
 705                 case 5:         /* U+200000 to U+3FFFFFF */
 706                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 707                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 708                                 goto invalid_utf8;
 709                         u = (str[0] & 0x0f) << 24;
 710                         u += ((str[1] & 0x3f) << 18);
 711                         u += ((str[2] & 0x3f) << 12);
 712                         u += ((str[3] & 0x3f) << 6);
 713                         u += (str[4] & 0x3f);
 714                         if (u < 0x200000)
 715                                 goto invalid_utf8;
 716                         break;
 717                 case 6:         /* U+4000000 to U+7FFFFFFF */
 718                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 719                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 720                             || (str[5] & 0xc0) != 0x80)
 721                                 goto invalid_utf8;
 722                         u = (str[0] & 0x01) << 30;
 723                         u += ((str[1] & 0x3f) << 24);
 724                         u += ((str[2] & 0x3f) << 18);
 725                         u += ((str[3] & 0x3f) << 12);
 726                         u += ((str[4] & 0x3f) << 6);
 727                         u += (str[5] & 0x3f);
 728                         if (u < 0x4000000)
 729                                 goto invalid_utf8;
 730                         break;
 731                 default:
 732                         INTERNAL("utf8char_len_tab out of range");
 733                         goto invalid_utf8;
 734         }
 735         *string = str + length;
 736         return u;
 737 }
 738
 739 /* The common part of cp2u and cp2utf_8.  */
 740 static unicode_val_T
 741 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 742 {
 743         unicode_val_T u = from->highhalf[c - 0x80];
 744
 745         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 746         return u;
 747 }
 748
 749 /* Used for converting input from the terminal.  */
 750 unicode_val_T
 751 cp2u(int from, unsigned char c)
 752 {
 753         from &= ~SYSTEM_CHARSET_FLAG;
 754
 755         /* UTF-8 is a multibyte codepage and cannot be handled with
 756          * this function.  */
 757         assert(!is_cp_ptr_utf8(&codepages[from]));
 758         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 759
 760         if (c < 0x80) return c;
 761         else return cp2u_shared(&codepages[from], c);
 762 }
 763
 764 /* This slow and ugly code is used by the terminal utf_8_io */
 765 const unsigned char *
 766 cp2utf8(int from, int c)
 767 {
 768         from &= ~SYSTEM_CHARSET_FLAG;
 769
 770         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 771                 return strings[c];
 772
 773         return encode_utf8(cp2u_shared(&codepages[from], c));
 774 }
 775
 776 unicode_val_T
 777 cp_to_unicode(int codepage, unsigned char **string, const unsigned char *end)
 778 {
 779         unicode_val_T ret;
 780
 781         if (is_cp_utf8(codepage))
 782                 return utf8_to_unicode(string, end);
 783
 784         if (*string >= end)
 785                 return UCS_NO_CHAR;
 786
 787         ret = cp2u(codepage, **string);
 788         ++*string;
 789         return ret;
 790 }
 791
 792
 793 #ifdef CONFIG_COMBINE
 794 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
 795 unicode_val_T **combined;
 796 struct hash *combined_hash;
 797
 798 unicode_val_T
 799 get_combined(unicode_val_T *data, int length)
 800 {
 801         struct hash_item *item;
 802         unicode_val_T *key;
 803         int i, indeks;
 804
 805         assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
 806         if_assert_failed return UCS_NO_CHAR;
 807
 808         if (!combined_hash) combined_hash = init_hash8();
 809         if (!combined_hash) return UCS_NO_CHAR;
 810         item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
 811
 812         if (item) return (unicode_val_T)(long)item->value;
 813         if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
 814
 815         key = mem_alloc((length + 1) * sizeof(*key));
 816         if (!key) return UCS_NO_CHAR;
 817         for (i = 0; i < length; i++)
 818                 key[i] = data[i];
 819         key[i] = UCS_END_COMBINED;
 820
 821         last_combined++;
 822         indeks = last_combined - UCS_BEGIN_COMBINED;
 823
 824         combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
 825         if (!combined) {
 826                 mem_free(key);
 827                 last_combined--;
 828                 return UCS_NO_CHAR;
 829         }
 830         combined[indeks] = key;
 831         item = add_hash_item(combined_hash, (unsigned char *)key,
 832                              length * sizeof(*data), (void *)(long)(last_combined));
 833         if (!item) {
 834                 last_combined--;
 835                 mem_free(key);
 836                 return UCS_NO_CHAR;
 837         }
 838         return last_combined;
 839 }
 840
 841 void
 842 free_combined()
 843 {
 844         int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
 845
 846         if (combined_hash)
 847                 free_hash(&combined_hash);
 848         for (i = 0; i < end; i++)
 849                 mem_free(combined[i]);
 850         mem_free_if(combined);
 851 }
 852 #endif /* CONFIG_COMBINE */
 853
 854
 855 static void
 856 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 857 {
 858         unsigned char *p = encode_utf8(u);
 859
 860         while (p[1]) {
 861                 if (ct[*p].t) ct = ct[*p].u.tbl;
 862                 else {
 863                         struct conv_table *nct;
 864
 865                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 866                         if_assert_failed return;
 867
 868                         nct = mem_calloc(256, sizeof(*nct));
 869                         if (!nct) return;
 870                         new_translation_table(nct);
 871                         ct[*p].t = 1;
 872                         ct[*p].u.tbl = nct;
 873                         ct = nct;
 874                 }
 875                 p++;
 876         }
 877
 878         assertm(!ct[*p].t, "bad utf encoding #2");
 879         if_assert_failed return;
 880
 881         if (ct[*p].u.str == no_str)
 882                 ct[*p].u.str = str;
 883 }
 884
 885 /* A conversion table from some charset to UTF-8.
 886  * If it is from UTF-8 to UTF-8, it converts each byte separately.
 887  * Unlike in other translation tables, the strings in elements 0x80 to
 888  * 0xFF are allocated dynamically.  */
 889 struct conv_table utf_table[256];
 890 int utf_table_init = 1;
 891
 892 static void
 893 free_utf_table(void)
 894 {
 895         int i;
 896
 897         /* Cast away const.  */
 898         for (i = 128; i < 256; i++)
 899                 mem_free((unsigned char *) utf_table[i].u.str);
 900 }
 901
 902 static struct conv_table *
 903 get_translation_table_to_utf8(int from)
 904 {
 905         int i;
 906         static int lfr = -1;
 907
 908         if (from == -1) return NULL;
 909         from &= ~SYSTEM_CHARSET_FLAG;
 910         if (from == lfr) return utf_table;
 911         lfr = from;
 912         if (utf_table_init) {
 913                 memset(utf_table, 0, sizeof(utf_table));
 914                 utf_table_init = 0;
 915         } else
 916                 free_utf_table();
 917
 918         for (i = 0; i < 128; i++)
 919                 utf_table[i].u.str = strings[i];
 920
 921         if (is_cp_ptr_utf8(&codepages[from])) {
 922                 for (i = 128; i < 256; i++)
 923                         utf_table[i].u.str = stracpy(strings[i]);
 924                 return utf_table;
 925         }
 926
 927         for (i = 128; i < 256; i++) {
 928                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 929
 930                 if (u == 0xFFFF)
 931                         utf_table[i].u.str = NULL;
 932                 else
 933                         utf_table[i].u.str = stracpy(encode_utf8(u));
 934         }
 935
 936         for (i = 0; codepages[from].table[i].c; i++) {
 937                 unicode_val_T u = codepages[from].table[i].u;
 938
 939                 if (!utf_table[codepages[from].table[i].c].u.str)
 940                         utf_table[codepages[from].table[i].c].u.str =
 941                                 stracpy(encode_utf8(u));
 942         }
 943
 944         for (i = 128; i < 256; i++)
 945                 if (!utf_table[i].u.str)
 946                         utf_table[i].u.str = stracpy(no_str);
 947
 948         return utf_table;
 949 }
 950
 951 /* A conversion table between two charsets, where the target is not UTF-8.  */
 952 static struct conv_table table[256];
 953 static int first = 1;
 954
 955 void
 956 free_conv_table(void)
 957 {
 958         if (!utf_table_init) free_utf_table();
 959         if (first) {
 960                 memset(table, 0, sizeof(table));
 961                 first = 0;
 962         }
 963         new_translation_table(table);
 964 #ifdef HAVE_ICONV
 965         if (iconv_cd != (iconv_t)-1) {
 966                 iconv_close(iconv_cd);
 967                 iconv_cd = (iconv_t)-1;
 968         }
 969 #endif
 970 }
 971
 972
 973 struct conv_table *
 974 get_translation_table(int from, int to)
 975 {
 976         static int lfr = -1;
 977         static int lto = -1;
 978
 979         from &= ~SYSTEM_CHARSET_FLAG;
 980         to &= ~SYSTEM_CHARSET_FLAG;
 981         if (first) {
 982                 memset(table, 0, sizeof(table));
 983                 first = 0;
 984         }
 985
 986         if (codepages[from].iconv) {
 987                 struct conv_table *table2 = get_translation_table_to_utf8(34);
 988
 989                 if (table2) table2->iconv_cp = from;
 990                 return table2;
 991         }
 992
 993         if (/*from == to ||*/ from == -1 || to == -1)
 994                 return NULL;
 995         if (is_cp_ptr_utf8(&codepages[to])) {
 996                 struct conv_table *table2 = get_translation_table_to_utf8(from);
 997
 998                 if (table2) table2->iconv_cp = -1;
 999                 return table2;
1000         }
1001         if (from == lfr && to == lto)
1002                 return table;
1003         lfr = from;
1004         lto = to;
1005         new_translation_table(table);
1006
1007         if (is_cp_ptr_utf8(&codepages[from])) {
1008                 int i;
1009
1010                 /* Map U+00A0 and U+00AD the same way as u2cp() would.  */
1011                 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
1012                 add_utf8(table, UCS_SOFT_HYPHEN, "");
1013
1014                 for (i = 0x80; i <= 0xFF; i++)
1015                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
1016                                 add_utf8(table,
1017                                          codepages[to].highhalf[i - 0x80],
1018                                          strings[i]);
1019
1020                 for (i = 0; codepages[to].table[i].c; i++)
1021                         add_utf8(table, codepages[to].table[i].u,
1022                                  strings[codepages[to].table[i].c]);
1023
1024                 for (i = 0; unicode_7b[i].x != -1; i++)
1025                         if (unicode_7b[i].x >= 0x80)
1026                                 add_utf8(table, unicode_7b[i].x,
1027                                          unicode_7b[i].s);
1028
1029         } else {
1030                 int i;
1031
1032                 for (i = 128; i < 256; i++) {
1033                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
1034                                 const unsigned char *u;
1035
1036                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1037                                 if (u) table[i].u.str = u;
1038                         }
1039                 }
1040         }
1041
1042         return table;
1043 }
1044
/* Compare the first @l2 bytes of @s1 with the NUL-terminated string
 * @s2.  Returns 1 or -1 at the first differing byte, depending on
 * which side is larger.  If all @l2 bytes match, returns 0 when @s2
 * ends right there and -1 when @s2 is longer.  */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
        for (; l2 != 0; l2--, s1++, s2++) {
                if (*s1 != *s2)
                        return (*s1 > *s2) ? 1 : -1;
        }

        return (*s2 != '\0') ? -1 : 0;
}
1058
1059 /* Entity cache debugging purpose. */
1060 #if 0
1061 #define DEBUG_ENTITY_CACHE
1062 #else
1063 #undef DEBUG_ENTITY_CACHE
1064 #endif
1065
/* One entry of the entity lookup cache kept by get_entity_string().  */
struct entity_cache {
        unsigned int hits;              /* How often the entry was used. */
        int strlen;                     /* Length of @str in bytes. */
        int encoding;                   /* Codepage @result was made for. */
        const unsigned char *result;    /* Cached lookup result. */
        unsigned char str[20]; /* Suffice in any case. */
};

/* Comparison function for qsort(): orders cache entries by descending
 * hit count, so the most used entries come first.  */
static int
hits_cmp(const void *v1, const void *v2)
{
        const struct entity_cache *lhs = v1;
        const struct entity_cache *rhs = v2;

        if (lhs->hits > rhs->hits) return -1;
        if (lhs->hits < rhs->hits) return 1;
        return 0;
}
1084
1085 static int
1086 compare_entities(const void *key_, const void *element_)
1087 {
1088         struct string *key = (struct string *) key_;
1089         struct entity *element = (struct entity *) element_;
1090         int length = key->length;
1091         unsigned char *first = key->source;
1092         unsigned char *second = element->s;
1093
1094         return xxstrcmp(first, second, length);
1095 }
1096
1097 const unsigned char *
1098 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1099 {
1100 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
1101 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1102                                    will go in [0] table */
1103         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1104         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1105         unsigned int slen = 0;
1106         const unsigned char *result = NULL;
1107
1108         /* Note that an object of static storage duration is automatically
1109          * initialised to zero in C.  */
1110
1111         if (strlen <= 0) return NULL;
1112
1113 #ifdef CONFIG_UTF8
1114         /* TODO: caching UTF-8 */
1115         encoding &= ~SYSTEM_CHARSET_FLAG;
1116         if (is_cp_ptr_utf8(&codepages[encoding]))
1117                 goto skip;
1118 #endif /* CONFIG_UTF8 */
1119
1120         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1121          * + google + slashdot + websites that result from a search for test on google,
1122          * + various ones) show quite impressive improvment:
1123          * Top ten is:
1124          * 0: hits=2459 l=4 st='nbsp'
1125          * 1: hits=2152 l=6 st='eacute'
1126          * 2: hits=235 l=6 st='egrave'
1127          * 3: hits=136 l=6 st='agrave'
1128          * 4: hits=100 l=3 st='amp'
1129          * 5: hits=40 l=5 st='laquo'
1130          * 6: hits=8 l=4 st='copy'
1131          * 7: hits=5 l=2 st='gt'
1132          * 8: hits=2 l=2 st='lt'
1133          * 9: hits=1 l=6 st='middot'
1134          *
1135          * Most of the time cache hit ratio is near 95%.
1136          *
1137          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1138          * count is kept < 2 (worst case 1.58). Not so bad ;)
1139          *
1140          * --Zas */
1141
1142         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1143         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1144
1145         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1146                 int i;
1147
1148                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1149                         if (entity_cache[slen][i].encoding == encoding
1150                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1151 #ifdef DEBUG_ENTITY_CACHE
1152                                 static double total_iter = 0;
1153                                 static unsigned long hit_count = 0;
1154
1155                                 total_iter += i + 1;
1156                                 hit_count++;
1157                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1158 #endif
1159                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1160                                         entity_cache[slen][i].hits++;
1161                                 return entity_cache[slen][i].result;
1162                         }
1163                 }
1164 #ifdef DEBUG_ENTITY_CACHE
1165                 fprintf(stderr, "miss\n");
1166 #endif
1167         }
1168 #ifdef CONFIG_UTF8
1169 skip:
1170 #endif /* CONFIG_UTF8 */
1171         if (*str == '#') { /* Numeric entity. */
1172                 int l = (int) strlen;
1173                 unsigned char *st = (unsigned char *) str;
1174                 unicode_val_T n = 0;
1175
1176                 if (l == 1) goto end; /* &#; ? */
1177                 st++, l--;
1178                 if ((*st | 32) == 'x') { /* Hexadecimal */
1179
1180                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1181                         st++, l--;
1182                         do {
1183                                 unsigned char c = (*(st++) | 32);
1184
1185                                 if (isdigit(c))
1186                                         n = (n << 4) | (c - '0');
1187                                 else if (isxdigit(c))
1188                                         n = (n << 4) | (c - 'a' + 10);
1189                                 else
1190                                         goto end; /* Bad char. */
1191                         } while (--l);
1192                 } else { /* Decimal */
1193                         if (l > 10) goto end; /* 4294967295 max. */
1194                         do {
1195                                 unsigned char c = *(st++);
1196
1197                                 if (isdigit(c))
1198                                         n = n * 10 + c - '0';
1199                                 else
1200                                         goto end; /* Bad char. */
1201                                 /* Limit to 0xFFFFFFFF. */
1202                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1203                                         goto end;
1204                         } while (--l);
1205                 }
1206
1207                 result = u2cp(n, encoding);
1208
1209 #ifdef DEBUG_ENTITY_CACHE
1210                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1211 #endif
1212         } else { /* Text entity. */
1213                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1214                 struct entity *element = bsearch((void *) &key, entities,
1215                                                  N_ENTITIES,
1216                                                  sizeof(*element),
1217                                                  compare_entities);
1218
1219                 if (element) result = u2cp(element->c, encoding);
1220         }
1221
1222 #ifdef CONFIG_UTF8
1223         if (is_cp_ptr_utf8(&codepages[encoding])) {
1224                 return result;
1225         }
1226 #endif /* CONFIG_UTF8 */
1227 end:
1228         /* Take care of potential buffer overflow. */
1229         if (strlen < sizeof(entity_cache[slen][0].str)) {
1230                 struct entity_cache *ece;
1231
1232                 /* Sort entries by hit order. */
1233                 if (nb_entity_cache[slen] > 1)
1234                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1235                               sizeof(entity_cache[slen][0]), hits_cmp);
1236
1237                 /* Increment number of cache entries if possible.
1238                  * Else, just replace the least used entry.  */
1239                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1240                 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1241
1242                 /* Copy new entry to cache. */
1243                 ece->hits = 1;
1244                 ece->strlen = strlen;
1245                 ece->encoding = encoding;
1246                 ece->result = result;
1247                 memcpy(ece->str, str, strlen);
1248                 ece->str[strlen] = '\0';
1249
1250
1251 #ifdef DEBUG_ENTITY_CACHE
1252                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1253                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1254
1255         {
1256                 unsigned int i;
1257
1258                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1259                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1260                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1261                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1262                                 entity_cache[slen][i].str);
1263                 fprintf(stderr, "-----------------\n");
1264         }
1265 #endif  /* DEBUG_ENTITY_CACHE */
1266         }
1267         return result;
1268 }
1269
1270 unsigned char *
1271 convert_string(struct conv_table *convert_table,
1272                unsigned char *chars2, int charslen2, int cp,
1273                enum convert_string_mode mode, int *length,
1274                void (*callback)(void *data, unsigned char *buf, int buflen),
1275                void *callback_data)
1276 {
1277         unsigned char *buffer;
1278         int bufferpos = 0;
1279         int charspos = 0;
1280         unsigned char *chars = chars2;
1281         int charslen = charslen2;
1282
1283 #ifdef HAVE_ICONV
1284         static char iconv_input[256];
1285         static char iconv_output[256 * 8];
1286         static size_t iconv_offset;
1287         static int iconv_cp;
1288         static size_t iconv_inleft;
1289         size_t iconv_outleft = 256 * 8;
1290         int loop = 0;
1291         int is_iconv = 0;
1292         int chars_offset = 0;
1293
1294         if (!convert_table && !memchr(chars, '&', charslen)) {
1295                 if (callback) {
1296                         if (charslen) callback(callback_data, chars, charslen);
1297                         return NULL;
1298                 } else {
1299                         return memacpy(chars, charslen);
1300                 }
1301         }
1302
1303         if (cp >= 0) {
1304                 if (convert_table && convert_table->iconv_cp > 0) {
1305                         is_iconv = 1;
1306                         cp = convert_table->iconv_cp;
1307                 } else {
1308                         is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
1309                 }
1310         }
1311 #endif
1312
1313         /* Buffer allocation */
1314
1315         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1316         if (!buffer) return NULL;
1317
1318 #ifdef HAVE_ICONV
1319         if (is_iconv) {
1320                 int v;
1321                 size_t before, to_copy;
1322                 char *outp, *inp;
1323
1324                 if (iconv_cd >= 0) {
1325                         if (cp != iconv_cp) {
1326                                 iconv_close(iconv_cd);
1327                                 iconv_cd = (iconv_t)-1;
1328                         }
1329                 }
1330                 if (iconv_cd == (iconv_t)-1) {
1331                         iconv_offset = 0;
1332                         iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
1333                         if (iconv_cd == (iconv_t)-1) {
1334                                 mem_free(buffer);
1335                                 return NULL;
1336                         }
1337                         iconv_cp = cp;
1338                 }
1339 repeat:
1340                 to_copy = charslen2 - chars_offset;
1341                 if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
1342                 memcpy(iconv_input + iconv_offset, chars2 + chars_offset, to_copy);
1343                 iconv_outleft = 256 * 8;
1344                 iconv_inleft = iconv_offset + to_copy;
1345                 inp = iconv_input;
1346                 outp = iconv_output;
1347                 before = iconv_inleft;
1348
1349                 v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
1350                 chars_offset += before - iconv_inleft;
1351                 charslen = 256 * 8 - iconv_outleft;
1352
1353                 chars = (unsigned char *)iconv_output;
1354                 charspos = 0;
1355
1356                 if (v == -1) {
1357                         switch (errno) {
1358                         case EINVAL:
1359                                 memcpy(iconv_input, inp, iconv_inleft);
1360                                 iconv_offset = iconv_inleft;
1361                                 break;
1362                         case EILSEQ:
1363                                 loop = 0;
1364                                 goto out;
1365                                 break;
1366                         default:
1367                                 iconv_offset = 0;
1368                         }
1369                 } else {
1370                         iconv_offset = 0;
1371                 }
1372
1373                 loop = chars_offset < charslen2;
1374         }
1375 #endif
1376         /* Iterate ;-) */
1377
1378 out:
1379         while (charspos < charslen) {
1380                 const unsigned char *translit;
1381
1382 #define PUTC do { \
1383                 buffer[bufferpos++] = chars[charspos++]; \
1384                 translit = ""; \
1385                 goto flush; \
1386         } while (0)
1387
1388                 if (chars[charspos] != '&') {
1389                         struct conv_table *t;
1390                         int i;
1391
1392                         if (chars[charspos] < 128 || !convert_table) PUTC;
1393
1394                         t = convert_table;
1395                         i = charspos;
1396
1397                         while (t[chars[i]].t) {
1398                                 t = t[chars[i++]].u.tbl;
1399                                 if (i >= charslen) PUTC;
1400                         }
1401
1402                         translit = t[chars[i]].u.str;
1403                         charspos = i + 1;
1404
1405                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1406                         PUTC;
1407
1408                 } else {
1409                         int start = charspos + 1;
1410                         int i = start;
1411
1412                         while (i < charslen
1413                                && (isasciialpha(chars[i])
1414                                    || isdigit(chars[i])
1415                                    || (chars[i] == '#')))
1416                                 i++;
1417
1418                         /* This prevents bug 213: we were expanding "entities"
1419                          * in URL query strings. */
1420                         /* XXX: But this disables &nbsp&nbsp usage, which
1421                          * appears to be relatively common! --pasky */
1422                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1423                             && i > start
1424                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1425                                 translit = get_entity_string(&chars[start], i - start,
1426                                                       cp);
1427                                 if (chars[i] != ';') {
1428                                         /* Eat &nbsp &nbsp<foo> happily, but
1429                                          * pull back from the character after
1430                                          * entity string if it is not the valid
1431                                          * terminator. */
1432                                         i--;
1433                                 }
1434
1435                                 if (!translit) PUTC;
1436                                 charspos = i + (i < charslen);
1437                         } else PUTC;
1438                 }
1439
1440                 if (!translit[0]) continue;
1441
1442                 if (!translit[1]) {
1443                         buffer[bufferpos++] = translit[0];
1444                         translit = "";
1445                         goto flush;
1446                 }
1447
1448                 while (*translit) {
1449                         unsigned char *new_;
1450
1451                         buffer[bufferpos++] = *(translit++);
1452 flush:
1453                         if (bufferpos & (ALLOC_GR - 1)) continue;
1454
1455                         if (callback) {
1456                                 buffer[bufferpos] = 0;
1457                                 callback(callback_data, buffer, bufferpos);
1458                                 bufferpos = 0;
1459                         } else {
1460                                 new_ = mem_realloc(buffer, bufferpos + ALLOC_GR);
1461                                 if (!new_) {
1462                                         mem_free(buffer);
1463                                         return NULL;
1464                                 }
1465                                 buffer = new_;
1466                         }
1467                 }
1468 #undef PUTC
1469         }
1470
1471 #ifdef HAVE_ICONV
1472         if (loop) goto repeat;
1473 #endif
1474         /* Say bye */
1475
1476         buffer[bufferpos] = 0;
1477         if (length) *length = bufferpos;
1478
1479         if (callback) {
1480                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1481                 mem_free(buffer);
1482                 return NULL;
1483         } else {
1484                 return buffer;
1485         }
1486 }
1487
1488
1489 #ifndef USE_FASTFIND
1490 int
1491 get_cp_index(const unsigned char *name)
1492 {
1493         int i, a;
1494         int syscp = 0;
1495
1496         if (!c_strcasecmp(name, "System")) {
1497 #if HAVE_LANGINFO_CODESET
1498                 name = nl_langinfo(CODESET);
1499                 syscp = SYSTEM_CHARSET_FLAG;
1500 #else
1501                 name = "us-ascii";
1502 #endif
1503         }
1504
1505         for (i = 0; codepages[i].name; i++) {
1506                 for (a = 0; codepages[i].aliases[a]; a++) {
1507                         /* In the past, we looked for the longest substring
1508                          * in all the names; it is way too expensive, though:
1509                          *
1510                          *   %   cumulative   self              self     total
1511                          *  time   seconds   seconds    calls  us/call  us/call  name
1512                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1513                          *
1514                          * Anything called from redraw_screen() is in fact
1515                          * relatively expensive, even if it's called just
1516                          * once. So we will do a simple strcasecmp() here.
1517                          */
1518
1519                         if (!c_strcasecmp(name, codepages[i].aliases[a]))
1520                                 return i | syscp;
1521                 }
1522         }
1523
1524         if (syscp) {
1525                 return get_cp_index("us-ascii") | syscp;
1526         } else {
1527                 return -1;
1528         }
1529 }
1530
1531 #else
1532
/* Cursor of the alias enumeration done by charsets_list_next():
 * @i_name indexes codepages[], @i_alias indexes that codepage's
 * alias list.  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration cursor to the first alias of the first
 * codepage.  */
void
charsets_list_reset(void)
{
        i_alias = i_name = 0;
}
1543
1544 /* Returns a pointer to a struct that contains current key and data pointers
1545  * and increment internal pointer.  It returns NULL when key is NULL. */
1546 struct fastfind_key_value *
1547 charsets_list_next(void)
1548 {
1549         static struct fastfind_key_value kv;
1550
1551         if (!codepages[i_name].name) return NULL;
1552
1553         kv.key = codepages[i_name].aliases[i_alias];
1554         kv.data = (void *) &codepages[i_name]; /* cast away const */
1555
1556         if (codepages[i_name].aliases[i_alias + 1])
1557                 i_alias++;
1558         else {
1559                 i_name++;
1560                 i_alias = 0;
1561         }
1562
1563         return &kv;
1564 }
1565
1566 static struct fastfind_index ff_charsets_index
1567         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1568
1569 /* It searchs for a charset named @name or one of its aliases and
1570  * returns index for it or -1 if not found. */
1571 int
1572 get_cp_index(const unsigned char *name)
1573 {
1574         const struct codepage_desc *codepage;
1575         int syscp = 0;
1576
1577         if (!c_strcasecmp(name, "System")) {
1578 #if HAVE_LANGINFO_CODESET
1579                 name = nl_langinfo(CODESET);
1580                 syscp = SYSTEM_CHARSET_FLAG;
1581 #else
1582                 name = "us-ascii";
1583 #endif
1584         }
1585
1586         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1587         if (codepage) {
1588                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1589                 return (codepage - codepages) | syscp;
1590
1591         } else if (syscp) {
1592                 return get_cp_index("us-ascii") | syscp;
1593
1594         } else {
1595                 return -1;
1596         }
1597 }
1598
1599 #endif /* USE_FASTFIND */
1600
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is built without USE_FASTFIND.  */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1608
/* Release the fastfind index used by get_cp_index().  Does nothing
 * when ELinks is built without USE_FASTFIND.  */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1616
1617 /* Get the codepage's name for displaying to the user, or NULL if
1618  * @cp_index is one past the end.  In the future, we might want to
1619  * localize these with gettext.  So it may be best not to use this
1620  * function if the name will have to be converted back to an
1621  * index.  */
1622 unsigned char *
1623 get_cp_name(int cp_index)
1624 {
1625         if (cp_index < 0) return "none";
1626         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1627
1628         return codepages[cp_index].name;
1629 }
1630
1631 /* Get the codepage's name for saving to a configuration file.  These
1632  * names can be converted back to indexes, even in future versions of
1633  * ELinks.  */
1634 unsigned char *
1635 get_cp_config_name(int cp_index)
1636 {
1637         if (cp_index < 0) return "none";
1638         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1639         if (!codepages[cp_index].aliases) return NULL;
1640
1641         return codepages[cp_index].aliases[0];
1642 }
1643
1644 /* Get the codepage's name for sending to a library or server that
1645  * understands MIME charset names.  This function irreversibly maps
1646  * the "System" codepage to the underlying charset.  */
1647 unsigned char *
1648 get_cp_mime_name(int cp_index)
1649 {
1650         if (cp_index < 0) return "none";
1651         cp_index &= ~SYSTEM_CHARSET_FLAG;
1652         if (!codepages[cp_index].aliases) return NULL;
1653
1654         return codepages[cp_index].aliases[0];
1655 }
1656
1657 int
1658 is_cp_utf8(int cp_index)
1659 {
1660         cp_index &= ~SYSTEM_CHARSET_FLAG;
1661         return is_cp_ptr_utf8(&codepages[cp_index]);
1662 }
1663
1664 /* This function will be used by the xhtml parser. */
1665 const uint16_t *
1666 get_cp_highhalf(const unsigned char *name)
1667 {
1668         int cp = get_cp_index(name);
1669
1670         if (cp < 0) return NULL;
1671         cp &= ~SYSTEM_CHARSET_FLAG;
1672         return codepages[cp].highhalf;
1673 }