src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #include "elinks.h"
  22
  23 #include "document/options.h"
  24 #include "intl/charsets.h"
  25 #include "util/conv.h"
  26 #include "util/error.h"
  27 #include "util/fastfind.h"
  28 #include "util/hash.h"
  29 #include "util/memory.h"
  30 #include "util/string.h"
  31
  32
/* Fix namespace clash on MacOS. */
#define table table_elinks

/* One additional Unicode-to-codepage mapping: Unicode character @u is
 * written as codepage byte @c.  The lookup loop in u2cp_() stops at the
 * first entry whose @c is zero, so arrays of these must end with such
 * an entry.  */
struct table_entry {
        unsigned char c;
        /* This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  47
/* Description of one codepage: how its bytes 0x80...0xFF map to
 * Unicode and back.  */
struct codepage_desc {
        /* Presumably the display name of the codepage -- it is not
         * used in this part of the file; verify against callers.  */
        unsigned char *name;
        /* List of aliases.  is_cp_ptr_utf8() recognises the UTF-8
         * codepage by comparing this pointer with aliases_utf8.  */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table.  This table
         * is not used for translating from the codepage to Unicode.
         * u2cp_() scans it until it meets an entry whose @c is zero.  */
        const struct table_entry *table;
};
  68
  69 #include "intl/codepage.inc"
  70 #include "intl/uni_7b.inc"
  71 #include "intl/entity.inc"
  72
  73
/* One-byte strings indexed by byte value: strings[i] is byte i
 * followed by a terminating NUL.  This lets the conversion functions
 * return a single byte as a string without allocating.
 *
 * NOTE(review): the entries at indexes 0x17 and 0x1F contain "\033"
 * instead of "\027" and "\037".  It is not visible from this file
 * whether that is deliberate or a typo -- confirm before changing.  */
static const char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 108
 109 static void
 110 free_translation_table(struct conv_table *p)
 111 {
 112         int i;
 113
 114         for (i = 0; i < 256; i++)
 115                 if (p[i].t)
 116                         free_translation_table(p[i].u.tbl);
 117
 118         mem_free(p);
 119 }
 120
/* A string used in conversion tables when there is no correct
 * conversion.  This is compared by address and therefore should be a
 * named array rather than a pointer so that it won't share storage
 * with any other string literal that happens to have the same
 * characters.  In this part of the file it is stored by
 * new_translation_table() and returned by u2cp_().  */
static const unsigned char no_str[] = "*";
 127
 128 static void
 129 new_translation_table(struct conv_table *p)
 130 {
 131         int i;
 132
 133         for (i = 0; i < 256; i++)
 134                 if (p[i].t)
 135                         free_translation_table(p[i].u.tbl);
 136         for (i = 0; i < 128; i++) {
 137                 p[i].t = 0;
 138                 p[i].u.str = strings[i];
 139         }
 140         for (; i < 256; i++) {
 141                 p[i].t = 0;
 142                 p[i].u.str = no_str;
 143         }
 144 }
 145
 146 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 147 {                                                                                       \
 148         long _s = 0, _e = (entries) - 1;                                                \
 149                                                                                         \
 150         while (_s <= _e || !((result) = -1)) {                                          \
 151                 long _m = (_s + _e) / 2;                                                \
 152                                                                                         \
 153                 if ((table)[_m].entry == (key)) {                                       \
 154                         (result) = _m;                                                  \
 155                         break;                                                          \
 156                 }                                                                       \
 157                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 158                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 159         }                                                                               \
 160 }                                                                                       \
 161
/* Substitutes for the code points 0x80...0x9F, indexed by
 * (code point - 0x80).  u2cp_() converts the substitute in place of
 * the original code point; a zero entry means there is no substitute
 * and u2cp_() returns NULL.
 * NOTE(review): the non-ASCII values look like the Windows-1252
 * meanings of these bytes, with ASCII approximations for some --
 * confirm before relying on that.  */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 168
/* Bit that may be set in a codepage index.  The conversion functions
 * in this file clear it before using the index with codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if @cp_ptr (a struct codepage_desc pointer) describes UTF-8,
 * which is recognised by its alias list being aliases_utf8.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 171
 172 const unsigned char *
 173 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 174 {
 175         int j;
 176         int s;
 177
 178         if (u < 128) return strings[u];
 179
 180         to &= ~SYSTEM_CHARSET_FLAG;
 181
 182 #ifdef CONFIG_UTF8
 183         if (is_cp_ptr_utf8(&codepages[to]))
 184                 return encode_utf8(u);
 185 #endif /* CONFIG_UTF8 */
 186
 187         /* To mark non breaking spaces in non-UTF-8 strings, we use a
 188          * special char NBSP_CHAR. */
 189         if (u == UCS_NO_BREAK_SPACE) {
 190                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 191                 else /* NBSP_MODE_ASCII */ return " ";
 192         }
 193         if (u == UCS_SOFT_HYPHEN) return "";
 194
 195         if (u < 0xa0) {
 196                 unicode_val_T strange = strange_chars[u - 0x80];
 197
 198                 if (!strange) return NULL;
 199                 return u2cp_(strange, to, nbsp_mode);
 200         }
 201
 202         if (u < 0xFFFF)
 203                 for (j = 0; j < 0x80; j++)
 204                         if (codepages[to].highhalf[j] == u)
 205                                 return strings[0x80 + j];
 206         for (j = 0; codepages[to].table[j].c; j++)
 207                 if (codepages[to].table[j].u == u)
 208                         return strings[codepages[to].table[j].c];
 209
 210         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 211         if (s != -1) return unicode_7b[s].s;
 212
 213         return no_str;
 214 }
 215
 216 static unsigned char utf_buffer[7];
 217
 218 #ifdef CONFIG_UTF8
 219 inline unsigned char *
 220 encode_utf8(unicode_val_T u)
 221 #else
 222 static unsigned char *
 223 encode_utf8(unicode_val_T u)
 224 #endif /* CONFIG_UTF8 */
 225 {
 226         memset(utf_buffer, 0, 7);
 227
 228         if (u < 0x80)
 229                 utf_buffer[0] = u;
 230         else if (u < 0x800)
 231                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 232                 utf_buffer[1] = 0x80 | (u & 0x3f);
 233         else if (u < 0x10000)
 234                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 235                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 236                 utf_buffer[2] = 0x80 | (u & 0x3f);
 237         else if (u < 0x200000)
 238                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 239                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 240                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 241                 utf_buffer[3] = 0x80 | (u & 0x3f);
 242         else if (u < 0x4000000)
 243                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 244                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 245                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 246                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 247                 utf_buffer[4] = 0x80 | (u & 0x3f);
 248         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 249                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 250                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 251                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 252                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 253                 utf_buffer[5] = 0x80 | (u & 0x3f);
 254
 255         return utf_buffer;
 256 }
 257
 258 #ifdef CONFIG_UTF8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80...0xBF, 0xFE and 0xFF)
 * are given length 1 here; utf8_to_unicode() rejects them
 * separately.  */
static const char utf8char_len_tab[256] = {
        /* 0x00...0x3F */
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        /* 0x40...0x7F */
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        /* 0x80...0xBF */
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        /* 0xC0...0xDF */
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        /* 0xE0...0xFF */
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 271
 272 inline int utf8charlen(const unsigned char *p)
 273 {
 274         return p ? utf8char_len_tab[*p] : 0;
 275 }
 276
 277 inline int
 278 strlen_utf8(unsigned char **str)
 279 {
 280         unsigned char *s = *str;
 281         unsigned char *end = strchr(s, '\0');
 282         int x;
 283         int len;
 284
 285         for (x = 0;; x++, s += len) {
 286                 len = utf8charlen(s);
 287                 if (s + len > end) break;
 288         }
 289         *str = s;
 290         return x;
 291 }
 292
/* True if byte @p has its top bit clear, i.e. it is a complete
 * one-byte character.  */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can begin a character: either a one-byte character
 * or a byte with its top two bits set.  Evaluates @p twice.  */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 295
 296 /* Start from @current and move back to @pos char. This pointer return. The
 297  * most left pointer is @start. */
 298 inline unsigned char *
 299 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 300 {
 301         if (current == NULL || start == NULL || pos < 0)
 302                 return NULL;
 303         while (pos > 0 && current != start) {
 304                 current--;
 305                 if (utf8_islead(*current))
 306                         pos--;
 307         }
 308         return current;
 309 }
 310
 311 /* Count number of standard terminal cells needed for displaying UTF-8
 312  * character. */
 313 int
 314 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 315 {
 316         unicode_val_T u;
 317
 318         if (end == NULL)
 319                 end = strchr(utf8_char, '\0');
 320
 321         if(!utf8_char || !end)
 322                 return -1;
 323
 324         u = utf8_to_unicode(&utf8_char, end);
 325
 326         return unicode_to_cell(u);
 327 }
 328
 329 /* Count number of standard terminal cells needed for displaying string
 330  * with UTF-8 characters. */
 331 int
 332 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 333 {
 334         int charlen, cell, cells = 0;
 335
 336         if (end == NULL)
 337                 end = strchr(string, '\0');
 338
 339         if(!string || !end)
 340                 return -1;
 341
 342         do {
 343                 charlen = utf8charlen(string);
 344                 if (string + charlen > end)
 345                         break;
 346
 347                 cell = utf8_char2cells(string, end);
 348                 if  (cell < 0)
 349                         return -1;
 350
 351                 cells += cell;
 352                 string += charlen;
 353         } while (1);
 354
 355         return cells;
 356 }
 357
 358 /* Count number of characters in string. */
 359 int
 360 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 361 {
 362         int charlen, chars = 0;
 363
 364         if (end == NULL)
 365                 end = strchr(string, '\0');
 366
 367         if(!string || !end)
 368                 return -1;
 369
 370         do {
 371                 charlen = utf8charlen(string);
 372                 if (string + charlen > end)
 373                         break;
 374
 375                 chars++;
 376                 string += charlen;
 377         } while (1);
 378
 379         return chars;
 380 }
 381
 382 /*
 383  * Count number of bytes from begining of the string needed for displaying
 384  * specified number of cells.
 385  */
 386 int
 387 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 388 {
 389         unsigned int bytes = 0, cells = 0;
 390
 391         assert(max_cells>=0);
 392
 393         if (end == NULL)
 394                 end = strchr(string, '\0');
 395
 396         if(!string || !end)
 397                 return -1;
 398
 399         do {
 400                 int cell = utf8_char2cells(&string[bytes], end);
 401                 if (cell < 0)
 402                         return -1;
 403
 404                 cells += cell;
 405                 if (cells > max_cells)
 406                         break;
 407
 408                 bytes += utf8charlen(&string[bytes]);
 409
 410                 if (string + bytes > end) {
 411                         bytes = end - string;
 412                         break;
 413                 }
 414         } while(1);
 415
 416         return bytes;
 417 }
 418
 419 /* Take @max steps forward from @string in the specified @way, but
 420  * not going past @end.  Return the resulting address.  Store the
 421  * number of steps taken to *@count, unless @count is NULL.
 422  *
 423  * This assumes the text is valid UTF-8, and @string and @end point to
 424  * character boundaries.  If not, it doesn't crash but the results may
 425  * be inconsistent.
 426  *
 427  * This function can do some of the same jobs as utf8charlen(),
 428  * utf8_cells2bytes(), and strlen_utf8().  */
 429 unsigned char *
 430 utf8_step_forward(unsigned char *string, unsigned char *end,
 431                   int max, enum utf8_step way, int *count)
 432 {
 433         int steps = 0;
 434         unsigned char *current = string;
 435
 436         assert(string);
 437         assert(max >= 0);
 438         if_assert_failed goto invalid_arg;
 439         if (end == NULL)
 440                 end = strchr(string, '\0');
 441
 442         switch (way) {
 443         case UTF8_STEP_CHARACTERS:
 444                 while (steps < max && current < end) {
 445                         ++current;
 446                         if (utf8_islead(*current))
 447                                 ++steps;
 448                 }
 449                 break;
 450
 451         case UTF8_STEP_CELLS_FEWER:
 452         case UTF8_STEP_CELLS_MORE:
 453                 while (steps < max) {
 454                         unicode_val_T u;
 455                         unsigned char *prev = current;
 456                         int width;
 457
 458                         u = utf8_to_unicode(&current, end);
 459                         if (u == UCS_NO_CHAR) {
 460                                 /* Assume the incomplete sequence
 461                                  * costs one cell.  */
 462                                 current = end;
 463                                 ++steps;
 464                                 break;
 465                         }
 466
 467                         width = unicode_to_cell(u);
 468                         if (way == UTF8_STEP_CELLS_FEWER
 469                             && steps + width > max) {
 470                                 /* Back off.  */
 471                                 current = prev;
 472                                 break;
 473                         }
 474                         steps += width;
 475                 }
 476                 break;
 477
 478         default:
 479                 INTERNAL("impossible enum utf8_step");
 480         }
 481
 482 invalid_arg:
 483         if (count)
 484                 *count = steps;
 485         return current;
 486 }
 487
 488 /* Take @max steps backward from @string in the specified @way, but
 489  * not going past @start.  Return the resulting address.  Store the
 490  * number of steps taken to *@count, unless @count is NULL.
 491  *
 492  * This assumes the text is valid UTF-8, and @string and @start point
 493  * to character boundaries.  If not, it doesn't crash but the results
 494  * may be inconsistent.
 495  *
 496  * This function can do some of the same jobs as utf8_prevchar().  */
 497 unsigned char *
 498 utf8_step_backward(unsigned char *string, unsigned char *start,
 499                    int max, enum utf8_step way, int *count)
 500 {
 501         int steps = 0;
 502         unsigned char *current = string;
 503
 504         assert(string);
 505         assert(start);
 506         assert(max >= 0);
 507         if_assert_failed goto invalid_arg;
 508
 509         switch (way) {
 510         case UTF8_STEP_CHARACTERS:
 511                 while (steps < max && current > start) {
 512                         --current;
 513                         if (utf8_islead(*current))
 514                                 ++steps;
 515                 }
 516                 break;
 517
 518         case UTF8_STEP_CELLS_FEWER:
 519         case UTF8_STEP_CELLS_MORE:
 520                 while (steps < max) {
 521                         unsigned char *prev = current;
 522                         unsigned char *look;
 523                         unicode_val_T u;
 524                         int width;
 525
 526                         if (current <= start)
 527                                 break;
 528                         do {
 529                                 --current;
 530                         } while (current > start && !utf8_islead(*current));
 531
 532                         look = current;
 533                         u = utf8_to_unicode(&look, prev);
 534                         if (u == UCS_NO_CHAR) {
 535                                 /* Assume the incomplete sequence
 536                                  * costs one cell.  */
 537                                 width = 1;
 538                         } else
 539                                 width = unicode_to_cell(u);
 540
 541                         if (way == UTF8_STEP_CELLS_FEWER
 542                             && steps + width > max) {
 543                                 /* Back off.  */
 544                                 current = prev;
 545                                 break;
 546                         }
 547                         steps += width;
 548                 }
 549                 break;
 550
 551         default:
 552                 INTERNAL("impossible enum utf8_step");
 553         }
 554
 555 invalid_arg:
 556         if (count)
 557                 *count = steps;
 558         return current;
 559 }
 560
 561 /*
 * Find out the number of standard terminal columns needed for displaying
 * the symbol (glyph) which represents Unicode character c.
 564  *
 565  * TODO: Use wcwidth when it is available. This seems to require:
 566  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 567  * - Define _XOPEN_SOURCE and include <wchar.h>.
 568  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 569  *   matches ISO 10646 in all locales.)
 570  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 571  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 572  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 573  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 574  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 575  * character is apparently not supported in all locales.  Why is that?
 576  * - Perhaps there is standardese that requires supported characters
 577  *   to be convertable to multibyte form.  Then ELinks could just pick
 578  *   some UTF-8 locale for its wcwidth purposes.
 579  * - Perhaps wcwidth can even return different nonnegative values for
 580  *   the same ISO 10646 character in different locales.  Then ELinks
 581  *   would have to set LC_CTYPE to match at least the terminal's
 582  *   charset (which may differ from the LC_CTYPE environment variable,
 583  *   especially when the master process is serving a slave terminal).
 584  *   But there is no guarantee that the libc supports all the same
 585  *   charsets as ELinks does.
 586  * For now, it seems safest to avoid the potentially locale-dependent
 587  * libc version of wcwidth, and instead use a hardcoded mapping.
 588  *
 589  * @return      2 for double-width glyph, 1 for others.
 590  *              TODO: May be extended to return 0 for zero-width glyphs
 591  *              (like composing, maybe unprintable too).
 592  */
 593 inline int
 594 unicode_to_cell(unicode_val_T c)
 595 {
 596         if (c >= 0x1100
 597                 && (c <= 0x115f                 /* Hangul Jamo */
 598                 || c == 0x2329
 599                 || c == 0x232a
 600                 || (c >= 0x2e80 && c <= 0xa4cf
 601                         && c != 0x303f)         /* CJK ... Yi */
 602                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 603                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 604                                                                 Ideographs */
 605                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 606                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 607                 || (c >= 0xffe0 && c <= 0xffe6)
 608                 || (c >= 0x20000 && c <= 0x2fffd)
 609                 || (c >= 0x30000 && c <= 0x3fffd)))
 610                 return 2;
 611
 612         return 1;
 613 }
 614
 615 /* Fold the case of a Unicode character, so that hotkeys in labels can
 616  * be compared case-insensitively.  It is unspecified whether the
 617  * result will be in upper or lower case.  */
 618 unicode_val_T
 619 unicode_fold_label_case(unicode_val_T c)
 620 {
 621 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 622         return towlower(c);
 623 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 624         /* For now, this supports only ASCII.  It would be possible to
 625          * use code generated from CaseFolding.txt of Unicode if the
 626          * acknowledgements required by http://www.unicode.org/copyright.html
 627          * were added to associated documentation of ELinks.  */
 628         if (c >= 0x41 && c <= 0x5A)
 629                 return c + 0x20;
 630         else
 631                 return c;
 632 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 633 }
 634
 635 inline unicode_val_T
 636 utf8_to_unicode(unsigned char **string, const unsigned char *end)
 637 {
 638         unsigned char *str = *string;
 639         unicode_val_T u;
 640         int length;
 641
 642         length = utf8char_len_tab[str[0]];
 643
 644         if (str + length > end) {
 645                 return UCS_NO_CHAR;
 646         }
 647
 648         switch (length) {
 649                 case 1:         /* U+0000 to U+007F */
 650                         if (str[0] >= 0x80) {
 651 invalid_utf8:
 652                                 ++*string;
 653                                 return UCS_REPLACEMENT_CHARACTER;
 654                         }
 655                         u = str[0];
 656                         break;
 657                 case 2:         /* U+0080 to U+07FF */
 658                         if ((str[1] & 0xc0) != 0x80)
 659                                 goto invalid_utf8;
 660                         u = (str[0] & 0x1f) << 6;
 661                         u += (str[1] & 0x3f);
 662                         if (u < 0x80)
 663                                 goto invalid_utf8;
 664                         break;
 665                 case 3:         /* U+0800 to U+FFFF, except surrogates */
 666                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
 667                                 goto invalid_utf8;
 668                         u = (str[0] & 0x0f) << 12;
 669                         u += ((str[1] & 0x3f) << 6);
 670                         u += (str[2] & 0x3f);
 671                         if (u < 0x800 || is_utf16_surrogate(u))
 672                                 goto invalid_utf8;
 673                         break;
 674                 case 4:         /* U+10000 to U+1FFFFF */
 675                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 676                             || (str[3] & 0xc0) != 0x80)
 677                                 goto invalid_utf8;
 678                         u = (str[0] & 0x0f) << 18;
 679                         u += ((str[1] & 0x3f) << 12);
 680                         u += ((str[2] & 0x3f) << 6);
 681                         u += (str[3] & 0x3f);
 682                         if (u < 0x10000)
 683                                 goto invalid_utf8;
 684                         break;
 685                 case 5:         /* U+200000 to U+3FFFFFF */
 686                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 687                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
 688                                 goto invalid_utf8;
 689                         u = (str[0] & 0x0f) << 24;
 690                         u += ((str[1] & 0x3f) << 18);
 691                         u += ((str[2] & 0x3f) << 12);
 692                         u += ((str[3] & 0x3f) << 6);
 693                         u += (str[4] & 0x3f);
 694                         if (u < 0x200000)
 695                                 goto invalid_utf8;
 696                         break;
 697                 case 6:         /* U+4000000 to U+7FFFFFFF */
 698                         if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
 699                             || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
 700                             || (str[5] & 0xc0) != 0x80)
 701                                 goto invalid_utf8;
 702                         u = (str[0] & 0x01) << 30;
 703                         u += ((str[1] & 0x3f) << 24);
 704                         u += ((str[2] & 0x3f) << 18);
 705                         u += ((str[3] & 0x3f) << 12);
 706                         u += ((str[4] & 0x3f) << 6);
 707                         u += (str[5] & 0x3f);
 708                         if (u < 0x4000000)
 709                                 goto invalid_utf8;
 710                         break;
 711                 default:
 712                         INTERNAL("utf8char_len_tab out of range");
 713                         goto invalid_utf8;
 714         }
 715         *string = str + length;
 716         return u;
 717 }
 718 #endif /* CONFIG_UTF8 */
 719
 720 /* The common part of cp2u and cp2utf_8.  */
 721 static unicode_val_T
 722 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 723 {
 724         unicode_val_T u = from->highhalf[c - 0x80];
 725
 726         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 727         return u;
 728 }
 729
 730 /* Used for converting input from the terminal.  */
 731 unicode_val_T
 732 cp2u(int from, unsigned char c)
 733 {
 734         from &= ~SYSTEM_CHARSET_FLAG;
 735
 736         /* UTF-8 is a multibyte codepage and cannot be handled with
 737          * this function.  */
 738         assert(!is_cp_ptr_utf8(&codepages[from]));
 739         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 740
 741         if (c < 0x80) return c;
 742         else return cp2u_shared(&codepages[from], c);
 743 }
 744
 745 /* This slow and ugly code is used by the terminal utf_8_io */
 746 const unsigned char *
 747 cp2utf8(int from, int c)
 748 {
 749         from &= ~SYSTEM_CHARSET_FLAG;
 750
 751         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 752                 return strings[c];
 753
 754         return encode_utf8(cp2u_shared(&codepages[from], c));
 755 }
 756
 757 #ifdef CONFIG_UTF8
 758 unicode_val_T
 759 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 760 {
 761         unicode_val_T ret;
 762
 763         if (is_cp_utf8(codepage))
 764                 return utf8_to_unicode(string, end);
 765
 766         if (*string >= end)
 767                 return UCS_NO_CHAR;
 768
 769         ret = cp2u(codepage, **string);
 770         ++*string;
 771         return ret;
 772 }
 773 #endif  /* CONFIG_UTF8 */
 774
 775
 776 #ifdef CONFIG_COMBINE
 777 unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
 778 unicode_val_T **combined;
 779 struct hash *combined_hash;
 780
 781 unicode_val_T
 782 get_combined(unicode_val_T *data, int length)
 783 {
 784         struct hash_item *item;
 785         unicode_val_T *key;
 786         int i, indeks;
 787
 788         assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
 789         if_assert_failed return UCS_NO_CHAR;
 790
 791         if (!combined_hash) combined_hash = init_hash8();
 792         if (!combined_hash) return UCS_NO_CHAR;
 793         item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
 794
 795         if (item) return (unicode_val_T)(long)item->value;
 796         if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
 797
 798         key = mem_alloc((length + 1) * sizeof(*key));
 799         if (!key) return UCS_NO_CHAR;
 800         for (i = 0; i < length; i++)
 801                 key[i] = data[i];
 802         key[i] = UCS_END_COMBINED;
 803
 804         last_combined++;
 805         indeks = last_combined - UCS_BEGIN_COMBINED;
 806
 807         combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
 808         if (!combined) {
 809                 mem_free(key);
 810                 last_combined--;
 811                 return UCS_NO_CHAR;
 812         }
 813         combined[indeks] = key;
 814         item = add_hash_item(combined_hash, (unsigned char *)key,
 815                              length * sizeof(*data), (void *)(long)(last_combined));
 816         if (!item) {
 817                 last_combined--;
 818                 mem_free(key);
 819                 return UCS_NO_CHAR;
 820         }
 821         return last_combined;
 822 }
 823
 824 void
 825 free_combined()
 826 {
 827         int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
 828
 829         if (combined_hash)
 830                 free_hash(&combined_hash);
 831         for (i = 0; i < end; i++)
 832                 mem_free(combined[i]);
 833         mem_free_if(combined);
 834 }
 835 #endif /* CONFIG_COMBINE */
 836
 837
 838 static void
 839 add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
 840 {
 841         unsigned char *p = encode_utf8(u);
 842
 843         while (p[1]) {
 844                 if (ct[*p].t) ct = ct[*p].u.tbl;
 845                 else {
 846                         struct conv_table *nct;
 847
 848                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 849                         if_assert_failed return;
 850
 851                         nct = mem_calloc(256, sizeof(*nct));
 852                         if (!nct) return;
 853                         new_translation_table(nct);
 854                         ct[*p].t = 1;
 855                         ct[*p].u.tbl = nct;
 856                         ct = nct;
 857                 }
 858                 p++;
 859         }
 860
 861         assertm(!ct[*p].t, "bad utf encoding #2");
 862         if_assert_failed return;
 863
 864         if (ct[*p].u.str == no_str)
 865                 ct[*p].u.str = str;
 866 }
 867
/* A conversion table from some charset to UTF-8.
 * If it is from UTF-8 to UTF-8, it converts each byte separately.
 * Unlike in other translation tables, the strings in elements 0x80 to
 * 0xFF are allocated dynamically.  */
struct conv_table utf_table[256];
/* Nonzero until get_translation_table_to_utf8() has filled @utf_table
 * for the first time.  While it is set, the table holds no allocated
 * strings, so free_utf_table() is not called.  */
int utf_table_init = 1;
 874
 875 static void
 876 free_utf_table(void)
 877 {
 878         int i;
 879
 880         /* Cast away const.  */
 881         for (i = 128; i < 256; i++)
 882                 mem_free((unsigned char *) utf_table[i].u.str);
 883 }
 884
 885 static struct conv_table *
 886 get_translation_table_to_utf8(int from)
 887 {
 888         int i;
 889         static int lfr = -1;
 890
 891         if (from == -1) return NULL;
 892         from &= ~SYSTEM_CHARSET_FLAG;
 893         if (from == lfr) return utf_table;
 894         lfr = from;
 895         if (utf_table_init) {
 896                 memset(utf_table, 0, sizeof(utf_table));
 897                 utf_table_init = 0;
 898         } else
 899                 free_utf_table();
 900
 901         for (i = 0; i < 128; i++)
 902                 utf_table[i].u.str = strings[i];
 903
 904         if (is_cp_ptr_utf8(&codepages[from])) {
 905                 for (i = 128; i < 256; i++)
 906                         utf_table[i].u.str = stracpy(strings[i]);
 907                 return utf_table;
 908         }
 909
 910         for (i = 128; i < 256; i++) {
 911                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 912
 913                 if (u == 0xFFFF)
 914                         utf_table[i].u.str = NULL;
 915                 else
 916                         utf_table[i].u.str = stracpy(encode_utf8(u));
 917         }
 918
 919         for (i = 0; codepages[from].table[i].c; i++) {
 920                 unicode_val_T u = codepages[from].table[i].u;
 921
 922                 if (!utf_table[codepages[from].table[i].c].u.str)
 923                         utf_table[codepages[from].table[i].c].u.str =
 924                                 stracpy(encode_utf8(u));
 925         }
 926
 927         for (i = 128; i < 256; i++)
 928                 if (!utf_table[i].u.str)
 929                         utf_table[i].u.str = stracpy(no_str);
 930
 931         return utf_table;
 932 }
 933
 934 /* A conversion table between two charsets, where the target is not UTF-8.  */
 935 static struct conv_table table[256];
 936 static int first = 1;
 937
 938 void
 939 free_conv_table(void)
 940 {
 941         if (!utf_table_init) free_utf_table();
 942         if (first) {
 943                 memset(table, 0, sizeof(table));
 944                 first = 0;
 945         }
 946         new_translation_table(table);
 947 }
 948
 949
 950 struct conv_table *
 951 get_translation_table(int from, int to)
 952 {
 953         static int lfr = -1;
 954         static int lto = -1;
 955
 956         from &= ~SYSTEM_CHARSET_FLAG;
 957         to &= ~SYSTEM_CHARSET_FLAG;
 958         if (first) {
 959                 memset(table, 0, sizeof(table));
 960                 first = 0;
 961         }
 962         if (/*from == to ||*/ from == -1 || to == -1)
 963                 return NULL;
 964         if (is_cp_ptr_utf8(&codepages[to]))
 965                 return get_translation_table_to_utf8(from);
 966         if (from == lfr && to == lto)
 967                 return table;
 968         lfr = from;
 969         lto = to;
 970         new_translation_table(table);
 971
 972         if (is_cp_ptr_utf8(&codepages[from])) {
 973                 int i;
 974
 975                 /* Map U+00A0 and U+00AD the same way as u2cp() would.  */
 976                 add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
 977                 add_utf8(table, UCS_SOFT_HYPHEN, "");
 978
 979                 for (i = 0x80; i <= 0xFF; i++)
 980                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
 981                                 add_utf8(table,
 982                                          codepages[to].highhalf[i - 0x80],
 983                                          strings[i]);
 984
 985                 for (i = 0; codepages[to].table[i].c; i++)
 986                         add_utf8(table, codepages[to].table[i].u,
 987                                  strings[codepages[to].table[i].c]);
 988
 989                 for (i = 0; unicode_7b[i].x != -1; i++)
 990                         if (unicode_7b[i].x >= 0x80)
 991                                 add_utf8(table, unicode_7b[i].x,
 992                                          unicode_7b[i].s);
 993
 994         } else {
 995                 int i;
 996
 997                 for (i = 128; i < 256; i++) {
 998                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
 999                                 const unsigned char *u;
1000
1001                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
1002                                 if (u) table[i].u.str = u;
1003                         }
1004                 }
1005         }
1006
1007         return table;
1008 }
1009
/* Compare the @l2 bytes at @s1 with the NUL-terminated string @s2.
 * At the first differing byte the result is 1 if the byte of @s1 is
 * the greater one and -1 if it is the smaller one.  If all @l2 bytes
 * agree, the result is 0 when @s2 ends right there and -1 when @s2 is
 * longer.  */
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
        for (; l2; l2--, s1++, s2++) {
                if (*s1 != *s2)
                        return (*s1 > *s2) ? 1 : -1;
        }

        return (*s2 == '\0') ? 0 : -1;
}
1023
/* Change the "#if 0" below to "#if 1" to get entity cache debugging output. */
1025 #if 0
1026 #define DEBUG_ENTITY_CACHE
1027 #else
1028 #undef DEBUG_ENTITY_CACHE
1029 #endif
1030
/* One result remembered by get_entity_string().  */
struct entity_cache {
        /* How many times the entry has been used; starts at 1 when the
         * entry is stored.  hits_cmp() orders entries by this.  */
        unsigned int hits;
        /* Length of the entity text in @str, not counting the NUL.  */
        int strlen;
        /* The codepage for which @result was computed.  */
        int encoding;
        /* What get_entity_string() returned for this entity; NULL if
         * the entity could not be translated.  */
        const unsigned char *result;
        /* The entity text, NUL-terminated.  get_entity_string() only
         * stores entities shorter than this array.  */
        unsigned char str[20]; /* Suffice in any case. */
};
1038
1039 /* comparison function for qsort() */
1040 static int
1041 hits_cmp(const void *v1, const void *v2)
1042 {
1043         const struct entity_cache *a = v1, *b = v2;
1044
1045         if (a->hits == b->hits) return 0;
1046         if (a->hits > b->hits) return -1;
1047         else return 1;
1048 }
1049
1050 static int
1051 compare_entities(const void *key_, const void *element_)
1052 {
1053         struct string *key = (struct string *) key_;
1054         struct entity *element = (struct entity *) element_;
1055         int length = key->length;
1056         unsigned char *first = key->source;
1057         unsigned char *second = element->s;
1058
1059         return xxstrcmp(first, second, length);
1060 }
1061
1062 const unsigned char *
1063 get_entity_string(const unsigned char *str, const int strlen, int encoding)
1064 {
1065 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
1066 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
1067                                    will go in [0] table */
1068         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
1069         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
1070         static int first_time = 1;
1071         unsigned int slen = 0;
1072         const unsigned char *result = NULL;
1073
1074         if (strlen <= 0) return NULL;
1075
1076 #ifdef CONFIG_UTF8
1077         /* TODO: caching UTF-8 */
1078         encoding &= ~SYSTEM_CHARSET_FLAG;
1079         if (is_cp_ptr_utf8(&codepages[encoding]))
1080                 goto skip;
1081 #endif /* CONFIG_UTF8 */
1082
1083         if (first_time) {
1084                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
1085                 first_time = 0;
1086         }
1087
1088         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
1089          * + google + slashdot + websites that result from a search for test on google,
1090          * + various ones) show quite impressive improvment:
1091          * Top ten is:
1092          * 0: hits=2459 l=4 st='nbsp'
1093          * 1: hits=2152 l=6 st='eacute'
1094          * 2: hits=235 l=6 st='egrave'
1095          * 3: hits=136 l=6 st='agrave'
1096          * 4: hits=100 l=3 st='amp'
1097          * 5: hits=40 l=5 st='laquo'
1098          * 6: hits=8 l=4 st='copy'
1099          * 7: hits=5 l=2 st='gt'
1100          * 8: hits=2 l=2 st='lt'
1101          * 9: hits=1 l=6 st='middot'
1102          *
1103          * Most of the time cache hit ratio is near 95%.
1104          *
1105          * A long test shows: 15186 hits vs. 24 misses and mean iteration
1106          * count is kept < 2 (worst case 1.58). Not so bad ;)
1107          *
1108          * --Zas */
1109
1110         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
1111         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
1112
1113         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1114                 int i;
1115
1116                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1117                         if (entity_cache[slen][i].encoding == encoding
1118                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1119 #ifdef DEBUG_ENTITY_CACHE
1120                                 static double total_iter = 0;
1121                                 static unsigned long hit_count = 0;
1122
1123                                 total_iter += i + 1;
1124                                 hit_count++;
1125                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1126 #endif
1127                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1128                                         entity_cache[slen][i].hits++;
1129                                 return entity_cache[slen][i].result;
1130                         }
1131                 }
1132 #ifdef DEBUG_ENTITY_CACHE
1133                 fprintf(stderr, "miss\n");
1134 #endif
1135         }
1136 #ifdef CONFIG_UTF8
1137 skip:
1138 #endif /* CONFIG_UTF8 */
1139         if (*str == '#') { /* Numeric entity. */
1140                 int l = (int) strlen;
1141                 unsigned char *st = (unsigned char *) str;
1142                 unicode_val_T n = 0;
1143
1144                 if (l == 1) goto end; /* &#; ? */
1145                 st++, l--;
1146                 if ((*st | 32) == 'x') { /* Hexadecimal */
1147
1148                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1149                         st++, l--;
1150                         do {
1151                                 unsigned char c = (*(st++) | 32);
1152
1153                                 if (isdigit(c))
1154                                         n = (n << 4) | (c - '0');
1155                                 else if (isxdigit(c))
1156                                         n = (n << 4) | (c - 'a' + 10);
1157                                 else
1158                                         goto end; /* Bad char. */
1159                         } while (--l);
1160                 } else { /* Decimal */
1161                         if (l > 10) goto end; /* 4294967295 max. */
1162                         do {
1163                                 unsigned char c = *(st++);
1164
1165                                 if (isdigit(c))
1166                                         n = n * 10 + c - '0';
1167                                 else
1168                                         goto end; /* Bad char. */
1169                                 /* Limit to 0xFFFFFFFF. */
1170                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1171                                         goto end;
1172                         } while (--l);
1173                 }
1174
1175                 result = u2cp(n, encoding);
1176
1177 #ifdef DEBUG_ENTITY_CACHE
1178                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1179 #endif
1180         } else { /* Text entity. */
1181                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1182                 struct entity *element = bsearch((void *) &key, entities,
1183                                                  N_ENTITIES,
1184                                                  sizeof(*element),
1185                                                  compare_entities);
1186
1187                 if (element) result = u2cp(element->c, encoding);
1188         }
1189
1190 #ifdef CONFIG_UTF8
1191         if (is_cp_ptr_utf8(&codepages[encoding])) {
1192                 return result;
1193         }
1194 #endif /* CONFIG_UTF8 */
1195 end:
1196         /* Take care of potential buffer overflow. */
1197         if (strlen < sizeof(entity_cache[slen][0].str)) {
1198                 struct entity_cache *ece;
1199
1200                 /* Sort entries by hit order. */
1201                 if (nb_entity_cache[slen] > 1)
1202                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1203                               sizeof(entity_cache[slen][0]), hits_cmp);
1204
1205                 /* Increment number of cache entries if possible.
1206                  * Else, just replace the least used entry.  */
1207                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1208                 ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
1209
1210                 /* Copy new entry to cache. */
1211                 ece->hits = 1;
1212                 ece->strlen = strlen;
1213                 ece->encoding = encoding;
1214                 ece->result = result;
1215                 memcpy(ece->str, str, strlen);
1216                 ece->str[strlen] = '\0';
1217
1218
1219 #ifdef DEBUG_ENTITY_CACHE
1220                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1221                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1222
1223         {
1224                 unsigned int i;
1225
1226                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1227                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1228                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1229                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1230                                 entity_cache[slen][i].str);
1231                 fprintf(stderr, "-----------------\n");
1232         }
1233 #endif  /* DEBUG_ENTITY_CACHE */
1234         }
1235         return result;
1236 }
1237
/* Convert the @charslen bytes at @chars with @convert_table and,
 * depending on @mode, expand entity references for the codepage @cp.
 *
 * Without @callback, the result is returned as a newly allocated,
 * NUL-terminated string and its length is stored in *@length if
 * @length is not NULL.  With @callback, the output is instead passed
 * to @callback(@callback_data, buf, buflen) in chunks and the function
 * returns NULL; *@length then only receives the size of the last
 * chunk.  NULL is also returned if memory allocation fails.
 *
 * @convert_table may be NULL, in which case bytes are copied as they
 * are and only entities are processed.  */
unsigned char *
convert_string(struct conv_table *convert_table,
               unsigned char *chars, int charslen, int cp,
               enum convert_string_mode mode, int *length,
               void (*callback)(void *data, unsigned char *buf, int buflen),
               void *callback_data)
{
        unsigned char *buffer;
        int bufferpos = 0;
        int charspos = 0;

        /* Nothing to translate and no '&' that might start an entity:
         * pass the input through without looking at each byte.  */
        if (!convert_table && !memchr(chars, '&', charslen)) {
                if (callback) {
                        if (charslen) callback(callback_data, chars, charslen);
                        return NULL;
                } else {
                        return memacpy(chars, charslen);
                }
        }

        /* Buffer allocation */

        buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
        if (!buffer) return NULL;

        /* Iterate ;-) */

        while (charspos < charslen) {
                /* The replacement for the input consumed in this
                 * iteration.  */
                const unsigned char *translit;

/* Copy the byte at charspos to the output unchanged, then jump into
 * the output loop below to see whether the buffer has to be flushed
 * or enlarged.  */
#define PUTC do { \
                buffer[bufferpos++] = chars[charspos++]; \
                translit = ""; \
                goto flush; \
        } while (0)

                if (chars[charspos] != '&') {
                        struct conv_table *t;
                        int i;

                        /* Bytes below 128 are never looked up in the
                         * table.  */
                        if (chars[charspos] < 128 || !convert_table) PUTC;

                        t = convert_table;
                        i = charspos;

                        /* Follow the nested tables for as long as the
                         * entry is marked as a table.  If the input ends
                         * before a string entry is reached, the byte at
                         * charspos is copied unconverted.  */
                        while (t[chars[i]].t) {
                                t = t[chars[i++]].u.tbl;
                                if (i >= charslen) PUTC;
                        }

                        translit = t[chars[i]].u.str;
                        charspos = i + 1;

                } else if (mode == CSM_FORM || mode == CSM_NONE) {
                        /* These modes copy '&' literally; entities are
                         * not expanded.  */
                        PUTC;

                } else {
                        int start = charspos + 1;
                        int i = start;

                        /* Find the end of the candidate entity name.  */
                        while (i < charslen
                               && (isasciialpha(chars[i])
                                   || isdigit(chars[i])
                                   || (chars[i] == '#')))
                                i++;

                        /* NOTE(review): i may be equal to charslen here,
                         * so the tests below read chars[charslen].  This
                         * looks like it relies on the input being
                         * followed by a NUL or another readable byte --
                         * confirm against the callers.  */

                        /* This prevents bug 213: we were expanding "entities"
                         * in URL query strings. */
                        /* XXX: But this disables &nbsp&nbsp usage, which
                         * appears to be relatively common! --pasky */
                        if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
                            && i > start
                            && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
                                translit = get_entity_string(&chars[start], i - start,
                                                      cp);
                                if (chars[i] != ';') {
                                        /* Eat &nbsp &nbsp<foo> happily, but
                                         * pull back from the character after
                                         * entity string if it is not the valid
                                         * terminator. */
                                        i--;
                                }

                                /* An unknown entity is not expanded; its
                                 * '&' is copied and scanning resumes
                                 * right after it.  */
                                if (!translit) PUTC;
                                /* Continue after the last byte that was
                                 * consumed, without stepping beyond the
                                 * end of the input.  */
                                charspos = i + (i < charslen);
                        } else PUTC;
                }

                /* An empty replacement produces no output.  */
                if (!translit[0]) continue;

                /* A one-byte replacement is stored directly.  */
                if (!translit[1]) {
                        buffer[bufferpos++] = translit[0];
                        translit = "";
                        goto flush;
                }

                /* Longer replacements are copied byte by byte.  The
                 * "flush" label sits inside this loop so that the paths
                 * above, which have already stored their byte, share
                 * the buffer handling; they arrive with an empty
                 * @translit, so the loop ends right after it.  */
                while (*translit) {
                        unsigned char *new;

                        buffer[bufferpos++] = *(translit++);
flush:
                        /* Act only when bufferpos has reached a multiple
                         * of ALLOC_GR (the mask assumes ALLOC_GR is a
                         * power of two -- TODO confirm).  */
                        if (bufferpos & (ALLOC_GR - 1)) continue;

                        if (callback) {
                                /* Hand over the chunk and start filling
                                 * the buffer from the beginning.  */
                                buffer[bufferpos] = 0;
                                callback(callback_data, buffer, bufferpos);
                                bufferpos = 0;
                        } else {
                                /* Make room for another ALLOC_GR bytes.  */
                                new = mem_realloc(buffer, bufferpos + ALLOC_GR);
                                if (!new) {
                                        mem_free(buffer);
                                        return NULL;
                                }
                                buffer = new;
                        }
                }
#undef PUTC
        }

        /* Say bye */

        buffer[bufferpos] = 0;
        if (length) *length = bufferpos;

        if (callback) {
                /* Deliver what is left; the buffer itself is not
                 * returned in this mode.  */
                if (bufferpos) callback(callback_data, buffer, bufferpos);
                mem_free(buffer);
                return NULL;
        } else {
                return buffer;
        }
}
1370
1371
1372 #ifndef USE_FASTFIND
1373 int
1374 get_cp_index(const unsigned char *name)
1375 {
1376         int i, a;
1377         int syscp = 0;
1378
1379         if (!strcasecmp(name, "System")) {
1380 #if HAVE_LANGINFO_CODESET
1381                 name = nl_langinfo(CODESET);
1382                 syscp = SYSTEM_CHARSET_FLAG;
1383 #else
1384                 name = "us-ascii";
1385 #endif
1386         }
1387
1388         for (i = 0; codepages[i].name; i++) {
1389                 for (a = 0; codepages[i].aliases[a]; a++) {
1390                         /* In the past, we looked for the longest substring
1391                          * in all the names; it is way too expensive, though:
1392                          *
1393                          *   %   cumulative   self              self     total
1394                          *  time   seconds   seconds    calls  us/call  us/call  name
1395                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1396                          *
1397                          * Anything called from redraw_screen() is in fact
1398                          * relatively expensive, even if it's called just
1399                          * once. So we will do a simple strcasecmp() here.
1400                          */
1401
1402                         if (!strcasecmp(name, codepages[i].aliases[a]))
1403                                 return i | syscp;
1404                 }
1405         }
1406
1407         if (syscp) {
1408                 return get_cp_index("us-ascii") | syscp;
1409         } else {
1410                 return -1;
1411         }
1412 }
1413
1414 #else
1415
/* The position of the charsets_list_next() iteration: the index of the
 * current codepage in codepages[] and of the current alias within that
 * codepage.  */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the iteration to the first alias of the first codepage.  */
void
charsets_list_reset(void)
{
        i_alias = 0;
        i_name = 0;
}
1426
1427 /* Returns a pointer to a struct that contains current key and data pointers
1428  * and increment internal pointer.  It returns NULL when key is NULL. */
1429 struct fastfind_key_value *
1430 charsets_list_next(void)
1431 {
1432         static struct fastfind_key_value kv;
1433
1434         if (!codepages[i_name].name) return NULL;
1435
1436         kv.key = codepages[i_name].aliases[i_alias];
1437         kv.data = (void *) &codepages[i_name]; /* cast away const */
1438
1439         if (codepages[i_name].aliases[i_alias + 1])
1440                 i_alias++;
1441         else {
1442                 i_name++;
1443                 i_alias = 0;
1444         }
1445
1446         return &kv;
1447 }
1448
/* The fastfind index of charset names and aliases.  Its entries come
 * from charsets_list_reset() and charsets_list_next().  It is built by
 * init_charsets_lookup(), searched by get_cp_index() and released by
 * free_charsets_lookup().  */
static struct fastfind_index ff_charsets_index
        = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1451
1452 /* It searchs for a charset named @name or one of its aliases and
1453  * returns index for it or -1 if not found. */
1454 int
1455 get_cp_index(const unsigned char *name)
1456 {
1457         const struct codepage_desc *codepage;
1458         int syscp = 0;
1459
1460         if (!strcasecmp(name, "System")) {
1461 #if HAVE_LANGINFO_CODESET
1462                 name = nl_langinfo(CODESET);
1463                 syscp = SYSTEM_CHARSET_FLAG;
1464 #else
1465                 name = "us-ascii";
1466 #endif
1467         }
1468
1469         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1470         if (codepage) {
1471                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1472                 return (codepage - codepages) | syscp;
1473
1474         } else if (syscp) {
1475                 return get_cp_index("us-ascii") | syscp;
1476
1477         } else {
1478                 return -1;
1479         }
1480 }
1481
1482 #endif /* USE_FASTFIND */
1483
/* Build the fastfind index that get_cp_index() searches.  Does nothing
 * if ELinks is built without USE_FASTFIND.  */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1491
/* Release the fastfind index built by init_charsets_lookup().  Does
 * nothing if ELinks is built without USE_FASTFIND.  */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1499
1500 /* Get the codepage's name for displaying to the user, or NULL if
1501  * @cp_index is one past the end.  In the future, we might want to
1502  * localize these with gettext.  So it may be best not to use this
1503  * function if the name will have to be converted back to an
1504  * index.  */
1505 unsigned char *
1506 get_cp_name(int cp_index)
1507 {
1508         if (cp_index < 0) return "none";
1509         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1510
1511         return codepages[cp_index].name;
1512 }
1513
1514 /* Get the codepage's name for saving to a configuration file.  These
1515  * names can be converted back to indexes, even in future versions of
1516  * ELinks.  */
1517 unsigned char *
1518 get_cp_config_name(int cp_index)
1519 {
1520         if (cp_index < 0) return "none";
1521         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1522         if (!codepages[cp_index].aliases) return NULL;
1523
1524         return codepages[cp_index].aliases[0];
1525 }
1526
1527 /* Get the codepage's name for sending to a library or server that
1528  * understands MIME charset names.  This function irreversibly maps
1529  * the "System" codepage to the underlying charset.  */
1530 unsigned char *
1531 get_cp_mime_name(int cp_index)
1532 {
1533         if (cp_index < 0) return "none";
1534         cp_index &= ~SYSTEM_CHARSET_FLAG;
1535         if (!codepages[cp_index].aliases) return NULL;
1536
1537         return codepages[cp_index].aliases[0];
1538 }
1539
1540 int
1541 is_cp_utf8(int cp_index)
1542 {
1543         cp_index &= ~SYSTEM_CHARSET_FLAG;
1544         return is_cp_ptr_utf8(&codepages[cp_index]);
1545 }