src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #if HAVE_LANGINFO_CODESET
   8 #include <langinfo.h>
   9 #endif
  10
  11 #include <ctype.h>
  12 #include <stdlib.h>
  13 #if HAVE_WCTYPE_H
  14 #include <wctype.h>
  15 #endif
  16
  17 #include "elinks.h"
  18
  19 #include "document/options.h"
  20 #include "intl/charsets.h"
  21 #include "util/conv.h"
  22 #include "util/error.h"
  23 #include "util/fastfind.h"
  24 #include "util/memory.h"
  25 #include "util/string.h"
  26
  27
  28 /* Fix namespace clash on MacOS. */
  29 #define table table_elinks
  30
  31 struct table_entry {
  32         unsigned char c;
  33         /* This should in principle be unicode_val_T, but because all
  34          * the values currently in codepage.inc fit in 16 bits, we can
  35          * as well use uint16_t and halve sizeof(struct table_entry)
  36          * from 8 bytes to 4.  Should other characters ever be needed,
  37          * unicode_val_T u : 24 might be a possibility, although it
  38          * seems a little unportable as bitfields are in principle
  39          * restricted to int, which may be 16-bit.  */
  40         uint16_t u;
  41 };
  42
  43 struct codepage_desc {
  44         unsigned char *name;
  45         unsigned char *const *aliases;
  46         const struct table_entry *table;
  47 };
  48
  49 #include "intl/codepage.inc"
  50 #include "intl/uni_7b.inc"
  51 #include "intl/entity.inc"
  52
  53
  54 static char strings[256][2] = {
  55         "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
  56         "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
  57         "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
  58         "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
  59         "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
  60         "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
  61         "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
  62         "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
  63         "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
  64         "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
  65         "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
  66         "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
  67         "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
  68         "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
  69         "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
  70         "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
  71         "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
  72         "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
  73         "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
  74         "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
  75         "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
  76         "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
  77         "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
  78         "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
  79         "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
  80         "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
  81         "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
  82         "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
  83         "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
  84         "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
  85         "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
  86         "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
  87 };
  88
  89 static void
  90 free_translation_table(struct conv_table *p)
  91 {
  92         int i;
  93
  94         for (i = 0; i < 256; i++)
  95                 if (p[i].t)
  96                         free_translation_table(p[i].u.tbl);
  97
  98         mem_free(p);
  99 }
 100
 101 static unsigned char *no_str = "*";
 102
 103 static void
 104 new_translation_table(struct conv_table *p)
 105 {
 106         int i;
 107
 108         for (i = 0; i < 256; i++)
 109                 if (p[i].t)
 110                         free_translation_table(p[i].u.tbl);
 111         for (i = 0; i < 128; i++) {
 112                 p[i].t = 0;
 113                 p[i].u.str = strings[i];
 114         }
 115         for (; i < 256; i++) {
 116                 p[i].t = 0;
 117                 p[i].u.str = no_str;
 118         }
 119 }
 120
 121 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 122 {                                                                                       \
 123         long _s = 0, _e = (entries) - 1;                                                \
 124                                                                                         \
 125         while (_s <= _e || !((result) = -1)) {                                          \
 126                 long _m = (_s + _e) / 2;                                                \
 127                                                                                         \
 128                 if ((table)[_m].entry == (key)) {                                       \
 129                         (result) = _m;                                                  \
 130                         break;                                                          \
 131                 }                                                                       \
 132                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 133                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 134         }                                                                               \
 135 }                                                                                       \
 136
 137 static const unicode_val_T strange_chars[32] = {
 138 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
 139 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
 140 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
 141 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
 142 };
 143
 144 #define SYSTEM_CHARSET_FLAG 128
 145
 146 unsigned char *
 147 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 148 {
 149         int j;
 150         int s;
 151
 152         if (u < 128) return strings[u];
 153
 154         to &= ~SYSTEM_CHARSET_FLAG;
 155
 156 #ifdef CONFIG_UTF8
 157         if (codepages[to].table == table_utf8)
 158                 return encode_utf8(u);
 159 #endif /* CONFIG_UTF8 */
 160
 161         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 162         if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
 163         if (u == 0xad) return "";
 164
 165         if (u < 0xa0) {
 166                 unicode_val_T strange = strange_chars[u - 0x80];
 167
 168                 if (!strange) return NULL;
 169                 return u2cp_(strange, to, no_nbsp_hack);
 170         }
 171
 172
 173         for (j = 0; codepages[to].table[j].c; j++)
 174                 if (codepages[to].table[j].u == u)
 175                         return strings[codepages[to].table[j].c];
 176
 177         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 178         if (s != -1) return unicode_7b[s].s;
 179
 180         return no_str;
 181 }
 182
 183 static unsigned char utf_buffer[7];
 184
 185 #ifdef CONFIG_UTF8
 186 inline unsigned char *
 187 encode_utf8(unicode_val_T u)
 188 #else
 189 static unsigned char *
 190 encode_utf8(unicode_val_T u)
 191 #endif /* CONFIG_UTF8 */
 192 {
 193         memset(utf_buffer, 0, 7);
 194
 195         if (u < 0x80)
 196                 utf_buffer[0] = u;
 197         else if (u < 0x800)
 198                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 199                 utf_buffer[1] = 0x80 | (u & 0x3f);
 200         else if (u < 0x10000)
 201                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 202                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 203                 utf_buffer[2] = 0x80 | (u & 0x3f);
 204         else if (u < 0x200000)
 205                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 206                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 207                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 208                 utf_buffer[3] = 0x80 | (u & 0x3f);
 209         else if (u < 0x4000000)
 210                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 211                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 212                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 213                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 214                 utf_buffer[4] = 0x80 | (u & 0x3f);
 215         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 216                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 217                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 218                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 219                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 220                 utf_buffer[5] = 0x80 | (u & 0x3f);
 221
 222         return utf_buffer;
 223 }
 224
 225 #ifdef CONFIG_UTF8
 226 /* Number of bytes utf8 character indexed by first byte. Illegal bytes are
 227  * equal ones and handled different. */
 228 static char utf8char_len_tab[256] = {
 229         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 230         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 231         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 232         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 233         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 234         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 235         2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
 236         3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
 237 };
 238
 239 inline int utf8charlen(const unsigned char *p)
 240 {
 241         return p ? utf8char_len_tab[*p] : 0;
 242 }
 243
 244 inline int
 245 strlen_utf8(unsigned char **str)
 246 {
 247         unsigned char *s = *str;
 248         unsigned char *end = strchr(s, '\0');
 249         int x;
 250         int len;
 251
 252         for (x = 0;; x++, s += len) {
 253                 len = utf8charlen(s);
 254                 if (s + len > end) break;
 255         }
 256         *str = s;
 257         return x;
 258 }
 259
 260 #define utf8_issingle(p) (((p) & 0x80) == 0)
 261 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 262
 263 /* Start from @current and move back to @pos char. This pointer return. The
 264  * most left pointer is @start. */
 265 inline unsigned char *
 266 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 267 {
 268         if (current == NULL || start == NULL || pos < 0)
 269                 return NULL;
 270         while (pos > 0 && current != start) {
 271                 current--;
 272                 if (utf8_islead(*current))
 273                         pos--;
 274         }
 275         return current;
 276 }
 277
 278 /* Count number of standard terminal cells needed for displaying UTF-8
 279  * character. */
 280 int
 281 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 282 {
 283         unicode_val_T u;
 284
 285         if (end == NULL)
 286                 end = strchr(utf8_char, '\0');
 287
 288         if(!utf8_char || !end)
 289                 return -1;
 290
 291         u = utf8_to_unicode(&utf8_char, end);
 292
 293         return unicode_to_cell(u);
 294 }
 295
 296 /* Count number of standard terminal cells needed for displaying string
 297  * with UTF-8 characters. */
 298 int
 299 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 300 {
 301         int charlen, cell, cells = 0;
 302
 303         if (end == NULL)
 304                 end = strchr(string, '\0');
 305
 306         if(!string || !end)
 307                 return -1;
 308
 309         do {
 310                 charlen = utf8charlen(string);
 311                 if (string + charlen > end)
 312                         break;
 313
 314                 cell = utf8_char2cells(string, end);
 315                 if  (cell < 0)
 316                         return -1;
 317
 318                 cells += cell;
 319                 string += charlen;
 320         } while (1);
 321
 322         return cells;
 323 }
 324
 325 /* Count number of characters in string. */
 326 int
 327 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 328 {
 329         int charlen, chars = 0;
 330
 331         if (end == NULL)
 332                 end = strchr(string, '\0');
 333
 334         if(!string || !end)
 335                 return -1;
 336
 337         do {
 338                 charlen = utf8charlen(string);
 339                 if (string + charlen > end)
 340                         break;
 341
 342                 chars++;
 343                 string += charlen;
 344         } while (1);
 345
 346         return chars;
 347 }
 348
 349 /*
 350  * Count number of bytes from begining of the string needed for displaying
 351  * specified number of cells.
 352  */
 353 int
 354 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 355 {
 356         unsigned int bytes = 0, cells = 0;
 357
 358         assert(max_cells>=0);
 359
 360         if (end == NULL)
 361                 end = strchr(string, '\0');
 362
 363         if(!string || !end)
 364                 return -1;
 365
 366         do {
 367                 int cell = utf8_char2cells(&string[bytes], end);
 368                 if (cell < 0)
 369                         return -1;
 370
 371                 cells += cell;
 372                 if (cells > max_cells)
 373                         break;
 374
 375                 bytes += utf8charlen(&string[bytes]);
 376
 377                 if (string + bytes > end) {
 378                         bytes = end - string;
 379                         break;
 380                 }
 381         } while(1);
 382
 383         return bytes;
 384 }
 385
 386 /* Take @max steps forward from @string in the specified @way, but
 387  * not going past @end.  Return the resulting address.  Store the
 388  * number of steps taken to *@count, unless @count is NULL.
 389  *
 390  * This assumes the text is valid UTF-8, and @string and @end point to
 391  * character boundaries.  If not, it doesn't crash but the results may
 392  * be inconsistent.
 393  *
 394  * This function can do some of the same jobs as utf8charlen(),
 395  * utf8_cells2bytes(), and strlen_utf8().  */
 396 unsigned char *
 397 utf8_step_forward(unsigned char *string, unsigned char *end,
 398                   int max, enum utf8_step way, int *count)
 399 {
 400         int steps = 0;
 401         unsigned char *current = string;
 402
 403         assert(string);
 404         assert(max >= 0);
 405         if_assert_failed goto invalid_arg;
 406         if (end == NULL)
 407                 end = strchr(string, '\0');
 408
 409         switch (way) {
 410         case utf8_step_characters:
 411                 while (steps < max && current < end) {
 412                         ++current;
 413                         if (utf8_islead(*current))
 414                                 ++steps;
 415                 }
 416                 break;
 417
 418         case utf8_step_cells_fewer:
 419         case utf8_step_cells_more:
 420                 while (steps < max) {
 421                         unicode_val_T u;
 422                         unsigned char *prev = current;
 423                         int width;
 424
 425                         u = utf8_to_unicode(&current, end);
 426                         if (u == UCS_NO_CHAR) {
 427                                 /* Assume the incomplete sequence
 428                                  * costs one cell.  */
 429                                 current = end;
 430                                 ++steps;
 431                                 break;
 432                         }
 433
 434                         width = unicode_to_cell(u);
 435                         if (way == utf8_step_cells_fewer
 436                             && steps + width > max) {
 437                                 /* Back off.  */
 438                                 current = prev;
 439                                 break;
 440                         }
 441                         steps += width;
 442                 }
 443                 break;
 444
 445         default:
 446                 INTERNAL("impossible enum utf8_step");
 447         }
 448
 449 invalid_arg:
 450         if (count)
 451                 *count = steps;
 452         return current;
 453 }
 454
 455 /* Take @max steps backward from @string in the specified @way, but
 456  * not going past @start.  Return the resulting address.  Store the
 457  * number of steps taken to *@count, unless @count is NULL.
 458  *
 459  * This assumes the text is valid UTF-8, and @string and @start point
 460  * to character boundaries.  If not, it doesn't crash but the results
 461  * may be inconsistent.
 462  *
 463  * This function can do some of the same jobs as utf8_prevchar().  */
 464 unsigned char *
 465 utf8_step_backward(unsigned char *string, unsigned char *start,
 466                    int max, enum utf8_step way, int *count)
 467 {
 468         int steps = 0;
 469         unsigned char *current = string;
 470
 471         assert(string);
 472         assert(start);
 473         assert(max >= 0);
 474         if_assert_failed goto invalid_arg;
 475
 476         switch (way) {
 477         case utf8_step_characters:
 478                 while (steps < max && current > start) {
 479                         --current;
 480                         if (utf8_islead(*current))
 481                                 ++steps;
 482                 }
 483                 break;
 484
 485         case utf8_step_cells_fewer:
 486         case utf8_step_cells_more:
 487                 while (steps < max) {
 488                         unsigned char *prev = current;
 489                         unsigned char *look;
 490                         unicode_val_T u;
 491                         int width;
 492
 493                         if (current <= start)
 494                                 break;
 495                         do {
 496                                 --current;
 497                         } while (current > start && !utf8_islead(*current));
 498
 499                         look = current;
 500                         u = utf8_to_unicode(&look, prev);
 501                         if (u == UCS_NO_CHAR) {
 502                                 /* Assume the incomplete sequence
 503                                  * costs one cell.  */
 504                                 width = 1;
 505                         } else
 506                                 width = unicode_to_cell(u);
 507
 508                         if (way == utf8_step_cells_fewer
 509                             && steps + width > max) {
 510                                 /* Back off.  */
 511                                 current = prev;
 512                                 break;
 513                         }
 514                         steps += width;
 515                 }
 516                 break;
 517
 518         default:
 519                 INTERNAL("impossible enum utf8_step");
 520         }
 521
 522 invalid_arg:
 523         if (count)
 524                 *count = steps;
 525         return current;
 526 }
 527
 528 /*
 529  * Find out number of standard terminal collumns needed for displaying symbol
 530  * (glyph) which represents Unicode character c.
 531  * TODO: Use wcwidth when it is available.
 532  *
 533  * @return      2 for double-width glyph, 1 for others.
 534  *              TODO: May be extended to return 0 for zero-width glyphs
 535  *              (like composing, maybe unprintable too).
 536  */
 537 inline int
 538 unicode_to_cell(unicode_val_T c)
 539 {
 540         if (c >= 0x1100
 541                 && (c <= 0x115f                 /* Hangul Jamo */
 542                 || c == 0x2329
 543                 || c == 0x232a
 544                 || (c >= 0x2e80 && c <= 0xa4cf
 545                         && c != 0x303f)         /* CJK ... Yi */
 546                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 547                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 548                                                                 Ideographs */
 549                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 550                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 551                 || (c >= 0xffe0 && c <= 0xffe6)
 552                 || (c >= 0x20000 && c <= 0x2fffd)
 553                 || (c >= 0x30000 && c <= 0x3fffd)))
 554                 return 2;
 555
 556         return 1;
 557 }
 558
 559 /* Fold the case of a Unicode character, so that hotkeys in labels can
 560  * be compared case-insensitively.  It is unspecified whether the
 561  * result will be in upper or lower case.  */
 562 unicode_val_T
 563 unicode_fold_label_case(unicode_val_T c)
 564 {
 565 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 566         return towlower(c);
 567 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 568         /* For now, this supports only ASCII.  It would be possible to
 569          * use code generated from CaseFolding.txt of Unicode if the
 570          * acknowledgements required by http://www.unicode.org/copyright.html
 571          * were added to associated documentation of ELinks.  */
 572         if (c >= 0x41 && c <= 0x5A)
 573                 return c + 0x20;
 574         else
 575                 return c;
 576 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 577 }
 578
 579 inline unicode_val_T
 580 utf8_to_unicode(unsigned char **string, unsigned char *end)
 581 {
 582         unsigned char *str = *string;
 583         unicode_val_T u;
 584         int length;
 585
 586         length = utf8char_len_tab[str[0]];
 587
 588         if (str + length > end) {
 589                 return UCS_NO_CHAR;
 590         }
 591
 592         switch (length) {
 593                 case 1:
 594                         u = str[0];
 595                         break;
 596                 case 2:
 597                         u = (str[0] & 0x1f) << 6;
 598                         u += (str[1] & 0x3f);
 599                         break;
 600                 case 3:
 601                         u = (str[0] & 0x0f) << 12;
 602                         u += ((str[1] & 0x3f) << 6);
 603                         u += (str[2] & 0x3f);
 604                         break;
 605                 case 4:
 606                         u = (str[0] & 0x0f) << 18;
 607                         u += ((str[1] & 0x3f) << 12);
 608                         u += ((str[2] & 0x3f) << 6);
 609                         u += (str[3] & 0x3f);
 610                         break;
 611                 case 5:
 612                         u = (str[0] & 0x0f) << 24;
 613                         u += ((str[1] & 0x3f) << 18);
 614                         u += ((str[2] & 0x3f) << 12);
 615                         u += ((str[3] & 0x3f) << 6);
 616                         u += (str[4] & 0x3f);
 617                         break;
 618                 case 6:
 619                 default:
 620                         u = (str[0] & 0x01) << 30;
 621                         u += ((str[1] & 0x3f) << 24);
 622                         u += ((str[2] & 0x3f) << 18);
 623                         u += ((str[3] & 0x3f) << 12);
 624                         u += ((str[4] & 0x3f) << 6);
 625                         u += (str[5] & 0x3f);
 626                         break;
 627         }
 628         *string = str + length;
 629         return u;
 630 }
 631 #endif /* CONFIG_UTF8 */
 632
 633 /* Slow algorithm, the common part of cp2u and cp2utf8.  */
 634 static unicode_val_T
 635 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 636 {
 637         int j;
 638
 639         for (j = 0; from->table[j].c; j++)
 640                 if (from->table[j].c == c)
 641                         return from->table[j].u;
 642
 643         return UCS_REPLACEMENT_CHARACTER;
 644 }
 645
 646 /* Slow algorithm, used for converting input from the terminal.  */
 647 unicode_val_T
 648 cp2u(int from, unsigned char c)
 649 {
 650         from &= ~SYSTEM_CHARSET_FLAG;
 651
 652         /* UTF-8 is a multibyte codepage and cannot be handled with
 653          * this function.  */
 654         assert(codepages[from].table != table_utf8);
 655         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 656
 657         if (c < 0x80) return c;
 658         else return cp2u_shared(&codepages[from], c);
 659 }
 660
 661 /* This slow and ugly code is used by the terminal utf_8_io */
 662 unsigned char *
 663 cp2utf8(int from, int c)
 664 {
 665         from &= ~SYSTEM_CHARSET_FLAG;
 666
 667         if (codepages[from].table == table_utf8 || c < 128)
 668                 return strings[c];
 669
 670         return encode_utf8(cp2u_shared(&codepages[from], c));
 671 }
 672
 673 #ifdef CONFIG_UTF8
 674 unicode_val_T
 675 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 676 {
 677         unicode_val_T ret;
 678
 679         if (is_cp_utf8(codepage))
 680                 return utf8_to_unicode(string, end);
 681
 682         if (*string >= end)
 683                 return UCS_NO_CHAR;
 684
 685         ret = cp2u(codepage, **string);
 686         ++*string;
 687         return ret;
 688 }
 689 #endif  /* CONFIG_UTF8 */
 690
 691
 692 static void
 693 add_utf8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 694 {
 695         unsigned char *p = encode_utf8(u);
 696
 697         while (p[1]) {
 698                 if (ct[*p].t) ct = ct[*p].u.tbl;
 699                 else {
 700                         struct conv_table *nct;
 701
 702                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 703                         if_assert_failed return;
 704
 705                         nct = mem_calloc(256, sizeof(*nct));
 706                         if (!nct) return;
 707                         new_translation_table(nct);
 708                         ct[*p].t = 1;
 709                         ct[*p].u.tbl = nct;
 710                         ct = nct;
 711                 }
 712                 p++;
 713         }
 714
 715         assertm(!ct[*p].t, "bad utf encoding #2");
 716         if_assert_failed return;
 717
 718         if (ct[*p].u.str == no_str)
 719                 ct[*p].u.str = str;
 720 }
 721
 722 struct conv_table utf_table[256];
 723 int utf_table_init = 1;
 724
 725 static void
 726 free_utf_table(void)
 727 {
 728         int i;
 729
 730         for (i = 128; i < 256; i++)
 731                 mem_free(utf_table[i].u.str);
 732 }
 733
 734 static struct conv_table *
 735 get_translation_table_to_utf8(int from)
 736 {
 737         int i;
 738         static int lfr = -1;
 739
 740         if (from == -1) return NULL;
 741         from &= ~SYSTEM_CHARSET_FLAG;
 742         if (from == lfr) return utf_table;
 743         lfr = from;
 744         if (utf_table_init)
 745                 memset(utf_table, 0, sizeof(utf_table)),
 746                 utf_table_init = 0;
 747         else
 748                 free_utf_table();
 749
 750         for (i = 0; i < 128; i++)
 751                 utf_table[i].u.str = strings[i];
 752
 753         if (codepages[from].table == table_utf8) {
 754                 for (i = 128; i < 256; i++)
 755                         utf_table[i].u.str = stracpy(strings[i]);
 756                 return utf_table;
 757         }
 758
 759         for (i = 128; i < 256; i++)
 760                 utf_table[i].u.str = NULL;
 761
 762         for (i = 0; codepages[from].table[i].c; i++) {
 763                 unicode_val_T u = codepages[from].table[i].u;
 764
 765                 if (!utf_table[codepages[from].table[i].c].u.str)
 766                         utf_table[codepages[from].table[i].c].u.str =
 767                                 stracpy(encode_utf8(u));
 768         }
 769
 770         for (i = 128; i < 256; i++)
 771                 if (!utf_table[i].u.str)
 772                         utf_table[i].u.str = stracpy(no_str);
 773
 774         return utf_table;
 775 }
 776
 777 struct conv_table table[256];
 778 static int first = 1;
 779
 780 void
 781 free_conv_table(void)
 782 {
 783         if (!utf_table_init) free_utf_table();
 784         if (first) {
 785                 memset(table, 0, sizeof(table));
 786                 first = 0;
 787         }
 788         new_translation_table(table);
 789 }
 790
 791
 792 struct conv_table *
 793 get_translation_table(int from, int to)
 794 {
 795         static int lfr = -1;
 796         static int lto = -1;
 797
 798         from &= ~SYSTEM_CHARSET_FLAG;
 799         to &= ~SYSTEM_CHARSET_FLAG;
 800         if (first) {
 801                 memset(table, 0, sizeof(table));
 802                 first = 0;
 803         }
 804         if (/*from == to ||*/ from == -1 || to == -1)
 805                 return NULL;
 806         if (codepages[to].table == table_utf8)
 807                 return get_translation_table_to_utf8(from);
 808         if (from == lfr && to == lto)
 809                 return table;
 810         lfr = from;
 811         lto = to;
 812         new_translation_table(table);
 813
 814         if (codepages[from].table == table_utf8) {
 815                 int i;
 816
 817                 for (i = 0; codepages[to].table[i].c; i++)
 818                         add_utf8(table, codepages[to].table[i].u,
 819                                  strings[codepages[to].table[i].c]);
 820
 821                 for (i = 0; unicode_7b[i].x != -1; i++)
 822                         if (unicode_7b[i].x >= 0x80)
 823                                 add_utf8(table, unicode_7b[i].x,
 824                                          unicode_7b[i].s);
 825
 826         } else {
 827                 int i;
 828
 829                 for (i = 128; i < 256; i++) {
 830                         int j;
 831
 832                         for (j = 0; codepages[from].table[j].c; j++) {
 833                                 if (codepages[from].table[j].c == i) {
 834                                         unsigned char *u;
 835
 836                                         u = u2cp(codepages[from].table[j].u, to);
 837                                         if (u) table[i].u.str = u;
 838                                         break;
 839                                 }
 840                         }
 841                 }
 842         }
 843
 844         return table;
 845 }
 846
 847 static inline int
 848 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 849 {
 850         while (l2) {
 851                 if (*s1 > *s2) return 1;
 852                 if (*s1 < *s2) return -1;
 853                 s1++;
 854                 s2++;
 855                 l2--;
 856         }
 857
 858         return *s2 ? -1 : 0;
 859 }
 860
/* Entity cache debugging switch: change "#if 0" to "#if 1" to define
 * DEBUG_ENTITY_CACHE. */
 862 #if 0
 863 #define DEBUG_ENTITY_CACHE
 864 #else
 865 #undef DEBUG_ENTITY_CACHE
 866 #endif
 867
 868 struct entity_cache {
 869         unsigned int hits;
 870         int strlen;
 871         int encoding;
 872         unsigned char *result;
 873         unsigned char str[20]; /* Suffice in any case. */
 874 };
 875
 876 static int
 877 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 878 {
 879         if (a->hits == b->hits) return 0;
 880         if (a->hits > b->hits) return -1;
 881         else return 1;
 882 }
 883
 884 static int
 885 compare_entities(const void *key_, const void *element_)
 886 {
 887         struct string *key = (struct string *) key_;
 888         struct entity *element = (struct entity *) element_;
 889         int length = key->length;
 890         unsigned char *first = key->source;
 891         unsigned char *second = element->s;
 892
 893         return xxstrcmp(first, second, length);
 894 }
 895
 896 unsigned char *
 897 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 898 {
 899 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 900 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 901                                    will go in [0] table */
 902         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 903         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 904         static int first_time = 1;
 905         unsigned int slen = 0;
 906         unsigned char *result = NULL;
 907
 908         if (strlen <= 0) return NULL;
 909
 910 #ifdef CONFIG_UTF8
 911         /* TODO: caching UTF-8 */
 912         encoding &= ~SYSTEM_CHARSET_FLAG;
 913         if (codepages[encoding].table == table_utf8)
 914                 goto skip;
 915 #endif /* CONFIG_UTF8 */
 916
 917         if (first_time) {
 918                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 919                 first_time = 0;
 920         }
 921
 922         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 923          * + google + slashdot + websites that result from a search for test on google,
 924          * + various ones) show a quite impressive improvment:
 925          * Top ten is:
 926          * 0: hits=2459 l=4 st='nbsp'
 927          * 1: hits=2152 l=6 st='eacute'
 928          * 2: hits=235 l=6 st='egrave'
 929          * 3: hits=136 l=6 st='agrave'
 930          * 4: hits=100 l=3 st='amp'
 931          * 5: hits=40 l=5 st='laquo'
 932          * 6: hits=8 l=4 st='copy'
 933          * 7: hits=5 l=2 st='gt'
 934          * 8: hits=2 l=2 st='lt'
 935          * 9: hits=1 l=6 st='middot'
 936          *
 937          * Most of the time cache hit ratio is near 95%.
 938          *
 939          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 940          * count is kept < 2 (worst case 1.58). Not so bad ;)
 941          *
 942          * --Zas */
 943
 944         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 945         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 946
 947         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
 948                 int i;
 949
 950                 for (i = 0; i < nb_entity_cache[slen]; i++) {
 951                         if (entity_cache[slen][i].encoding == encoding
 952                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
 953 #ifdef DEBUG_ENTITY_CACHE
 954                                 static double total_iter = 0;
 955                                 static unsigned long hit_count = 0;
 956
 957                                 total_iter += i + 1;
 958                                 hit_count++;
 959                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
 960 #endif
 961                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
 962                                         entity_cache[slen][i].hits++;
 963                                 return entity_cache[slen][i].result;
 964                         }
 965                 }
 966 #ifdef DEBUG_ENTITY_CACHE
 967                 fprintf(stderr, "miss\n");
 968 #endif
 969         }
 970 #ifdef CONFIG_UTF8
 971 skip:
 972 #endif /* CONFIG_UTF8 */
 973         if (*str == '#') { /* Numeric entity. */
 974                 int l = (int) strlen;
 975                 unsigned char *st = (unsigned char *) str;
 976                 unicode_val_T n = 0;
 977
 978                 if (l == 1) goto end; /* &#; ? */
 979                 st++, l--;
 980                 if ((*st | 32) == 'x') { /* Hexadecimal */
 981
 982                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
 983                         st++, l--;
 984                         do {
 985                                 unsigned char c = (*(st++) | 32);
 986
 987                                 if (isdigit(c))
 988                                         n = (n << 4) | (c - '0');
 989                                 else if (isxdigit(c))
 990                                         n = (n << 4) | (c - 'a' + 10);
 991                                 else
 992                                         goto end; /* Bad char. */
 993                         } while (--l);
 994                 } else { /* Decimal */
 995                         if (l > 10) goto end; /* 4294967295 max. */
 996                         do {
 997                                 unsigned char c = *(st++);
 998
 999                                 if (isdigit(c))
1000                                         n = n * 10 + c - '0';
1001                                 else
1002                                         goto end; /* Bad char. */
1003                                 /* Limit to 0xFFFFFFFF. */
1004                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1005                                         goto end;
1006                         } while (--l);
1007                 }
1008
1009                 result = u2cp(n, encoding);
1010
1011 #ifdef DEBUG_ENTITY_CACHE
1012                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1013 #endif
1014         } else { /* Text entity. */
1015                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1016                 struct entity *element = bsearch((void *) &key, entities,
1017                                                  N_ENTITIES,
1018                                                  sizeof(*element),
1019                                                  compare_entities);
1020
1021                 if (element) result = u2cp(element->c, encoding);
1022         }
1023
1024 #ifdef CONFIG_UTF8
1025         if (codepages[encoding].table == table_utf8) {
1026                 return result;
1027         }
1028 #endif /* CONFIG_UTF8 */
1029 end:
1030         /* Take care of potential buffer overflow. */
1031         if (strlen < sizeof(entity_cache[slen][0].str)) {
1032                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1033
1034                 /* Copy new entry to cache. */
1035                 ece->hits = 1;
1036                 ece->strlen = strlen;
1037                 ece->encoding = encoding;
1038                 ece->result = result;
1039                 memcpy(ece->str, str, strlen);
1040                 ece->str[strlen] = '\0';
1041
1042                 /* Increment number of cache entries if possible. */
1043                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1044
1045 #ifdef DEBUG_ENTITY_CACHE
1046                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1047                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1048
1049 #endif
1050
1051                 /* Sort entries by hit order. */
1052                 if (nb_entity_cache[slen] > 1)
1053                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1054                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1055
1056 #ifdef DEBUG_ENTITY_CACHE
1057         {
1058                 unsigned int i;
1059
1060                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1061                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1062                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1063                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1064                                 entity_cache[slen][i].str);
1065                 fprintf(stderr, "-----------------\n");
1066         }
1067 #endif
1068         }
1069         return result;
1070 }
1071
1072 unsigned char *
1073 convert_string(struct conv_table *convert_table,
1074                unsigned char *chars, int charslen, int cp,
1075                enum convert_string_mode mode, int *length,
1076                void (*callback)(void *data, unsigned char *buf, int buflen),
1077                void *callback_data)
1078 {
1079         unsigned char *buffer;
1080         int bufferpos = 0;
1081         int charspos = 0;
1082
1083         if (!convert_table && !memchr(chars, '&', charslen)) {
1084                 if (callback) {
1085                         if (charslen) callback(callback_data, chars, charslen);
1086                         return NULL;
1087                 } else {
1088                         return memacpy(chars, charslen);
1089                 }
1090         }
1091
1092         /* Buffer allocation */
1093
1094         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1095         if (!buffer) return NULL;
1096
1097         /* Iterate ;-) */
1098
1099         while (charspos < charslen) {
1100                 unsigned char *translit;
1101
1102 #define PUTC do { \
1103                 buffer[bufferpos++] = chars[charspos++]; \
1104                 translit = ""; \
1105                 goto flush; \
1106         } while (0)
1107
1108                 if (chars[charspos] != '&') {
1109                         struct conv_table *t;
1110                         int i;
1111
1112                         if (chars[charspos] < 128 || !convert_table) PUTC;
1113
1114                         t = convert_table;
1115                         i = charspos;
1116
1117                         while (t[chars[i]].t) {
1118                                 t = t[chars[i++]].u.tbl;
1119                                 if (i >= charslen) PUTC;
1120                         }
1121
1122                         translit = t[chars[i]].u.str;
1123                         charspos = i + 1;
1124
1125                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1126                         PUTC;
1127
1128                 } else {
1129                         int start = charspos + 1;
1130                         int i = start;
1131
1132                         while (i < charslen
1133                                && (isasciialpha(chars[i])
1134                                    || isdigit(chars[i])
1135                                    || (chars[i] == '#')))
1136                                 i++;
1137
1138                         /* This prevents bug 213: we were expanding "entities"
1139                          * in URL query strings. */
1140                         /* XXX: But this disables &nbsp&nbsp usage, which
1141                          * appears to be relatively common! --pasky */
1142                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1143                             && i > start
1144                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1145                                 translit = get_entity_string(&chars[start], i - start,
1146                                                       cp);
1147                                 if (chars[i] != ';') {
1148                                         /* Eat &nbsp &nbsp<foo> happily, but
1149                                          * pull back from the character after
1150                                          * entity string if it is not the valid
1151                                          * terminator. */
1152                                         i--;
1153                                 }
1154
1155                                 if (!translit) PUTC;
1156                                 charspos = i + (i < charslen);
1157                         } else PUTC;
1158                 }
1159
1160                 if (!translit[0]) continue;
1161
1162                 if (!translit[1]) {
1163                         buffer[bufferpos++] = translit[0];
1164                         translit = "";
1165                         goto flush;
1166                 }
1167
1168                 while (*translit) {
1169                         unsigned char *new;
1170
1171                         buffer[bufferpos++] = *(translit++);
1172 flush:
1173                         if (bufferpos & (ALLOC_GR - 1)) continue;
1174
1175                         if (callback) {
1176                                 buffer[bufferpos] = 0;
1177                                 callback(callback_data, buffer, bufferpos);
1178                                 bufferpos = 0;
1179                         } else {
1180                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1181                                 if (!new) {
1182                                         mem_free(buffer);
1183                                         return NULL;
1184                                 }
1185                                 buffer = new;
1186                         }
1187                 }
1188 #undef PUTC
1189         }
1190
1191         /* Say bye */
1192
1193         buffer[bufferpos] = 0;
1194         if (length) *length = bufferpos;
1195
1196         if (callback) {
1197                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1198                 mem_free(buffer);
1199                 return NULL;
1200         } else {
1201                 return buffer;
1202         }
1203 }
1204
1205
1206 #ifndef USE_FASTFIND
1207 int
1208 get_cp_index(unsigned char *name)
1209 {
1210         int i, a;
1211         int syscp = 0;
1212
1213         if (!strcasecmp(name, "System")) {
1214 #if HAVE_LANGINFO_CODESET
1215                 name = nl_langinfo(CODESET);
1216                 syscp = SYSTEM_CHARSET_FLAG;
1217 #else
1218                 name = "us-ascii";
1219 #endif
1220         }
1221
1222         for (i = 0; codepages[i].name; i++) {
1223                 for (a = 0; codepages[i].aliases[a]; a++) {
1224                         /* In the past, we looked for the longest substring
1225                          * in all the names; it is way too expensive, though:
1226                          *
1227                          *   %   cumulative   self              self     total
1228                          *  time   seconds   seconds    calls  us/call  us/call  name
1229                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1230                          *
1231                          * Anything called from redraw_screen() is in fact
1232                          * relatively expensive, even if it's called just
1233                          * once. So we will do a simple strcasecmp() here.
1234                          */
1235
1236                         if (!strcasecmp(name, codepages[i].aliases[a]))
1237                                 return i | syscp;
1238                 }
1239         }
1240
1241         if (syscp) {
1242                 return get_cp_index("us-ascii") | syscp;
1243         } else {
1244                 return -1;
1245         }
1246 }
1247
1248 #else
1249
/* Cursor of the charset enumeration: the codepage currently visited and
 * the alias within that codepage. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
        i_name = i_alias = 0;
}
1260
1261 /* Returns a pointer to a struct that contains current key and data pointers
1262  * and increment internal pointer.  It returns NULL when key is NULL. */
1263 struct fastfind_key_value *
1264 charsets_list_next(void)
1265 {
1266         static struct fastfind_key_value kv;
1267
1268         if (!codepages[i_name].name) return NULL;
1269
1270         kv.key = codepages[i_name].aliases[i_alias];
1271         kv.data = (void *) &codepages[i_name]; /* cast away const */
1272
1273         if (codepages[i_name].aliases[i_alias + 1])
1274                 i_alias++;
1275         else {
1276                 i_name++;
1277                 i_alias = 0;
1278         }
1279
1280         return &kv;
1281 }
1282
1283 static struct fastfind_index ff_charsets_index
1284         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1285
1286 /* It searchs for a charset named @name or one of its aliases and
1287  * returns index for it or -1 if not found. */
1288 int
1289 get_cp_index(unsigned char *name)
1290 {
1291         const struct codepage_desc *codepage;
1292         int syscp = 0;
1293
1294         if (!strcasecmp(name, "System")) {
1295 #if HAVE_LANGINFO_CODESET
1296                 name = nl_langinfo(CODESET);
1297                 syscp = SYSTEM_CHARSET_FLAG;
1298 #else
1299                 name = "us-ascii";
1300 #endif
1301         }
1302
1303         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1304         if (codepage) {
1305                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1306                 return (codepage - codepages) | syscp;
1307
1308         } else if (syscp) {
1309                 return get_cp_index("us-ascii") | syscp;
1310
1311         } else {
1312                 return -1;
1313         }
1314 }
1315
1316 #endif /* USE_FASTFIND */
1317
/* Build the fastfind index used by get_cp_index().  Does nothing when
 * ELinks is compiled without USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#ifndef USE_FASTFIND
        /* Linear search needs no preparation. */
        return;
#else
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
1325
/* Release the fastfind index set up by init_charsets_lookup().  Does
 * nothing when ELinks is compiled without USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#ifndef USE_FASTFIND
        /* Nothing was allocated. */
        return;
#else
        fastfind_done(&ff_charsets_index);
#endif
}
1333
1334 unsigned char *
1335 get_cp_name(int cp_index)
1336 {
1337         if (cp_index < 0) return "none";
1338         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1339
1340         return codepages[cp_index].name;
1341 }
1342
1343 unsigned char *
1344 get_cp_mime_name(int cp_index)
1345 {
1346         if (cp_index < 0) return "none";
1347         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1348         if (!codepages[cp_index].aliases) return NULL;
1349
1350         return codepages[cp_index].aliases[0];
1351 }
1352
1353 int
1354 is_cp_utf8(int cp_index)
1355 {
1356         cp_index &= ~SYSTEM_CHARSET_FLAG;
1357         return codepages[cp_index].table == table_utf8;
1358 }