src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #if HAVE_LANGINFO_CODESET
   8 #include <langinfo.h>
   9 #endif
  10
  11 #include <ctype.h>
  12 #include <stdlib.h>
  13 #if HAVE_WCTYPE_H
  14 #include <wctype.h>
  15 #endif
  16
  17 #include "elinks.h"
  18
  19 #include "document/options.h"
  20 #include "intl/charsets.h"
  21 #include "util/conv.h"
  22 #include "util/error.h"
  23 #include "util/fastfind.h"
  24 #include "util/memory.h"
  25 #include "util/string.h"
  26
  27
/* Fix namespace clash on MacOS: from here on the preprocessor renames
 * every identifier `table` in this file (variables and struct members
 * alike) to `table_elinks`. */
#define table table_elinks
  30
/* One row of a codepage table: codepage byte @c corresponds to Unicode
 * value @u.  The lookup loops in this file stop at the first row whose
 * @c is zero. */
struct table_entry {
        unsigned char c;        /* byte value in the codepage */
        unicode_val_T u;        /* Unicode value for that byte */
};
  35
/* Description of one codepage.  The functions in this file reach these
 * through the codepages[] array, indexed by codepage number. */
struct codepage_desc {
        unsigned char *name;            /* presumably the charset name -- not used in the code visible here */
        unsigned char **aliases;        /* presumably alternative names -- not used in the code visible here */
        struct table_entry *table;      /* rows terminated by an entry whose c is 0 */
};
  41
  42 #include "intl/codepage.inc"
  43 #include "intl/uni_7b.inc"
  44 #include "intl/entity.inc"
  45
  46
/* One-character, NUL-terminated strings indexed by byte value.  The
 * conversion functions return pointers into this table for single-byte
 * results.
 *
 * NOTE(review): entries 0x17 and 0x1f hold "\033" instead of "\027" and
 * "\037"; every other entry equals its own index.  Confirm whether this
 * remapping is deliberate before relying on strings[i][0] == i. */
static char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
  81
  82 static void
  83 free_translation_table(struct conv_table *p)
  84 {
  85         int i;
  86
  87         for (i = 0; i < 256; i++)
  88                 if (p[i].t)
  89                         free_translation_table(p[i].u.tbl);
  90
  91         mem_free(p);
  92 }
  93
/* Placeholder stored in translation tables for characters that have no
 * mapping.  The table code compares against it by address, not by
 * content. */
static unsigned char *no_str = "*";
  95
  96 static void
  97 new_translation_table(struct conv_table *p)
  98 {
  99         int i;
 100
 101         for (i = 0; i < 256; i++)
 102                 if (p[i].t)
 103                         free_translation_table(p[i].u.tbl);
 104         for (i = 0; i < 128; i++) {
 105                 p[i].t = 0;
 106                 p[i].u.str = strings[i];
 107         }
 108         for (; i < 256; i++) {
 109                 p[i].t = 0;
 110                 p[i].u.str = no_str;
 111         }
 112 }
 113
 114 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 115 {                                                                                       \
 116         long _s = 0, _e = (entries) - 1;                                                \
 117                                                                                         \
 118         while (_s <= _e || !((result) = -1)) {                                          \
 119                 long _m = (_s + _e) / 2;                                                \
 120                                                                                         \
 121                 if ((table)[_m].entry == (key)) {                                       \
 122                         (result) = _m;                                                  \
 123                         break;                                                          \
 124                 }                                                                       \
 125                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 126                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 127         }                                                                               \
 128 }                                                                                       \
 129
/* Replacement Unicode values for the range 0x80..0x9f, indexed by
 * (value - 0x80).  u2cp_() retries the conversion with the replacement;
 * a zero entry means there is none and u2cp_() returns NULL. */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 136
/* Flag bit that may be OR-ed into a codepage number (its meaning is set
 * by the callers and is not visible here).  The functions in this file
 * mask it off before indexing codepages[]. */
#define SYSTEM_CHARSET_FLAG 128
 138
 139 unsigned char *
 140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 141 {
 142         int j;
 143         int s;
 144
 145         if (u < 128) return strings[u];
 146
 147         to &= ~SYSTEM_CHARSET_FLAG;
 148
 149 #ifdef CONFIG_UTF_8
 150         if (codepages[to].table == table_utf_8)
 151                 return encode_utf_8(u);
 152 #endif /* CONFIG_UTF_8 */
 153
 154         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 155         if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
 156         if (u == 0xad) return "";
 157
 158         if (u < 0xa0) {
 159                 unicode_val_T strange = strange_chars[u - 0x80];
 160
 161                 if (!strange) return NULL;
 162                 return u2cp_(strange, to, no_nbsp_hack);
 163         }
 164
 165
 166         for (j = 0; codepages[to].table[j].c; j++)
 167                 if (codepages[to].table[j].u == u)
 168                         return strings[codepages[to].table[j].c];
 169
 170         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 171         if (s != -1) return unicode_7b[s].s;
 172
 173         return no_str;
 174 }
 175
 176 static unsigned char utf_buffer[7];
 177
 178 #ifdef CONFIG_UTF_8
 179 inline unsigned char *
 180 encode_utf_8(unicode_val_T u)
 181 #else
 182 static unsigned char *
 183 encode_utf_8(unicode_val_T u)
 184 #endif /* CONFIG_UTF_8 */
 185 {
 186         memset(utf_buffer, 0, 7);
 187
 188         if (u < 0x80)
 189                 utf_buffer[0] = u;
 190         else if (u < 0x800)
 191                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 192                 utf_buffer[1] = 0x80 | (u & 0x3f);
 193         else if (u < 0x10000)
 194                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 195                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 196                 utf_buffer[2] = 0x80 | (u & 0x3f);
 197         else if (u < 0x200000)
 198                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 199                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 200                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 201                 utf_buffer[3] = 0x80 | (u & 0x3f);
 202         else if (u < 0x4000000)
 203                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 204                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 205                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 206                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 207                 utf_buffer[4] = 0x80 | (u & 0x3f);
 208         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 209                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 210                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 211                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 212                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 213                 utf_buffer[5] = 0x80 | (u & 0x3f);
 214
 215         return utf_buffer;
 216 }
 217
 218 #ifdef CONFIG_UTF_8
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (0x80..0xbf, 0xfe and 0xff) are
 * listed with length 1; they need separate handling by the callers. */
static char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 231
 232 inline int utf8charlen(const unsigned char *p)
 233 {
 234         return p ? utf8char_len_tab[*p] : 0;
 235 }
 236
 237 inline int
 238 strlen_utf8(unsigned char **str)
 239 {
 240         unsigned char *s = *str;
 241         unsigned char *end = strchr(s, '\0');
 242         int x;
 243         int len;
 244
 245         for (x = 0;; x++, s += len) {
 246                 len = utf8charlen(s);
 247                 if (s + len > end) break;
 248         }
 249         *str = s;
 250         return x;
 251 }
 252
/* Nonzero when byte @p has its top bit clear, i.e. is a one-byte
 * character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* Nonzero when byte @p can begin a character: a one-byte character or a
 * byte with its two top bits set. */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 255
 256 /* Start from @current and move back to @pos char. This pointer return. The
 257  * most left pointer is @start. */
 258 inline unsigned char *
 259 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 260 {
 261         if (current == NULL || start == NULL || pos < 0)
 262                 return NULL;
 263         while (pos > 0 && current != start) {
 264                 current--;
 265                 if (utf8_islead(*current))
 266                         pos--;
 267         }
 268         return current;
 269 }
 270
 271 /* Count number of standard terminal cells needed for displaying UTF-8
 272  * character. */
 273 int
 274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 275 {
 276         unicode_val_T u;
 277
 278         if (end == NULL)
 279                 end = strchr(utf8_char, '\0');
 280
 281         if(!utf8_char || !end)
 282                 return -1;
 283
 284         u = utf_8_to_unicode(&utf8_char, end);
 285
 286         return unicode_to_cell(u);
 287 }
 288
 289 /* Count number of standard terminal cells needed for displaying string
 290  * with UTF-8 characters. */
 291 int
 292 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 293 {
 294         int charlen, cell, cells = 0;
 295
 296         if (end == NULL)
 297                 end = strchr(string, '\0');
 298
 299         if(!string || !end)
 300                 return -1;
 301
 302         do {
 303                 charlen = utf8charlen(string);
 304                 if (string + charlen > end)
 305                         break;
 306
 307                 cell = utf8_char2cells(string, end);
 308                 if  (cell < 0)
 309                         return -1;
 310
 311                 cells += cell;
 312                 string += charlen;
 313         } while (1);
 314
 315         return cells;
 316 }
 317
 318 /* Count number of characters in string. */
 319 int
 320 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 321 {
 322         int charlen, chars = 0;
 323
 324         if (end == NULL)
 325                 end = strchr(string, '\0');
 326
 327         if(!string || !end)
 328                 return -1;
 329
 330         do {
 331                 charlen = utf8charlen(string);
 332                 if (string + charlen > end)
 333                         break;
 334
 335                 chars++;
 336                 string += charlen;
 337         } while (1);
 338
 339         return chars;
 340 }
 341
 342 /*
 343  * Count number of bytes from begining of the string needed for displaying
 344  * specified number of cells.
 345  */
 346 int
 347 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 348 {
 349         unsigned int bytes = 0, cells = 0;
 350
 351         assert(max_cells>=0);
 352
 353         if (end == NULL)
 354                 end = strchr(string, '\0');
 355
 356         if(!string || !end)
 357                 return -1;
 358
 359         do {
 360                 int cell = utf8_char2cells(&string[bytes], end);
 361                 if (cell < 0)
 362                         return -1;
 363
 364                 cells += cell;
 365                 if (cells > max_cells)
 366                         break;
 367
 368                 bytes += utf8charlen(&string[bytes]);
 369
 370                 if (string + bytes > end) {
 371                         bytes = end - string;
 372                         break;
 373                 }
 374         } while(1);
 375
 376         return bytes;
 377 }
 378
 379 /* Take @max steps forward from @string in the specified @way, but
 380  * not going past @end.  Return the resulting address.  Store the
 381  * number of steps taken to *@count, unless @count is NULL.
 382  *
 383  * This assumes the text is valid UTF-8, and @string and @end point to
 384  * character boundaries.  If not, it doesn't crash but the results may
 385  * be inconsistent.
 386  *
 387  * This function can do some of the same jobs as utf8charlen(),
 388  * utf8_cells2bytes(), and strlen_utf8().  */
 389 unsigned char *
 390 utf8_step_forward(unsigned char *string, unsigned char *end,
 391                   int max, enum utf8_step way, int *count)
 392 {
 393         int steps = 0;
 394         unsigned char *current = string;
 395
 396         assert(string);
 397         assert(max >= 0);
 398         if_assert_failed goto invalid_arg;
 399         if (end == NULL)
 400                 end = strchr(string, '\0');
 401
 402         switch (way) {
 403         case utf8_step_characters:
 404                 while (steps < max && current < end) {
 405                         ++current;
 406                         if (utf8_islead(*current))
 407                                 ++steps;
 408                 }
 409                 break;
 410
 411         case utf8_step_cells_fewer:
 412         case utf8_step_cells_more:
 413                 while (steps < max) {
 414                         unicode_val_T u;
 415                         unsigned char *prev = current;
 416                         int width;
 417
 418                         u = utf_8_to_unicode(&current, end);
 419                         if (u == UCS_NO_CHAR) {
 420                                 /* Assume the incomplete sequence
 421                                  * costs one cell.  */
 422                                 current = end;
 423                                 ++steps;
 424                                 break;
 425                         }
 426
 427                         width = unicode_to_cell(u);
 428                         if (way == utf8_step_cells_fewer
 429                             && steps + width > max) {
 430                                 /* Back off.  */
 431                                 current = prev;
 432                                 break;
 433                         }
 434                         steps += width;
 435                 }
 436                 break;
 437
 438         default:
 439                 INTERNAL("impossible enum utf8_step");
 440         }
 441
 442 invalid_arg:
 443         if (count)
 444                 *count = steps;
 445         return current;
 446 }
 447
 448 /* Take @max steps backward from @string in the specified @way, but
 449  * not going past @start.  Return the resulting address.  Store the
 450  * number of steps taken to *@count, unless @count is NULL.
 451  *
 452  * This assumes the text is valid UTF-8, and @string and @start point
 453  * to character boundaries.  If not, it doesn't crash but the results
 454  * may be inconsistent.
 455  *
 456  * This function can do some of the same jobs as utf8_prevchar().  */
 457 unsigned char *
 458 utf8_step_backward(unsigned char *string, unsigned char *start,
 459                    int max, enum utf8_step way, int *count)
 460 {
 461         int steps = 0;
 462         unsigned char *current = string;
 463
 464         assert(string);
 465         assert(start);
 466         assert(max >= 0);
 467         if_assert_failed goto invalid_arg;
 468
 469         switch (way) {
 470         case utf8_step_characters:
 471                 while (steps < max && current > start) {
 472                         --current;
 473                         if (utf8_islead(*current))
 474                                 ++steps;
 475                 }
 476                 break;
 477
 478         case utf8_step_cells_fewer:
 479         case utf8_step_cells_more:
 480                 while (steps < max) {
 481                         unsigned char *prev = current;
 482                         unsigned char *look;
 483                         unicode_val_T u;
 484                         int width;
 485
 486                         if (current <= start)
 487                                 break;
 488                         do {
 489                                 --current;
 490                         } while (current > start && !utf8_islead(*current));
 491
 492                         look = current;
 493                         u = utf_8_to_unicode(&look, prev);
 494                         if (u == UCS_NO_CHAR) {
 495                                 /* Assume the incomplete sequence
 496                                  * costs one cell.  */
 497                                 width = 1;
 498                         } else
 499                                 width = unicode_to_cell(u);
 500
 501                         if (way == utf8_step_cells_fewer
 502                             && steps + width > max) {
 503                                 /* Back off.  */
 504                                 current = prev;
 505                                 break;
 506                         }
 507                         steps += width;
 508                 }
 509                 break;
 510
 511         default:
 512                 INTERNAL("impossible enum utf8_step");
 513         }
 514
 515 invalid_arg:
 516         if (count)
 517                 *count = steps;
 518         return current;
 519 }
 520
 521 /*
 522  * Find out number of standard terminal collumns needed for displaying symbol
 523  * (glyph) which represents Unicode character c.
 524  * TODO: Use wcwidth when it is available.
 525  *
 526  * @return      2 for double-width glyph, 1 for others.
 527  *              TODO: May be extended to return 0 for zero-width glyphs
 528  *              (like composing, maybe unprintable too).
 529  */
 530 inline int
 531 unicode_to_cell(unicode_val_T c)
 532 {
 533         if (c >= 0x1100
 534                 && (c <= 0x115f                 /* Hangul Jamo */
 535                 || c == 0x2329
 536                 || c == 0x232a
 537                 || (c >= 0x2e80 && c <= 0xa4cf
 538                         && c != 0x303f)         /* CJK ... Yi */
 539                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 540                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 541                                                                 Ideographs */
 542                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 543                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 544                 || (c >= 0xffe0 && c <= 0xffe6)
 545                 || (c >= 0x20000 && c <= 0x2fffd)
 546                 || (c >= 0x30000 && c <= 0x3fffd)))
 547                 return 2;
 548
 549         return 1;
 550 }
 551
 552 /* Fold the case of a Unicode character, so that hotkeys in labels can
 553  * be compared case-insensitively.  It is unspecified whether the
 554  * result will be in upper or lower case.  */
 555 unicode_val_T
 556 unicode_fold_label_case(unicode_val_T c)
 557 {
 558 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 559         return towlower(c);
 560 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 561         /* For now, this supports only ASCII.  It would be possible to
 562          * use code generated from CaseFolding.txt of Unicode if the
 563          * acknowledgements required by http://www.unicode.org/copyright.html
 564          * were added to associated documentation of ELinks.  */
 565         if (c >= 0x41 && c <= 0x5A)
 566                 return c + 0x20;
 567         else
 568                 return c;
 569 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 570 }
 571
 572 inline unicode_val_T
 573 utf_8_to_unicode(unsigned char **string, unsigned char *end)
 574 {
 575         unsigned char *str = *string;
 576         unicode_val_T u;
 577         int length;
 578
 579         length = utf8char_len_tab[str[0]];
 580
 581         if (str + length > end) {
 582                 return UCS_NO_CHAR;
 583         }
 584
 585         switch (length) {
 586                 case 1:
 587                         u = str[0];
 588                         break;
 589                 case 2:
 590                         u = (str[0] & 0x1f) << 6;
 591                         u += (str[1] & 0x3f);
 592                         break;
 593                 case 3:
 594                         u = (str[0] & 0x0f) << 12;
 595                         u += ((str[1] & 0x3f) << 6);
 596                         u += (str[2] & 0x3f);
 597                         break;
 598                 case 4:
 599                         u = (str[0] & 0x0f) << 18;
 600                         u += ((str[1] & 0x3f) << 12);
 601                         u += ((str[2] & 0x3f) << 6);
 602                         u += (str[3] & 0x3f);
 603                         break;
 604                 case 5:
 605                         u = (str[0] & 0x0f) << 24;
 606                         u += ((str[1] & 0x3f) << 18);
 607                         u += ((str[2] & 0x3f) << 12);
 608                         u += ((str[3] & 0x3f) << 6);
 609                         u += (str[4] & 0x3f);
 610                         break;
 611                 case 6:
 612                 default:
 613                         u = (str[0] & 0x01) << 30;
 614                         u += ((str[1] & 0x3f) << 24);
 615                         u += ((str[2] & 0x3f) << 18);
 616                         u += ((str[3] & 0x3f) << 12);
 617                         u += ((str[4] & 0x3f) << 6);
 618                         u += (str[5] & 0x3f);
 619                         break;
 620         }
 621         *string = str + length;
 622         return u;
 623 }
 624 #endif /* CONFIG_UTF_8 */
 625
 626 /* Slow algorithm, the common part of cp2u and cp2utf_8.  */
 627 static unicode_val_T
 628 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 629 {
 630         int j;
 631
 632         for (j = 0; from->table[j].c; j++)
 633                 if (from->table[j].c == c)
 634                         return from->table[j].u;
 635
 636         return UCS_REPLACEMENT_CHARACTER;
 637 }
 638
 639 /* Slow algorithm, used for converting input from the terminal.  */
 640 unicode_val_T
 641 cp2u(int from, unsigned char c)
 642 {
 643         from &= ~SYSTEM_CHARSET_FLAG;
 644
 645         /* UTF-8 is a multibyte codepage and cannot be handled with
 646          * this function.  */
 647         assert(codepages[from].table != table_utf_8);
 648         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 649
 650         if (c < 0x80) return c;
 651         else return cp2u_shared(&codepages[from], c);
 652 }
 653
 654 /* This slow and ugly code is used by the terminal utf_8_io */
 655 unsigned char *
 656 cp2utf_8(int from, int c)
 657 {
 658         from &= ~SYSTEM_CHARSET_FLAG;
 659
 660         if (codepages[from].table == table_utf_8 || c < 128)
 661                 return strings[c];
 662
 663         return encode_utf_8(cp2u_shared(&codepages[from], c));
 664 }
 665
 666 #ifdef CONFIG_UTF_8
 667 unicode_val_T
 668 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 669 {
 670         unicode_val_T ret;
 671
 672         if (is_cp_utf8(codepage))
 673                 return utf_8_to_unicode(string, end);
 674
 675         if (*string >= end)
 676                 return UCS_NO_CHAR;
 677
 678         ret = cp2u(codepage, **string);
 679         ++*string;
 680         return ret;
 681 }
 682 #endif  /* CONFIG_UTF_8 */
 683
 684
 685 static void
 686 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 687 {
 688         unsigned char *p = encode_utf_8(u);
 689
 690         while (p[1]) {
 691                 if (ct[*p].t) ct = ct[*p].u.tbl;
 692                 else {
 693                         struct conv_table *nct;
 694
 695                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 696                         if_assert_failed return;
 697
 698                         nct = mem_calloc(256, sizeof(*nct));
 699                         if (!nct) return;
 700                         new_translation_table(nct);
 701                         ct[*p].t = 1;
 702                         ct[*p].u.tbl = nct;
 703                         ct = nct;
 704                 }
 705                 p++;
 706         }
 707
 708         assertm(!ct[*p].t, "bad utf encoding #2");
 709         if_assert_failed return;
 710
 711         if (ct[*p].u.str == no_str)
 712                 ct[*p].u.str = str;
 713 }
 714
/* Conversion table to UTF-8, built by get_translation_table_to_utf_8()
 * and rebuilt whenever the source codepage changes. */
struct conv_table utf_table[256];
/* Nonzero until utf_table has been filled for the first time. */
int utf_table_init = 1;
 717
 718 static void
 719 free_utf_table(void)
 720 {
 721         int i;
 722
 723         for (i = 128; i < 256; i++)
 724                 mem_free(utf_table[i].u.str);
 725 }
 726
 727 static struct conv_table *
 728 get_translation_table_to_utf_8(int from)
 729 {
 730         int i;
 731         static int lfr = -1;
 732
 733         if (from == -1) return NULL;
 734         from &= ~SYSTEM_CHARSET_FLAG;
 735         if (from == lfr) return utf_table;
 736         lfr = from;
 737         if (utf_table_init)
 738                 memset(utf_table, 0, sizeof(utf_table)),
 739                 utf_table_init = 0;
 740         else
 741                 free_utf_table();
 742
 743         for (i = 0; i < 128; i++)
 744                 utf_table[i].u.str = strings[i];
 745
 746         if (codepages[from].table == table_utf_8) {
 747                 for (i = 128; i < 256; i++)
 748                         utf_table[i].u.str = stracpy(strings[i]);
 749                 return utf_table;
 750         }
 751
 752         for (i = 128; i < 256; i++)
 753                 utf_table[i].u.str = NULL;
 754
 755         for (i = 0; codepages[from].table[i].c; i++) {
 756                 unicode_val_T u = codepages[from].table[i].u;
 757
 758                 if (!utf_table[codepages[from].table[i].c].u.str)
 759                         utf_table[codepages[from].table[i].c].u.str =
 760                                 stracpy(encode_utf_8(u));
 761         }
 762
 763         for (i = 128; i < 256; i++)
 764                 if (!utf_table[i].u.str)
 765                         utf_table[i].u.str = stracpy(no_str);
 766
 767         return utf_table;
 768 }
 769
/* Conversion table filled in and returned by get_translation_table()
 * for targets other than UTF-8. */
struct conv_table table[256];
/* Nonzero until table has been zero-filled once. */
static int first = 1;
 772
 773 void
 774 free_conv_table(void)
 775 {
 776         if (!utf_table_init) free_utf_table();
 777         if (first) {
 778                 memset(table, 0, sizeof(table));
 779                 first = 0;
 780         }
 781         new_translation_table(table);
 782 }
 783
 784
 785 struct conv_table *
 786 get_translation_table(int from, int to)
 787 {
 788         static int lfr = -1;
 789         static int lto = -1;
 790
 791         from &= ~SYSTEM_CHARSET_FLAG;
 792         to &= ~SYSTEM_CHARSET_FLAG;
 793         if (first) {
 794                 memset(table, 0, sizeof(table));
 795                 first = 0;
 796         }
 797         if (/*from == to ||*/ from == -1 || to == -1)
 798                 return NULL;
 799         if (codepages[to].table == table_utf_8)
 800                 return get_translation_table_to_utf_8(from);
 801         if (from == lfr && to == lto)
 802                 return table;
 803         lfr = from;
 804         lto = to;
 805         new_translation_table(table);
 806
 807         if (codepages[from].table == table_utf_8) {
 808                 int i;
 809
 810                 for (i = 0; codepages[to].table[i].c; i++)
 811                         add_utf_8(table, codepages[to].table[i].u,
 812                                   strings[codepages[to].table[i].c]);
 813
 814                 for (i = 0; unicode_7b[i].x != -1; i++)
 815                         if (unicode_7b[i].x >= 0x80)
 816                                 add_utf_8(table, unicode_7b[i].x,
 817                                           unicode_7b[i].s);
 818
 819         } else {
 820                 int i;
 821
 822                 for (i = 128; i < 256; i++) {
 823                         int j;
 824
 825                         for (j = 0; codepages[from].table[j].c; j++) {
 826                                 if (codepages[from].table[j].c == i) {
 827                                         unsigned char *u;
 828
 829                                         u = u2cp(codepages[from].table[j].u, to);
 830                                         if (u) table[i].u.str = u;
 831                                         break;
 832                                 }
 833                         }
 834                 }
 835         }
 836
 837         return table;
 838 }
 839
 840 static inline int
 841 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 842 {
 843         while (l2) {
 844                 if (*s1 > *s2) return 1;
 845                 if (*s1 < *s2) return -1;
 846                 s1++;
 847                 s2++;
 848                 l2--;
 849         }
 850
 851         return *s2 ? -1 : 0;
 852 }
 853
/* Switch for entity cache debugging: change the `#if 0` below to
 * `#if 1` to define DEBUG_ENTITY_CACHE. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
 860
/* One slot of the entity cache kept by get_entity_string(). */
struct entity_cache {
        unsigned int hits;      /* ordering key used by hits_cmp(); presumably a use counter */
        int strlen;             /* presumably the length of str -- TODO confirm */
        int encoding;           /* presumably the codepage the result belongs to -- TODO confirm */
        unsigned char *result;  /* presumably the cached lookup result -- TODO confirm */
        unsigned char str[20]; /* Suffice in any case. */
};
 868
 869 static int
 870 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 871 {
 872         if (a->hits == b->hits) return 0;
 873         if (a->hits > b->hits) return -1;
 874         else return 1;
 875 }
 876
 877 static int
 878 compare_entities(const void *key_, const void *element_)
 879 {
 880         struct string *key = (struct string *) key_;
 881         struct entity *element = (struct entity *) element_;
 882         int length = key->length;
 883         unsigned char *first = key->source;
 884         unsigned char *second = element->s;
 885
 886         return xxstrcmp(first, second, length);
 887 }
 888
 889 unsigned char *
 890 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 891 {
 892 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 893 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 894                                    will go in [0] table */
 895         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 896         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 897         static int first_time = 1;
 898         unsigned int slen = 0;
 899         unsigned char *result = NULL;
 900
 901         if (strlen <= 0) return NULL;
 902
 903 #ifdef CONFIG_UTF_8
 904         /* TODO: caching UTF-8 */
 905         encoding &= ~SYSTEM_CHARSET_FLAG;
 906         if (codepages[encoding].table == table_utf_8)
 907                 goto skip;
 908 #endif /* CONFIG_UTF_8 */
 909
 910         if (first_time) {
 911                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 912                 first_time = 0;
 913         }
 914
 915         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 916          * + google + slashdot + websites that result from a search for test on google,
 917          * + various ones) show a quite impressive improvment:
 918          * Top ten is:
 919          * 0: hits=2459 l=4 st='nbsp'
 920          * 1: hits=2152 l=6 st='eacute'
 921          * 2: hits=235 l=6 st='egrave'
 922          * 3: hits=136 l=6 st='agrave'
 923          * 4: hits=100 l=3 st='amp'
 924          * 5: hits=40 l=5 st='laquo'
 925          * 6: hits=8 l=4 st='copy'
 926          * 7: hits=5 l=2 st='gt'
 927          * 8: hits=2 l=2 st='lt'
 928          * 9: hits=1 l=6 st='middot'
 929          *
 930          * Most of the time cache hit ratio is near 95%.
 931          *
 932          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 933          * count is kept < 2 (worst case 1.58). Not so bad ;)
 934          *
 935          * --Zas */
 936
 937         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 938         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 939
 940         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
 941                 int i;
 942
 943                 for (i = 0; i < nb_entity_cache[slen]; i++) {
 944                         if (entity_cache[slen][i].encoding == encoding
 945                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
 946 #ifdef DEBUG_ENTITY_CACHE
 947                                 static double total_iter = 0;
 948                                 static unsigned long hit_count = 0;
 949
 950                                 total_iter += i + 1;
 951                                 hit_count++;
 952                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
 953 #endif
 954                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
 955                                         entity_cache[slen][i].hits++;
 956                                 return entity_cache[slen][i].result;
 957                         }
 958                 }
 959 #ifdef DEBUG_ENTITY_CACHE
 960                 fprintf(stderr, "miss\n");
 961 #endif
 962         }
 963 #ifdef CONFIG_UTF_8
 964 skip:
 965 #endif /* CONFIG_UTF_8 */
 966         if (*str == '#') { /* Numeric entity. */
 967                 int l = (int) strlen;
 968                 unsigned char *st = (unsigned char *) str;
 969                 unicode_val_T n = 0;
 970
 971                 if (l == 1) goto end; /* &#; ? */
 972                 st++, l--;
 973                 if ((*st | 32) == 'x') { /* Hexadecimal */
 974
 975                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
 976                         st++, l--;
 977                         do {
 978                                 unsigned char c = (*(st++) | 32);
 979
 980                                 if (isdigit(c))
 981                                         n = (n << 4) | (c - '0');
 982                                 else if (isxdigit(c))
 983                                         n = (n << 4) | (c - 'a' + 10);
 984                                 else
 985                                         goto end; /* Bad char. */
 986                         } while (--l);
 987                 } else { /* Decimal */
 988                         if (l > 10) goto end; /* 4294967295 max. */
 989                         do {
 990                                 unsigned char c = *(st++);
 991
 992                                 if (isdigit(c))
 993                                         n = n * 10 + c - '0';
 994                                 else
 995                                         goto end; /* Bad char. */
 996                                 /* Limit to 0xFFFFFFFF. */
 997                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
 998                                         goto end;
 999                         } while (--l);
1000                 }
1001
1002                 result = u2cp(n, encoding);
1003
1004 #ifdef DEBUG_ENTITY_CACHE
1005                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1006 #endif
1007         } else { /* Text entity. */
1008                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1009                 struct entity *element = bsearch((void *) &key, entities,
1010                                                  N_ENTITIES,
1011                                                  sizeof(*element),
1012                                                  compare_entities);
1013
1014                 if (element) result = u2cp(element->c, encoding);
1015         }
1016
1017 #ifdef CONFIG_UTF_8
1018         if (codepages[encoding].table == table_utf_8) {
1019                 return result;
1020         }
1021 #endif /* CONFIG_UTF_8 */
1022 end:
1023         /* Take care of potential buffer overflow. */
1024         if (strlen < sizeof(entity_cache[slen][0].str)) {
1025                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1026
1027                 /* Copy new entry to cache. */
1028                 ece->hits = 1;
1029                 ece->strlen = strlen;
1030                 ece->encoding = encoding;
1031                 ece->result = result;
1032                 memcpy(ece->str, str, strlen);
1033                 ece->str[strlen] = '\0';
1034
1035                 /* Increment number of cache entries if possible. */
1036                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1037
1038 #ifdef DEBUG_ENTITY_CACHE
1039                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1040                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1041
1042 #endif
1043
1044                 /* Sort entries by hit order. */
1045                 if (nb_entity_cache[slen] > 1)
1046                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1047                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1048
1049 #ifdef DEBUG_ENTITY_CACHE
1050         {
1051                 unsigned int i;
1052
1053                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1054                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1055                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1056                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1057                                 entity_cache[slen][i].str);
1058                 fprintf(stderr, "-----------------\n");
1059         }
1060 #endif
1061         }
1062         return result;
1063 }
1064
1065 unsigned char *
1066 convert_string(struct conv_table *convert_table,
1067                unsigned char *chars, int charslen, int cp,
1068                enum convert_string_mode mode, int *length,
1069                void (*callback)(void *data, unsigned char *buf, int buflen),
1070                void *callback_data)
1071 {
1072         unsigned char *buffer;
1073         int bufferpos = 0;
1074         int charspos = 0;
1075
1076         if (!convert_table && !memchr(chars, '&', charslen)) {
1077                 if (callback) {
1078                         if (charslen) callback(callback_data, chars, charslen);
1079                         return NULL;
1080                 } else {
1081                         return memacpy(chars, charslen);
1082                 }
1083         }
1084
1085         /* Buffer allocation */
1086
1087         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1088         if (!buffer) return NULL;
1089
1090         /* Iterate ;-) */
1091
1092         while (charspos < charslen) {
1093                 unsigned char *translit;
1094
1095 #define PUTC do { \
1096                 buffer[bufferpos++] = chars[charspos++]; \
1097                 translit = ""; \
1098                 goto flush; \
1099         } while (0)
1100
1101                 if (chars[charspos] != '&') {
1102                         struct conv_table *t;
1103                         int i;
1104
1105                         if (chars[charspos] < 128 || !convert_table) PUTC;
1106
1107                         t = convert_table;
1108                         i = charspos;
1109
1110                         while (t[chars[i]].t) {
1111                                 t = t[chars[i++]].u.tbl;
1112                                 if (i >= charslen) PUTC;
1113                         }
1114
1115                         translit = t[chars[i]].u.str;
1116                         charspos = i + 1;
1117
1118                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1119                         PUTC;
1120
1121                 } else {
1122                         int start = charspos + 1;
1123                         int i = start;
1124
1125                         while (i < charslen
1126                                && (isasciialpha(chars[i])
1127                                    || isdigit(chars[i])
1128                                    || (chars[i] == '#')))
1129                                 i++;
1130
1131                         /* This prevents bug 213: we were expanding "entities"
1132                          * in URL query strings. */
1133                         /* XXX: But this disables &nbsp&nbsp usage, which
1134                          * appears to be relatively common! --pasky */
1135                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1136                             && i > start
1137                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1138                                 translit = get_entity_string(&chars[start], i - start,
1139                                                       cp);
1140                                 if (chars[i] != ';') {
1141                                         /* Eat &nbsp &nbsp<foo> happily, but
1142                                          * pull back from the character after
1143                                          * entity string if it is not the valid
1144                                          * terminator. */
1145                                         i--;
1146                                 }
1147
1148                                 if (!translit) PUTC;
1149                                 charspos = i + (i < charslen);
1150                         } else PUTC;
1151                 }
1152
1153                 if (!translit[0]) continue;
1154
1155                 if (!translit[1]) {
1156                         buffer[bufferpos++] = translit[0];
1157                         translit = "";
1158                         goto flush;
1159                 }
1160
1161                 while (*translit) {
1162                         unsigned char *new;
1163
1164                         buffer[bufferpos++] = *(translit++);
1165 flush:
1166                         if (bufferpos & (ALLOC_GR - 1)) continue;
1167
1168                         if (callback) {
1169                                 buffer[bufferpos] = 0;
1170                                 callback(callback_data, buffer, bufferpos);
1171                                 bufferpos = 0;
1172                         } else {
1173                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1174                                 if (!new) {
1175                                         mem_free(buffer);
1176                                         return NULL;
1177                                 }
1178                                 buffer = new;
1179                         }
1180                 }
1181 #undef PUTC
1182         }
1183
1184         /* Say bye */
1185
1186         buffer[bufferpos] = 0;
1187         if (length) *length = bufferpos;
1188
1189         if (callback) {
1190                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1191                 mem_free(buffer);
1192                 return NULL;
1193         } else {
1194                 return buffer;
1195         }
1196 }
1197
1198
1199 #ifndef USE_FASTFIND
1200 int
1201 get_cp_index(unsigned char *name)
1202 {
1203         int i, a;
1204         int syscp = 0;
1205
1206         if (!strcasecmp(name, "System")) {
1207 #if HAVE_LANGINFO_CODESET
1208                 name = nl_langinfo(CODESET);
1209                 syscp = SYSTEM_CHARSET_FLAG;
1210 #else
1211                 name = "us-ascii";
1212 #endif
1213         }
1214
1215         for (i = 0; codepages[i].name; i++) {
1216                 for (a = 0; codepages[i].aliases[a]; a++) {
1217                         /* In the past, we looked for the longest substring
1218                          * in all the names; it is way too expensive, though:
1219                          *
1220                          *   %   cumulative   self              self     total
1221                          *  time   seconds   seconds    calls  us/call  us/call  name
1222                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1223                          *
1224                          * Anything called from redraw_screen() is in fact
1225                          * relatively expensive, even if it's called just
1226                          * once. So we will do a simple strcasecmp() here.
1227                          */
1228
1229                         if (!strcasecmp(name, codepages[i].aliases[a]))
1230                                 return i | syscp;
1231                 }
1232         }
1233
1234         if (syscp) {
1235                 return get_cp_index("us-ascii") | syscp;
1236         } else {
1237                 return -1;
1238         }
1239 }
1240
1241 #else
1242
/* Cursor of the charset alias iterator, see charsets_list_next(). */
static unsigned int i_name = 0;         /* Current entry in codepages[]. */
static unsigned int i_alias = 0;        /* Current alias within that entry. */
1245
1246 /* Reset internal list pointer */
1247 void
1248 charsets_list_reset(void)
1249 {
1250         i_name = 0;
1251         i_alias = 0;
1252 }
1253
1254 /* Returns a pointer to a struct that contains current key and data pointers
1255  * and increment internal pointer.  It returns NULL when key is NULL. */
1256 struct fastfind_key_value *
1257 charsets_list_next(void)
1258 {
1259         static struct fastfind_key_value kv;
1260
1261         if (!codepages[i_name].name) return NULL;
1262
1263         kv.key = codepages[i_name].aliases[i_alias];
1264         kv.data = &codepages[i_name];
1265
1266         if (codepages[i_name].aliases[i_alias + 1])
1267                 i_alias++;
1268         else {
1269                 i_name++;
1270                 i_alias = 0;
1271         }
1272
1273         return &kv;
1274 }
1275
1276 static struct fastfind_index ff_charsets_index
1277         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1278
1279 /* It searchs for a charset named @name or one of its aliases and
1280  * returns index for it or -1 if not found. */
1281 int
1282 get_cp_index(unsigned char *name)
1283 {
1284         struct codepage_desc *codepage;
1285         int syscp = 0;
1286
1287         if (!strcasecmp(name, "System")) {
1288 #if HAVE_LANGINFO_CODESET
1289                 name = nl_langinfo(CODESET);
1290                 syscp = SYSTEM_CHARSET_FLAG;
1291 #else
1292                 name = "us-ascii";
1293 #endif
1294         }
1295
1296         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1297         if (codepage) {
1298                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1299                 return (codepage - codepages) | syscp;
1300
1301         } else if (syscp) {
1302                 return get_cp_index("us-ascii") | syscp;
1303
1304         } else {
1305                 return -1;
1306         }
1307 }
1308
1309 #endif /* USE_FASTFIND */
1310
/* Set up charset name lookup: with USE_FASTFIND this passes the charset
 * index to fastfind_index() with FF_COMPRESS; otherwise it does nothing. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1318
/* Tear down charset name lookup: with USE_FASTFIND this passes the charset
 * index to fastfind_done(); otherwise it does nothing. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1326
1327 unsigned char *
1328 get_cp_name(int cp_index)
1329 {
1330         if (cp_index < 0) return "none";
1331         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1332
1333         return codepages[cp_index].name;
1334 }
1335
1336 unsigned char *
1337 get_cp_mime_name(int cp_index)
1338 {
1339         if (cp_index < 0) return "none";
1340         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1341         if (!codepages[cp_index].aliases) return NULL;
1342
1343         return codepages[cp_index].aliases[0];
1344 }
1345
1346 int
1347 is_cp_utf8(int cp_index)
1348 {
1349         cp_index &= ~SYSTEM_CHARSET_FLAG;
1350         return codepages[cp_index].table == table_utf_8;
1351 }