src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #if HAVE_LANGINFO_CODESET
   8 #include <langinfo.h>
   9 #endif
  10
  11 #include <ctype.h>
  12 #include <stdlib.h>
  13 #if HAVE_WCTYPE_H
  14 #include <wctype.h>
  15 #endif
  16
  17 #include "elinks.h"
  18
  19 #include "document/options.h"
  20 #include "intl/charsets.h"
  21 #include "util/conv.h"
  22 #include "util/error.h"
  23 #include "util/fastfind.h"
  24 #include "util/memory.h"
  25 #include "util/string.h"
  26
  27
  28 /* Fix namespace clash on MacOS. */
  29 #define table table_elinks
  30
  31 struct table_entry {
  32         unsigned char c;
  33         unicode_val_T u;
  34 };
  35
  36 struct codepage_desc {
  37         unsigned char *name;
  38         unsigned char **aliases;
  39         struct table_entry *table;
  40 };
  41
  42 #include "intl/codepage.inc"
  43 #include "intl/uni_7b.inc"
  44 #include "intl/entity.inc"
  45
  46
  47 static char strings[256][2] = {
  48         "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
  49         "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
  50         "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
  51         "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
  52         "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
  53         "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
  54         "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
  55         "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
  56         "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
  57         "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
  58         "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
  59         "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
  60         "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
  61         "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
  62         "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
  63         "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
  64         "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
  65         "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
  66         "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
  67         "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
  68         "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
  69         "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
  70         "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
  71         "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
  72         "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
  73         "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
  74         "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
  75         "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
  76         "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
  77         "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
  78         "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
  79         "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
  80 };
  81
  82 static void
  83 free_translation_table(struct conv_table *p)
  84 {
  85         int i;
  86
  87         for (i = 0; i < 256; i++)
  88                 if (p[i].t)
  89                         free_translation_table(p[i].u.tbl);
  90
  91         mem_free(p);
  92 }
  93
  94 static unsigned char *no_str = "*";
  95
  96 static void
  97 new_translation_table(struct conv_table *p)
  98 {
  99         int i;
 100
 101         for (i = 0; i < 256; i++)
 102                 if (p[i].t)
 103                         free_translation_table(p[i].u.tbl);
 104         for (i = 0; i < 128; i++) {
 105                 p[i].t = 0;
 106                 p[i].u.str = strings[i];
 107         }
 108         for (; i < 256; i++) {
 109                 p[i].t = 0;
 110                 p[i].u.str = no_str;
 111         }
 112 }
 113
 114 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 115 {                                                                                       \
 116         long _s = 0, _e = (entries) - 1;                                                \
 117                                                                                         \
 118         while (_s <= _e || !((result) = -1)) {                                          \
 119                 long _m = (_s + _e) / 2;                                                \
 120                                                                                         \
 121                 if ((table)[_m].entry == (key)) {                                       \
 122                         (result) = _m;                                                  \
 123                         break;                                                          \
 124                 }                                                                       \
 125                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 126                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 127         }                                                                               \
 128 }                                                                                       \
 129
 130 static const unicode_val_T strange_chars[32] = {
 131 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
 132 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
 133 0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
 134 0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
 135 };
 136
 137 #define SYSTEM_CHARSET_FLAG 128
 138
 139 unsigned char *
 140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 141 {
 142         int j;
 143         int s;
 144
 145         if (u < 128) return strings[u];
 146
 147         to &= ~SYSTEM_CHARSET_FLAG;
 148
 149 #ifdef CONFIG_UTF_8
 150         if (codepages[to].table == table_utf_8)
 151                 return encode_utf_8(u);
 152 #endif /* CONFIG_UTF_8 */
 153
 154         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 155         if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
 156         if (u == 0xad) return "";
 157
 158         if (u < 0xa0) {
 159                 unicode_val_T strange = strange_chars[u - 0x80];
 160
 161                 if (!strange) return NULL;
 162                 return u2cp_(strange, to, no_nbsp_hack);
 163         }
 164
 165
 166         for (j = 0; codepages[to].table[j].c; j++)
 167                 if (codepages[to].table[j].u == u)
 168                         return strings[codepages[to].table[j].c];
 169
 170         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 171         if (s != -1) return unicode_7b[s].s;
 172
 173         return no_str;
 174 }
 175
 176 static unsigned char utf_buffer[7];
 177
 178 #ifdef CONFIG_UTF_8
 179 inline unsigned char *
 180 encode_utf_8(unicode_val_T u)
 181 #else
 182 static unsigned char *
 183 encode_utf_8(unicode_val_T u)
 184 #endif /* CONFIG_UTF_8 */
 185 {
 186         memset(utf_buffer, 0, 7);
 187
 188         if (u < 0x80)
 189                 utf_buffer[0] = u;
 190         else if (u < 0x800)
 191                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 192                 utf_buffer[1] = 0x80 | (u & 0x3f);
 193         else if (u < 0x10000)
 194                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 195                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 196                 utf_buffer[2] = 0x80 | (u & 0x3f);
 197         else if (u < 0x200000)
 198                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 199                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 200                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 201                 utf_buffer[3] = 0x80 | (u & 0x3f);
 202         else if (u < 0x4000000)
 203                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 204                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 205                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 206                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 207                 utf_buffer[4] = 0x80 | (u & 0x3f);
 208         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 209                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 210                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 211                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 212                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 213                 utf_buffer[5] = 0x80 | (u & 0x3f);
 214
 215         return utf_buffer;
 216 }
 217
 218 #ifdef CONFIG_UTF_8
 219 /* Number of bytes utf8 character indexed by first byte. Illegal bytes are
 220  * equal ones and handled different. */
 221 static char utf8char_len_tab[256] = {
 222         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 223         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 224         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 225         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 226         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 227         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 228         2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
 229         3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
 230 };
 231
 232 inline int utf8charlen(const unsigned char *p)
 233 {
 234         return p ? utf8char_len_tab[*p] : 0;
 235 }
 236
 237 inline int
 238 strlen_utf8(unsigned char **str)
 239 {
 240         unsigned char *s = *str;
 241         unsigned char *end = strchr(s, '\0');
 242         int x;
 243         int len;
 244
 245         for (x = 0;; x++, s += len) {
 246                 len = utf8charlen(s);
 247                 if (s + len > end) break;
 248         }
 249         *str = s;
 250         return x;
 251 }
 252
 253 #define utf8_issingle(p) (((p) & 0x80) == 0)
 254 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 255
 256 /* Start from @current and move back to @pos char. This pointer return. The
 257  * most left pointer is @start. */
 258 inline unsigned char *
 259 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 260 {
 261         if (current == NULL || start == NULL || pos < 0)
 262                 return NULL;
 263         while (pos > 0 && current != start) {
 264                 current--;
 265                 if (utf8_islead(*current))
 266                         pos--;
 267         }
 268         return current;
 269 }
 270
 271 /* Count number of standard terminal cells needed for displaying UTF-8
 272  * character. */
 273 int
 274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 275 {
 276         unicode_val_T u;
 277
 278         if (end == NULL)
 279                 end = strchr(utf8_char, '\0');
 280
 281         if(!utf8_char || !end)
 282                 return -1;
 283
 284         u = utf_8_to_unicode(&utf8_char, end);
 285
 286         return unicode_to_cell(u);
 287 }
 288
 289 /* Count number of standard terminal cells needed for displaying string
 290  * with UTF-8 characters. */
 291 int
 292 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 293 {
 294         int charlen, cell, cells = 0;
 295
 296         if (end == NULL)
 297                 end = strchr(string, '\0');
 298
 299         if(!string || !end)
 300                 return -1;
 301
 302         do {
 303                 charlen = utf8charlen(string);
 304                 if (string + charlen > end)
 305                         break;
 306
 307                 cell = utf8_char2cells(string, end);
 308                 if  (cell < 0)
 309                         return -1;
 310
 311                 cells += cell;
 312                 string += charlen;
 313         } while (1);
 314
 315         return cells;
 316 }
 317
 318 /* Count number of characters in string. */
 319 int
 320 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 321 {
 322         int charlen, chars = 0;
 323
 324         if (end == NULL)
 325                 end = strchr(string, '\0');
 326
 327         if(!string || !end)
 328                 return -1;
 329
 330         do {
 331                 charlen = utf8charlen(string);
 332                 if (string + charlen > end)
 333                         break;
 334
 335                 chars++;
 336                 string += charlen;
 337         } while (1);
 338
 339         return chars;
 340 }
 341
 342 /*
 343  * Count number of bytes from begining of the string needed for displaying
 344  * specified number of cells.
 345  */
 346 int
 347 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 348 {
 349         unsigned int bytes = 0, cells = 0;
 350
 351         assert(max_cells>=0);
 352
 353         if (end == NULL)
 354                 end = strchr(string, '\0');
 355
 356         if(!string || !end)
 357                 return -1;
 358
 359         do {
 360                 int cell = utf8_char2cells(&string[bytes], end);
 361                 if (cell < 0)
 362                         return -1;
 363
 364                 cells += cell;
 365                 if (cells > max_cells)
 366                         break;
 367
 368                 bytes += utf8charlen(&string[bytes]);
 369
 370                 if (string + bytes > end) {
 371                         bytes = end - string;
 372                         break;
 373                 }
 374         } while(1);
 375
 376         return bytes;
 377 }
 378
 379 /*
 380  * Find out number of standard terminal collumns needed for displaying symbol
 381  * (glyph) which represents Unicode character c.
 382  * TODO: Use wcwidth when it is available.
 383  *
 384  * @return      2 for double-width glyph, 1 for others.
 385  *              TODO: May be extended to return 0 for zero-width glyphs
 386  *              (like composing, maybe unprintable too).
 387  */
 388 inline int
 389 unicode_to_cell(unicode_val_T c)
 390 {
 391         if (c >= 0x1100
 392                 && (c <= 0x115f                 /* Hangul Jamo */
 393                 || c == 0x2329
 394                 || c == 0x232a
 395                 || (c >= 0x2e80 && c <= 0xa4cf
 396                         && c != 0x303f)         /* CJK ... Yi */
 397                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 398                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 399                                                                 Ideographs */
 400                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 401                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 402                 || (c >= 0xffe0 && c <= 0xffe6)
 403                 || (c >= 0x20000 && c <= 0x2fffd)
 404                 || (c >= 0x30000 && c <= 0x3fffd)))
 405                 return 2;
 406
 407         return 1;
 408 }
 409
 410 /* Fold the case of a Unicode character, so that hotkeys in labels can
 411  * be compared case-insensitively.  This should be called only if
 412  * check_kbd_label_key(c) is true.  It is unspecified whether the
 413  * result will be in upper or lower case.  */
 414 unicode_val_T
 415 unicode_fold_label_case(unicode_val_T c)
 416 {
 417 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 418         return towlower(c);
 419 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 420         /* For now, this supports only ASCII.  It would be possible to
 421          * use code generated from CaseFolding.txt of Unicode if the
 422          * acknowledgements required by http://www.unicode.org/copyright.html
 423          * were added to associated documentation of ELinks.  */
 424         if (c >= 0x41 && c <= 0x5A)
 425                 return c + 0x20;
 426         else
 427                 return c;
 428 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 429 }
 430
 431 inline unicode_val_T
 432 utf_8_to_unicode(unsigned char **string, unsigned char *end)
 433 {
 434         unsigned char *str = *string;
 435         unicode_val_T u;
 436         int length;
 437
 438         length = utf8char_len_tab[str[0]];
 439
 440         if (str + length > end) {
 441                 return UCS_NO_CHAR;
 442         }
 443
 444         switch (length) {
 445                 case 1:
 446                         u = str[0];
 447                         break;
 448                 case 2:
 449                         u = (str[0] & 0x1f) << 6;
 450                         u += (str[1] & 0x3f);
 451                         break;
 452                 case 3:
 453                         u = (str[0] & 0x0f) << 12;
 454                         u += ((str[1] & 0x3f) << 6);
 455                         u += (str[2] & 0x3f);
 456                         break;
 457                 case 4:
 458                         u = (str[0] & 0x0f) << 18;
 459                         u += ((str[1] & 0x3f) << 12);
 460                         u += ((str[2] & 0x3f) << 6);
 461                         u += (str[3] & 0x3f);
 462                         break;
 463                 case 5:
 464                         u = (str[0] & 0x0f) << 24;
 465                         u += ((str[1] & 0x3f) << 18);
 466                         u += ((str[2] & 0x3f) << 12);
 467                         u += ((str[3] & 0x3f) << 6);
 468                         u += (str[4] & 0x3f);
 469                         break;
 470                 case 6:
 471                 default:
 472                         u = (str[0] & 0x01) << 30;
 473                         u += ((str[1] & 0x3f) << 24);
 474                         u += ((str[2] & 0x3f) << 18);
 475                         u += ((str[3] & 0x3f) << 12);
 476                         u += ((str[4] & 0x3f) << 6);
 477                         u += (str[5] & 0x3f);
 478                         break;
 479         }
 480         *string = str + length;
 481         return u;
 482 }
 483 #endif /* CONFIG_UTF_8 */
 484
 485 /* Slow algorithm, the common part of cp2u and cp2utf_8.  */
 486 static unicode_val_T
 487 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 488 {
 489         int j;
 490
 491         for (j = 0; from->table[j].c; j++)
 492                 if (from->table[j].c == c)
 493                         return from->table[j].u;
 494
 495         return UCS_NO_CHAR;
 496 }
 497
 498 /* Slow algorithm, used for converting input from the terminal.  */
 499 unicode_val_T
 500 cp2u(int from, unsigned char c)
 501 {
 502         from &= ~SYSTEM_CHARSET_FLAG;
 503
 504         /* UTF-8 is a multibyte codepage and cannot be handled with
 505          * this function.  */
 506         assert(codepages[from].table != table_utf_8);
 507         if_assert_failed return UCS_NO_CHAR;
 508
 509         if (c < 0x80) return c;
 510         else return cp2u_shared(&codepages[from], c);
 511 }
 512
 513 /* This slow and ugly code is used by the terminal utf_8_io */
 514 unsigned char *
 515 cp2utf_8(int from, int c)
 516 {
 517         from &= ~SYSTEM_CHARSET_FLAG;
 518
 519         if (codepages[from].table == table_utf_8 || c < 128)
 520                 return strings[c];
 521
 522         return encode_utf_8(cp2u_shared(&codepages[from], c));
 523 }
 524
 525 #ifdef CONFIG_UTF_8
 526 unicode_val_T
 527 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 528 {
 529         if (is_cp_utf8(codepage))
 530                 return utf_8_to_unicode(string, end);
 531         else {
 532                 if (*string >= end)
 533                         return UCS_NO_CHAR;
 534                 else {
 535                         unicode_val_T ret = cp2u(codepage, **string);
 536                         ++*string;
 537                         return ret;
 538                 }
 539         }
 540 }
 541 #endif  /* CONFIG_UTF_8 */
 542
 543
 544 static void
 545 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 546 {
 547         unsigned char *p = encode_utf_8(u);
 548
 549         while (p[1]) {
 550                 if (ct[*p].t) ct = ct[*p].u.tbl;
 551                 else {
 552                         struct conv_table *nct;
 553
 554                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 555                         if_assert_failed return;
 556
 557                         nct = mem_calloc(256, sizeof(*nct));
 558                         if (!nct) return;
 559                         new_translation_table(nct);
 560                         ct[*p].t = 1;
 561                         ct[*p].u.tbl = nct;
 562                         ct = nct;
 563                 }
 564                 p++;
 565         }
 566
 567         assertm(!ct[*p].t, "bad utf encoding #2");
 568         if_assert_failed return;
 569
 570         if (ct[*p].u.str == no_str)
 571                 ct[*p].u.str = str;
 572 }
 573
 574 struct conv_table utf_table[256];
 575 int utf_table_init = 1;
 576
 577 static void
 578 free_utf_table(void)
 579 {
 580         int i;
 581
 582         for (i = 128; i < 256; i++)
 583                 mem_free(utf_table[i].u.str);
 584 }
 585
 586 static struct conv_table *
 587 get_translation_table_to_utf_8(int from)
 588 {
 589         int i;
 590         static int lfr = -1;
 591
 592         if (from == -1) return NULL;
 593         from &= ~SYSTEM_CHARSET_FLAG;
 594         if (from == lfr) return utf_table;
 595         lfr = from;
 596         if (utf_table_init)
 597                 memset(utf_table, 0, sizeof(utf_table)),
 598                 utf_table_init = 0;
 599         else
 600                 free_utf_table();
 601
 602         for (i = 0; i < 128; i++)
 603                 utf_table[i].u.str = strings[i];
 604
 605         if (codepages[from].table == table_utf_8) {
 606                 for (i = 128; i < 256; i++)
 607                         utf_table[i].u.str = stracpy(strings[i]);
 608                 return utf_table;
 609         }
 610
 611         for (i = 128; i < 256; i++)
 612                 utf_table[i].u.str = NULL;
 613
 614         for (i = 0; codepages[from].table[i].c; i++) {
 615                 unicode_val_T u = codepages[from].table[i].u;
 616
 617                 if (!utf_table[codepages[from].table[i].c].u.str)
 618                         utf_table[codepages[from].table[i].c].u.str =
 619                                 stracpy(encode_utf_8(u));
 620         }
 621
 622         for (i = 128; i < 256; i++)
 623                 if (!utf_table[i].u.str)
 624                         utf_table[i].u.str = stracpy(no_str);
 625
 626         return utf_table;
 627 }
 628
 629 struct conv_table table[256];
 630 static int first = 1;
 631
 632 void
 633 free_conv_table(void)
 634 {
 635         if (!utf_table_init) free_utf_table();
 636         if (first) {
 637                 memset(table, 0, sizeof(table));
 638                 first = 0;
 639         }
 640         new_translation_table(table);
 641 }
 642
 643
 644 struct conv_table *
 645 get_translation_table(int from, int to)
 646 {
 647         static int lfr = -1;
 648         static int lto = -1;
 649
 650         from &= ~SYSTEM_CHARSET_FLAG;
 651         to &= ~SYSTEM_CHARSET_FLAG;
 652         if (first) {
 653                 memset(table, 0, sizeof(table));
 654                 first = 0;
 655         }
 656         if (/*from == to ||*/ from == -1 || to == -1)
 657                 return NULL;
 658         if (codepages[to].table == table_utf_8)
 659                 return get_translation_table_to_utf_8(from);
 660         if (from == lfr && to == lto)
 661                 return table;
 662         lfr = from;
 663         lto = to;
 664         new_translation_table(table);
 665
 666         if (codepages[from].table == table_utf_8) {
 667                 int i;
 668
 669                 for (i = 0; codepages[to].table[i].c; i++)
 670                         add_utf_8(table, codepages[to].table[i].u,
 671                                   strings[codepages[to].table[i].c]);
 672
 673                 for (i = 0; unicode_7b[i].x != -1; i++)
 674                         if (unicode_7b[i].x >= 0x80)
 675                                 add_utf_8(table, unicode_7b[i].x,
 676                                           unicode_7b[i].s);
 677
 678         } else {
 679                 int i;
 680
 681                 for (i = 128; i < 256; i++) {
 682                         int j;
 683
 684                         for (j = 0; codepages[from].table[j].c; j++) {
 685                                 if (codepages[from].table[j].c == i) {
 686                                         unsigned char *u;
 687
 688                                         u = u2cp(codepages[from].table[j].u, to);
 689                                         if (u) table[i].u.str = u;
 690                                         break;
 691                                 }
 692                         }
 693                 }
 694         }
 695
 696         return table;
 697 }
 698
 699 static inline int
 700 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 701 {
 702         while (l2) {
 703                 if (*s1 > *s2) return 1;
 704                 if (*s1 < *s2) return -1;
 705                 s1++;
 706                 s2++;
 707                 l2--;
 708         }
 709
 710         return *s2 ? -1 : 0;
 711 }
 712
 713 /* Entity cache debugging purpose. */
 714 #if 0
 715 #define DEBUG_ENTITY_CACHE
 716 #else
 717 #undef DEBUG_ENTITY_CACHE
 718 #endif
 719
 720 struct entity_cache {
 721         unsigned int hits;
 722         int strlen;
 723         int encoding;
 724         unsigned char *result;
 725         unsigned char str[20]; /* Suffice in any case. */
 726 };
 727
 728 static int
 729 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 730 {
 731         if (a->hits == b->hits) return 0;
 732         if (a->hits > b->hits) return -1;
 733         else return 1;
 734 }
 735
 736 static int
 737 compare_entities(const void *key_, const void *element_)
 738 {
 739         struct string *key = (struct string *) key_;
 740         struct entity *element = (struct entity *) element_;
 741         int length = key->length;
 742         unsigned char *first = key->source;
 743         unsigned char *second = element->s;
 744
 745         return xxstrcmp(first, second, length);
 746 }
 747
 748 unsigned char *
 749 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 750 {
 751 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 752 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 753                                    will go in [0] table */
 754         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 755         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 756         static int first_time = 1;
 757         unsigned int slen = 0;
 758         unsigned char *result = NULL;
 759
 760         if (strlen <= 0) return NULL;
 761
 762 #ifdef CONFIG_UTF_8
 763         /* TODO: caching UTF-8 */
 764         encoding &= ~SYSTEM_CHARSET_FLAG;
 765         if (codepages[encoding].table == table_utf_8)
 766                 goto skip;
 767 #endif /* CONFIG_UTF_8 */
 768
 769         if (first_time) {
 770                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 771                 first_time = 0;
 772         }
 773
 774         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 775          * + google + slashdot + websites that result from a search for test on google,
 776          * + various ones) show a quite impressive improvment:
 777          * Top ten is:
 778          * 0: hits=2459 l=4 st='nbsp'
 779          * 1: hits=2152 l=6 st='eacute'
 780          * 2: hits=235 l=6 st='egrave'
 781          * 3: hits=136 l=6 st='agrave'
 782          * 4: hits=100 l=3 st='amp'
 783          * 5: hits=40 l=5 st='laquo'
 784          * 6: hits=8 l=4 st='copy'
 785          * 7: hits=5 l=2 st='gt'
 786          * 8: hits=2 l=2 st='lt'
 787          * 9: hits=1 l=6 st='middot'
 788          *
 789          * Most of the time cache hit ratio is near 95%.
 790          *
 791          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 792          * count is kept < 2 (worst case 1.58). Not so bad ;)
 793          *
 794          * --Zas */
 795
 796         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 797         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 798
 799         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
 800                 int i;
 801
 802                 for (i = 0; i < nb_entity_cache[slen]; i++) {
 803                         if (entity_cache[slen][i].encoding == encoding
 804                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
 805 #ifdef DEBUG_ENTITY_CACHE
 806                                 static double total_iter = 0;
 807                                 static unsigned long hit_count = 0;
 808
 809                                 total_iter += i + 1;
 810                                 hit_count++;
 811                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
 812 #endif
 813                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
 814                                         entity_cache[slen][i].hits++;
 815                                 return entity_cache[slen][i].result;
 816                         }
 817                 }
 818 #ifdef DEBUG_ENTITY_CACHE
 819                 fprintf(stderr, "miss\n");
 820 #endif
 821         }
 822 #ifdef CONFIG_UTF_8
 823 skip:
 824 #endif /* CONFIG_UTF_8 */
 825         if (*str == '#') { /* Numeric entity. */
 826                 int l = (int) strlen;
 827                 unsigned char *st = (unsigned char *) str;
 828                 unicode_val_T n = 0;
 829
 830                 if (l == 1) goto end; /* &#; ? */
 831                 st++, l--;
 832                 if ((*st | 32) == 'x') { /* Hexadecimal */
 833
 834                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
 835                         st++, l--;
 836                         do {
 837                                 unsigned char c = (*(st++) | 32);
 838
 839                                 if (isdigit(c))
 840                                         n = (n << 4) | (c - '0');
 841                                 else if (isxdigit(c))
 842                                         n = (n << 4) | (c - 'a' + 10);
 843                                 else
 844                                         goto end; /* Bad char. */
 845                         } while (--l);
 846                 } else { /* Decimal */
 847                         if (l > 10) goto end; /* 4294967295 max. */
 848                         do {
 849                                 unsigned char c = *(st++);
 850
 851                                 if (isdigit(c))
 852                                         n = n * 10 + c - '0';
 853                                 else
 854                                         goto end; /* Bad char. */
 855                                 /* Limit to 0xFFFFFFFF. */
 856                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
 857                                         goto end;
 858                         } while (--l);
 859                 }
 860
 861                 result = u2cp(n, encoding);
 862
 863 #ifdef DEBUG_ENTITY_CACHE
 864                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
 865 #endif
 866         } else { /* Text entity. */
 867                 struct string key = INIT_STRING((unsigned char *) str, strlen);
 868                 struct entity *element = bsearch((void *) &key, entities,
 869                                                  N_ENTITIES,
 870                                                  sizeof(*element),
 871                                                  compare_entities);
 872
 873                 if (element) result = u2cp(element->c, encoding);
 874         }
 875
 876 #ifdef CONFIG_UTF_8
 877         if (codepages[encoding].table == table_utf_8) {
 878                 return result;
 879         }
 880 #endif /* CONFIG_UTF_8 */
 881 end:
 882         /* Take care of potential buffer overflow. */
 883         if (strlen < sizeof(entity_cache[slen][0].str)) {
 884                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
 885
 886                 /* Copy new entry to cache. */
 887                 ece->hits = 1;
 888                 ece->strlen = strlen;
 889                 ece->encoding = encoding;
 890                 ece->result = result;
 891                 memcpy(ece->str, str, strlen);
 892                 ece->str[strlen] = '\0';
 893
 894                 /* Increment number of cache entries if possible. */
 895                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
 896
 897 #ifdef DEBUG_ENTITY_CACHE
 898                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
 899                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
 900
 901 #endif
 902
 903                 /* Sort entries by hit order. */
 904                 if (nb_entity_cache[slen] > 1)
 905                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
 906                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
 907
 908 #ifdef DEBUG_ENTITY_CACHE
 909         {
 910                 unsigned int i;
 911
 912                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
 913                 for (i = 0; i < nb_entity_cache[slen] ; i++)
 914                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
 915                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
 916                                 entity_cache[slen][i].str);
 917                 fprintf(stderr, "-----------------\n");
 918         }
 919 #endif
 920         }
 921         return result;
 922 }
 923
 924 unsigned char *
 925 convert_string(struct conv_table *convert_table,
 926                unsigned char *chars, int charslen, int cp,
 927                enum convert_string_mode mode, int *length,
 928                void (*callback)(void *data, unsigned char *buf, int buflen),
 929                void *callback_data)
 930 {
 931         unsigned char *buffer;
 932         int bufferpos = 0;
 933         int charspos = 0;
 934
 935         if (!convert_table && !memchr(chars, '&', charslen)) {
 936                 if (callback) {
 937                         if (charslen) callback(callback_data, chars, charslen);
 938                         return NULL;
 939                 } else {
 940                         return memacpy(chars, charslen);
 941                 }
 942         }
 943
 944         /* Buffer allocation */
 945
 946         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
 947         if (!buffer) return NULL;
 948
 949         /* Iterate ;-) */
 950
 951         while (charspos < charslen) {
 952                 unsigned char *translit;
 953
 954 #define PUTC do { \
 955                 buffer[bufferpos++] = chars[charspos++]; \
 956                 translit = ""; \
 957                 goto flush; \
 958         } while (0)
 959
 960                 if (chars[charspos] != '&') {
 961                         struct conv_table *t;
 962                         int i;
 963
 964                         if (chars[charspos] < 128 || !convert_table) PUTC;
 965
 966                         t = convert_table;
 967                         i = charspos;
 968
 969                         while (t[chars[i]].t) {
 970                                 t = t[chars[i++]].u.tbl;
 971                                 if (i >= charslen) PUTC;
 972                         }
 973
 974                         translit = t[chars[i]].u.str;
 975                         charspos = i + 1;
 976
 977                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
 978                         PUTC;
 979
 980                 } else {
 981                         int start = charspos + 1;
 982                         int i = start;
 983
 984                         while (i < charslen
 985                                && (isasciialpha(chars[i])
 986                                    || isdigit(chars[i])
 987                                    || (chars[i] == '#')))
 988                                 i++;
 989
 990                         /* This prevents bug 213: we were expanding "entities"
 991                          * in URL query strings. */
 992                         /* XXX: But this disables &nbsp&nbsp usage, which
 993                          * appears to be relatively common! --pasky */
 994                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
 995                             && i > start
 996                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
 997                                 translit = get_entity_string(&chars[start], i - start,
 998                                                       cp);
 999                                 if (chars[i] != ';') {
1000                                         /* Eat &nbsp &nbsp<foo> happily, but
1001                                          * pull back from the character after
1002                                          * entity string if it is not the valid
1003                                          * terminator. */
1004                                         i--;
1005                                 }
1006
1007                                 if (!translit) PUTC;
1008                                 charspos = i + (i < charslen);
1009                         } else PUTC;
1010                 }
1011
1012                 if (!translit[0]) continue;
1013
1014                 if (!translit[1]) {
1015                         buffer[bufferpos++] = translit[0];
1016                         translit = "";
1017                         goto flush;
1018                 }
1019
1020                 while (*translit) {
1021                         unsigned char *new;
1022
1023                         buffer[bufferpos++] = *(translit++);
1024 flush:
1025                         if (bufferpos & (ALLOC_GR - 1)) continue;
1026
1027                         if (callback) {
1028                                 buffer[bufferpos] = 0;
1029                                 callback(callback_data, buffer, bufferpos);
1030                                 bufferpos = 0;
1031                         } else {
1032                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1033                                 if (!new) {
1034                                         mem_free(buffer);
1035                                         return NULL;
1036                                 }
1037                                 buffer = new;
1038                         }
1039                 }
1040 #undef PUTC
1041         }
1042
1043         /* Say bye */
1044
1045         buffer[bufferpos] = 0;
1046         if (length) *length = bufferpos;
1047
1048         if (callback) {
1049                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1050                 mem_free(buffer);
1051                 return NULL;
1052         } else {
1053                 return buffer;
1054         }
1055 }
1056
1057
1058 #ifndef USE_FASTFIND
1059 int
1060 get_cp_index(unsigned char *name)
1061 {
1062         int i, a;
1063         int syscp = 0;
1064
1065         if (!strcasecmp(name, "System")) {
1066 #if HAVE_LANGINFO_CODESET
1067                 name = nl_langinfo(CODESET);
1068                 syscp = SYSTEM_CHARSET_FLAG;
1069 #else
1070                 name = "us-ascii";
1071 #endif
1072         }
1073
1074         for (i = 0; codepages[i].name; i++) {
1075                 for (a = 0; codepages[i].aliases[a]; a++) {
1076                         /* In the past, we looked for the longest substring
1077                          * in all the names; it is way too expensive, though:
1078                          *
1079                          *   %   cumulative   self              self     total
1080                          *  time   seconds   seconds    calls  us/call  us/call  name
1081                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1082                          *
1083                          * Anything called from redraw_screen() is in fact
1084                          * relatively expensive, even if it's called just
1085                          * once. So we will do a simple strcasecmp() here.
1086                          */
1087
1088                         if (!strcasecmp(name, codepages[i].aliases[a]))
1089                                 return i | syscp;
1090                 }
1091         }
1092
1093         if (syscp) {
1094                 return get_cp_index("us-ascii") | syscp;
1095         } else {
1096                 return -1;
1097         }
1098 }
1099
1100 #else
1101
/* Iteration cursor shared by charsets_list_reset() and
 * charsets_list_next(): index into codepages[] and index into that
 * entry's alias list. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the iteration cursor to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
        i_alias = 0;
        i_name = 0;
}
1112
1113 /* Returns a pointer to a struct that contains current key and data pointers
1114  * and increment internal pointer.  It returns NULL when key is NULL. */
1115 struct fastfind_key_value *
1116 charsets_list_next(void)
1117 {
1118         static struct fastfind_key_value kv;
1119
1120         if (!codepages[i_name].name) return NULL;
1121
1122         kv.key = codepages[i_name].aliases[i_alias];
1123         kv.data = &codepages[i_name];
1124
1125         if (codepages[i_name].aliases[i_alias + 1])
1126                 i_alias++;
1127         else {
1128                 i_name++;
1129                 i_alias = 0;
1130         }
1131
1132         return &kv;
1133 }
1134
1135 static struct fastfind_index ff_charsets_index
1136         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1137
1138 /* It searchs for a charset named @name or one of its aliases and
1139  * returns index for it or -1 if not found. */
1140 int
1141 get_cp_index(unsigned char *name)
1142 {
1143         struct codepage_desc *codepage;
1144         int syscp = 0;
1145
1146         if (!strcasecmp(name, "System")) {
1147 #if HAVE_LANGINFO_CODESET
1148                 name = nl_langinfo(CODESET);
1149                 syscp = SYSTEM_CHARSET_FLAG;
1150 #else
1151                 name = "us-ascii";
1152 #endif
1153         }
1154
1155         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1156         if (codepage) {
1157                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1158                 return (codepage - codepages) | syscp;
1159
1160         } else if (syscp) {
1161                 return get_cp_index("us-ascii") | syscp;
1162
1163         } else {
1164                 return -1;
1165         }
1166 }
1167
1168 #endif /* USE_FASTFIND */
1169
/* Prepare the charset lookup.  With USE_FASTFIND this builds the
 * compressed (FF_COMPRESS) fastfind index over the charset aliases;
 * otherwise there is nothing to set up. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1177
/* Counterpart of init_charsets_lookup().  With USE_FASTFIND this hands
 * the charset index to fastfind_done(); otherwise it does nothing. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1185
1186 unsigned char *
1187 get_cp_name(int cp_index)
1188 {
1189         if (cp_index < 0) return "none";
1190         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1191
1192         return codepages[cp_index].name;
1193 }
1194
1195 unsigned char *
1196 get_cp_mime_name(int cp_index)
1197 {
1198         if (cp_index < 0) return "none";
1199         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1200         if (!codepages[cp_index].aliases) return NULL;
1201
1202         return codepages[cp_index].aliases[0];
1203 }
1204
1205 int
1206 is_cp_utf8(int cp_index)
1207 {
1208         cp_index &= ~SYSTEM_CHARSET_FLAG;
1209         return codepages[cp_index].table == table_utf_8;
1210 }