src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #if HAVE_LANGINFO_CODESET
   8 #include <langinfo.h>
   9 #endif
  10
  11 #include <ctype.h>
  12 #include <stdlib.h>
  13 #if HAVE_WCTYPE_H
  14 #include <wctype.h>
  15 #endif
  16
  17 #include "elinks.h"
  18
  19 #include "document/options.h"
  20 #include "intl/charsets.h"
  21 #include "util/conv.h"
  22 #include "util/error.h"
  23 #include "util/fastfind.h"
  24 #include "util/memory.h"
  25 #include "util/string.h"
  26
  27
/* Fix namespace clash on MacOS: from here on the preprocessor renames every
 * use of the identifier `table' in this file (struct members and the global
 * conversion table included) to `table_elinks'. */
#define table table_elinks
  30
/* One mapping of a codepage table: byte value @c of the codepage corresponds
 * to Unicode code point @u.  The lookup loops in this file stop at the first
 * entry whose @c is 0, so such an entry terminates a table. */
struct table_entry {
        unsigned char c;
        unicode_val_T u;
};
  35
/* Description of one codepage; the code below reaches these through the
 * codepages[] array (presumably provided by intl/codepage.inc -- it is not
 * defined in the visible part of this file). */
struct codepage_desc {
        unsigned char *name;            /* presumably the canonical charset name -- TODO confirm */
        unsigned char **aliases;        /* presumably alternative names -- TODO confirm layout */
        struct table_entry *table;      /* byte <-> Unicode mappings, ended by an entry with c == 0 */
};
  41
  42 #include "intl/codepage.inc"
  43 #include "intl/uni_7b.inc"
  44 #include "intl/entity.inc"
  45
  46
/* One-character, NUL-terminated strings indexed by byte value, so that a
 * single byte can be handed out as a string without allocating anything.
 * NOTE(review): entries 0x17 and 0x1f hold "\033" (ESC) instead of "\027"
 * and "\037".  Whether that is deliberate is not visible from this file --
 * confirm before "fixing" it. */
static char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
  81
  82 static void
  83 free_translation_table(struct conv_table *p)
  84 {
  85         int i;
  86
  87         for (i = 0; i < 256; i++)
  88                 if (p[i].t)
  89                         free_translation_table(p[i].u.tbl);
  90
  91         mem_free(p);
  92 }
  93
/* Placeholder string stored for characters that have no conversion.  It is
 * also compared by address (== no_str) to recognise table slots that have
 * not been given a real conversion yet. */
static unsigned char *no_str = "*";
  95
  96 static void
  97 new_translation_table(struct conv_table *p)
  98 {
  99         int i;
 100
 101         for (i = 0; i < 256; i++)
 102                 if (p[i].t)
 103                         free_translation_table(p[i].u.tbl);
 104         for (i = 0; i < 128; i++) {
 105                 p[i].t = 0;
 106                 p[i].u.str = strings[i];
 107         }
 108         for (; i < 256; i++) {
 109                 p[i].t = 0;
 110                 p[i].u.str = no_str;
 111         }
 112 }
 113
/* Binary search in @table, an array of @entries structures that must be
 * sorted in ascending order of the member named @entry.  On completion
 * @result holds the index of the element whose @entry equals @key, or -1
 * when there is none.
 *
 * The `|| !((result) = -1)' half of the loop condition is only evaluated
 * once the search range is empty: it stores -1 in @result and yields false,
 * which ends the loop.  The arguments are expanded several times, so they
 * must be free of side effects. */
#define BIN_SEARCH(table, entry, entries, key, result)                                  \
{                                                                                       \
        long _s = 0, _e = (entries) - 1;                                                \
                                                                                        \
        while (_s <= _e || !((result) = -1)) {                                          \
                long _m = (_s + _e) / 2;                                                \
                                                                                        \
                if ((table)[_m].entry == (key)) {                                       \
                        (result) = _m;                                                  \
                        break;                                                          \
                }                                                                       \
                if ((table)[_m].entry > (key)) _e = _m - 1;                             \
                if ((table)[_m].entry < (key)) _s = _m + 1;                             \
        }                                                                               \
}                                                                                       \
 129
/* Replacements for code points 0x80..0x9f, indexed by (u - 0x80) in u2cp_().
 * Each value is the Unicode character that is converted instead; 0 means
 * there is no replacement.  The values appear to follow the Windows-1252
 * assignments for that range, with ASCII stand-ins for some of them. */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 136
/* Bit that may be set in a codepage number; the functions in this file clear
 * it before using the number as an index into codepages[].
 * NOTE(review): presumably marks "the system charset" -- confirm against
 * the code that sets it. */
#define SYSTEM_CHARSET_FLAG 128
 138
 139 unsigned char *
 140 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 141 {
 142         int j;
 143         int s;
 144
 145         if (u < 128) return strings[u];
 146
 147         to &= ~SYSTEM_CHARSET_FLAG;
 148
 149 #ifdef CONFIG_UTF_8
 150         if (codepages[to].table == table_utf_8)
 151                 return encode_utf_8(u);
 152 #endif /* CONFIG_UTF_8 */
 153
 154         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 155         if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
 156         if (u == 0xad) return "";
 157
 158         if (u < 0xa0) {
 159                 unicode_val_T strange = strange_chars[u - 0x80];
 160
 161                 if (!strange) return NULL;
 162                 return u2cp_(strange, to, no_nbsp_hack);
 163         }
 164
 165
 166         for (j = 0; codepages[to].table[j].c; j++)
 167                 if (codepages[to].table[j].u == u)
 168                         return strings[codepages[to].table[j].c];
 169
 170         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 171         if (s != -1) return unicode_7b[s].s;
 172
 173         return no_str;
 174 }
 175
 176 static unsigned char utf_buffer[7];
 177
 178 #ifdef CONFIG_UTF_8
 179 inline unsigned char *
 180 encode_utf_8(unicode_val_T u)
 181 #else
 182 static unsigned char *
 183 encode_utf_8(unicode_val_T u)
 184 #endif /* CONFIG_UTF_8 */
 185 {
 186         memset(utf_buffer, 0, 7);
 187
 188         if (u < 0x80)
 189                 utf_buffer[0] = u;
 190         else if (u < 0x800)
 191                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 192                 utf_buffer[1] = 0x80 | (u & 0x3f);
 193         else if (u < 0x10000)
 194                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 195                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 196                 utf_buffer[2] = 0x80 | (u & 0x3f);
 197         else if (u < 0x200000)
 198                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 199                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 200                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 201                 utf_buffer[3] = 0x80 | (u & 0x3f);
 202         else if (u < 0x4000000)
 203                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 204                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 205                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 206                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 207                 utf_buffer[4] = 0x80 | (u & 0x3f);
 208         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 209                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 210                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 211                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 212                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 213                 utf_buffer[5] = 0x80 | (u & 0x3f);
 214
 215         return utf_buffer;
 216 }
 217
 218 #ifdef CONFIG_UTF_8
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.  Bytes
 * that cannot start a sequence (the continuation bytes 0x80..0xbf, and 0xfe
 * and 0xff) are given length 1 like ASCII, so code that cares about validity
 * has to detect them separately. */
static char utf8char_len_tab[256] = {
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
 231
 232 inline int utf8charlen(const unsigned char *p)
 233 {
 234         return p ? utf8char_len_tab[*p] : 0;
 235 }
 236
 237 inline int
 238 strlen_utf8(unsigned char **str)
 239 {
 240         unsigned char *s = *str;
 241         unsigned char *end = strchr(s, '\0');
 242         int x;
 243         int len;
 244
 245         for (x = 0;; x++, s += len) {
 246                 len = utf8charlen(s);
 247                 if (s + len > end) break;
 248         }
 249         *str = s;
 250         return x;
 251 }
 252
 253 #define utf8_issingle(p) (((p) & 0x80) == 0)
 254 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 255
 256 /* Start from @current and move back to @pos char. This pointer return. The
 257  * most left pointer is @start. */
 258 inline unsigned char *
 259 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 260 {
 261         if (current == NULL || start == NULL || pos < 0)
 262                 return NULL;
 263         while (pos > 0 && current != start) {
 264                 current--;
 265                 if (utf8_islead(*current))
 266                         pos--;
 267         }
 268         return current;
 269 }
 270
 271 /* Count number of standard terminal cells needed for displaying UTF-8
 272  * character. */
 273 int
 274 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 275 {
 276         unicode_val_T u;
 277
 278         if (end == NULL)
 279                 end = strchr(utf8_char, '\0');
 280
 281         if(!utf8_char || !end)
 282                 return -1;
 283
 284         u = utf_8_to_unicode(&utf8_char, end);
 285
 286         return unicode_to_cell(u);
 287 }
 288
 289 /* Count number of standard terminal cells needed for displaying string
 290  * with UTF-8 characters. */
 291 int
 292 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 293 {
 294         int charlen, cell, cells = 0;
 295
 296         if (end == NULL)
 297                 end = strchr(string, '\0');
 298
 299         if(!string || !end)
 300                 return -1;
 301
 302         do {
 303                 charlen = utf8charlen(string);
 304                 if (string + charlen > end)
 305                         break;
 306
 307                 cell = utf8_char2cells(string, end);
 308                 if  (cell < 0)
 309                         return -1;
 310
 311                 cells += cell;
 312                 string += charlen;
 313         } while (1);
 314
 315         return cells;
 316 }
 317
 318 /* Count number of characters in string. */
 319 int
 320 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 321 {
 322         int charlen, chars = 0;
 323
 324         if (end == NULL)
 325                 end = strchr(string, '\0');
 326
 327         if(!string || !end)
 328                 return -1;
 329
 330         do {
 331                 charlen = utf8charlen(string);
 332                 if (string + charlen > end)
 333                         break;
 334
 335                 chars++;
 336                 string += charlen;
 337         } while (1);
 338
 339         return chars;
 340 }
 341
 342 /*
 343  * Count number of bytes from begining of the string needed for displaying
 344  * specified number of cells.
 345  */
 346 int
 347 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 348 {
 349         unsigned int bytes = 0, cells = 0;
 350
 351         assert(max_cells>=0);
 352
 353         if (end == NULL)
 354                 end = strchr(string, '\0');
 355
 356         if(!string || !end)
 357                 return -1;
 358
 359         do {
 360                 int cell = utf8_char2cells(&string[bytes], end);
 361                 if (cell < 0)
 362                         return -1;
 363
 364                 cells += cell;
 365                 if (cells > max_cells)
 366                         break;
 367
 368                 bytes += utf8charlen(&string[bytes]);
 369
 370                 if (string + bytes > end) {
 371                         bytes = end - string;
 372                         break;
 373                 }
 374         } while(1);
 375
 376         return bytes;
 377 }
 378
 379 /*
 380  * Find out number of standard terminal collumns needed for displaying symbol
 381  * (glyph) which represents Unicode character c.
 382  * TODO: Use wcwidth when it is available.
 383  *
 384  * @return      2 for double-width glyph, 1 for others.
 385  *              TODO: May be extended to return 0 for zero-width glyphs
 386  *              (like composing, maybe unprintable too).
 387  */
 388 inline int
 389 unicode_to_cell(unicode_val_T c)
 390 {
 391         if (c >= 0x1100
 392                 && (c <= 0x115f                 /* Hangul Jamo */
 393                 || c == 0x2329
 394                 || c == 0x232a
 395                 || (c >= 0x2e80 && c <= 0xa4cf
 396                         && c != 0x303f)         /* CJK ... Yi */
 397                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 398                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 399                                                                 Ideographs */
 400                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 401                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 402                 || (c >= 0xffe0 && c <= 0xffe6)
 403                 || (c >= 0x20000 && c <= 0x2fffd)
 404                 || (c >= 0x30000 && c <= 0x3fffd)))
 405                 return 2;
 406
 407         return 1;
 408 }
 409
 410 /* Fold the case of a Unicode character, so that hotkeys in labels can
 411  * be compared case-insensitively.  It is unspecified whether the
 412  * result will be in upper or lower case.  */
 413 unicode_val_T
 414 unicode_fold_label_case(unicode_val_T c)
 415 {
 416 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 417         return towlower(c);
 418 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 419         /* For now, this supports only ASCII.  It would be possible to
 420          * use code generated from CaseFolding.txt of Unicode if the
 421          * acknowledgements required by http://www.unicode.org/copyright.html
 422          * were added to associated documentation of ELinks.  */
 423         if (c >= 0x41 && c <= 0x5A)
 424                 return c + 0x20;
 425         else
 426                 return c;
 427 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 428 }
 429
 430 inline unicode_val_T
 431 utf_8_to_unicode(unsigned char **string, unsigned char *end)
 432 {
 433         unsigned char *str = *string;
 434         unicode_val_T u;
 435         int length;
 436
 437         length = utf8char_len_tab[str[0]];
 438
 439         if (str + length > end) {
 440                 return UCS_NO_CHAR;
 441         }
 442
 443         switch (length) {
 444                 case 1:
 445                         u = str[0];
 446                         break;
 447                 case 2:
 448                         u = (str[0] & 0x1f) << 6;
 449                         u += (str[1] & 0x3f);
 450                         break;
 451                 case 3:
 452                         u = (str[0] & 0x0f) << 12;
 453                         u += ((str[1] & 0x3f) << 6);
 454                         u += (str[2] & 0x3f);
 455                         break;
 456                 case 4:
 457                         u = (str[0] & 0x0f) << 18;
 458                         u += ((str[1] & 0x3f) << 12);
 459                         u += ((str[2] & 0x3f) << 6);
 460                         u += (str[3] & 0x3f);
 461                         break;
 462                 case 5:
 463                         u = (str[0] & 0x0f) << 24;
 464                         u += ((str[1] & 0x3f) << 18);
 465                         u += ((str[2] & 0x3f) << 12);
 466                         u += ((str[3] & 0x3f) << 6);
 467                         u += (str[4] & 0x3f);
 468                         break;
 469                 case 6:
 470                 default:
 471                         u = (str[0] & 0x01) << 30;
 472                         u += ((str[1] & 0x3f) << 24);
 473                         u += ((str[2] & 0x3f) << 18);
 474                         u += ((str[3] & 0x3f) << 12);
 475                         u += ((str[4] & 0x3f) << 6);
 476                         u += (str[5] & 0x3f);
 477                         break;
 478         }
 479         *string = str + length;
 480         return u;
 481 }
 482 #endif /* CONFIG_UTF_8 */
 483
 484 /* Slow algorithm, the common part of cp2u and cp2utf_8.  */
 485 static unicode_val_T
 486 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 487 {
 488         int j;
 489
 490         for (j = 0; from->table[j].c; j++)
 491                 if (from->table[j].c == c)
 492                         return from->table[j].u;
 493
 494         return UCS_REPLACEMENT_CHARACTER;
 495 }
 496
 497 /* Slow algorithm, used for converting input from the terminal.  */
 498 unicode_val_T
 499 cp2u(int from, unsigned char c)
 500 {
 501         from &= ~SYSTEM_CHARSET_FLAG;
 502
 503         /* UTF-8 is a multibyte codepage and cannot be handled with
 504          * this function.  */
 505         assert(codepages[from].table != table_utf_8);
 506         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 507
 508         if (c < 0x80) return c;
 509         else return cp2u_shared(&codepages[from], c);
 510 }
 511
 512 /* This slow and ugly code is used by the terminal utf_8_io */
 513 unsigned char *
 514 cp2utf_8(int from, int c)
 515 {
 516         from &= ~SYSTEM_CHARSET_FLAG;
 517
 518         if (codepages[from].table == table_utf_8 || c < 128)
 519                 return strings[c];
 520
 521         return encode_utf_8(cp2u_shared(&codepages[from], c));
 522 }
 523
 524 #ifdef CONFIG_UTF_8
 525 unicode_val_T
 526 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 527 {
 528         unicode_val_T ret;
 529
 530         if (is_cp_utf8(codepage))
 531                 return utf_8_to_unicode(string, end);
 532
 533         if (*string >= end)
 534                 return UCS_NO_CHAR;
 535
 536         ret = cp2u(codepage, **string);
 537         ++*string;
 538         return ret;
 539 }
 540 #endif  /* CONFIG_UTF_8 */
 541
 542
 543 static void
 544 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 545 {
 546         unsigned char *p = encode_utf_8(u);
 547
 548         while (p[1]) {
 549                 if (ct[*p].t) ct = ct[*p].u.tbl;
 550                 else {
 551                         struct conv_table *nct;
 552
 553                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 554                         if_assert_failed return;
 555
 556                         nct = mem_calloc(256, sizeof(*nct));
 557                         if (!nct) return;
 558                         new_translation_table(nct);
 559                         ct[*p].t = 1;
 560                         ct[*p].u.tbl = nct;
 561                         ct = nct;
 562                 }
 563                 p++;
 564         }
 565
 566         assertm(!ct[*p].t, "bad utf encoding #2");
 567         if_assert_failed return;
 568
 569         if (ct[*p].u.str == no_str)
 570                 ct[*p].u.str = str;
 571 }
 572
/* Table returned by get_translation_table_to_utf_8(): for every byte of the
 * source codepage, the string to output.  Entries 128..255 hold strings
 * allocated with stracpy(), which free_utf_table() releases. */
struct conv_table utf_table[256];
/* Non-zero until utf_table has been set up for the first time. */
int utf_table_init = 1;
 575
 576 static void
 577 free_utf_table(void)
 578 {
 579         int i;
 580
 581         for (i = 128; i < 256; i++)
 582                 mem_free(utf_table[i].u.str);
 583 }
 584
 585 static struct conv_table *
 586 get_translation_table_to_utf_8(int from)
 587 {
 588         int i;
 589         static int lfr = -1;
 590
 591         if (from == -1) return NULL;
 592         from &= ~SYSTEM_CHARSET_FLAG;
 593         if (from == lfr) return utf_table;
 594         lfr = from;
 595         if (utf_table_init)
 596                 memset(utf_table, 0, sizeof(utf_table)),
 597                 utf_table_init = 0;
 598         else
 599                 free_utf_table();
 600
 601         for (i = 0; i < 128; i++)
 602                 utf_table[i].u.str = strings[i];
 603
 604         if (codepages[from].table == table_utf_8) {
 605                 for (i = 128; i < 256; i++)
 606                         utf_table[i].u.str = stracpy(strings[i]);
 607                 return utf_table;
 608         }
 609
 610         for (i = 128; i < 256; i++)
 611                 utf_table[i].u.str = NULL;
 612
 613         for (i = 0; codepages[from].table[i].c; i++) {
 614                 unicode_val_T u = codepages[from].table[i].u;
 615
 616                 if (!utf_table[codepages[from].table[i].c].u.str)
 617                         utf_table[codepages[from].table[i].c].u.str =
 618                                 stracpy(encode_utf_8(u));
 619         }
 620
 621         for (i = 128; i < 256; i++)
 622                 if (!utf_table[i].u.str)
 623                         utf_table[i].u.str = stracpy(no_str);
 624
 625         return utf_table;
 626 }
 627
/* Table returned by get_translation_table() when the target codepage is not
 * UTF-8.  (The identifier is renamed to table_elinks by the #define near
 * the top of this file.) */
struct conv_table table[256];
/* Non-zero until `table' has been zeroed for the first time. */
static int first = 1;
 630
 631 void
 632 free_conv_table(void)
 633 {
 634         if (!utf_table_init) free_utf_table();
 635         if (first) {
 636                 memset(table, 0, sizeof(table));
 637                 first = 0;
 638         }
 639         new_translation_table(table);
 640 }
 641
 642
 643 struct conv_table *
 644 get_translation_table(int from, int to)
 645 {
 646         static int lfr = -1;
 647         static int lto = -1;
 648
 649         from &= ~SYSTEM_CHARSET_FLAG;
 650         to &= ~SYSTEM_CHARSET_FLAG;
 651         if (first) {
 652                 memset(table, 0, sizeof(table));
 653                 first = 0;
 654         }
 655         if (/*from == to ||*/ from == -1 || to == -1)
 656                 return NULL;
 657         if (codepages[to].table == table_utf_8)
 658                 return get_translation_table_to_utf_8(from);
 659         if (from == lfr && to == lto)
 660                 return table;
 661         lfr = from;
 662         lto = to;
 663         new_translation_table(table);
 664
 665         if (codepages[from].table == table_utf_8) {
 666                 int i;
 667
 668                 for (i = 0; codepages[to].table[i].c; i++)
 669                         add_utf_8(table, codepages[to].table[i].u,
 670                                   strings[codepages[to].table[i].c]);
 671
 672                 for (i = 0; unicode_7b[i].x != -1; i++)
 673                         if (unicode_7b[i].x >= 0x80)
 674                                 add_utf_8(table, unicode_7b[i].x,
 675                                           unicode_7b[i].s);
 676
 677         } else {
 678                 int i;
 679
 680                 for (i = 128; i < 256; i++) {
 681                         int j;
 682
 683                         for (j = 0; codepages[from].table[j].c; j++) {
 684                                 if (codepages[from].table[j].c == i) {
 685                                         unsigned char *u;
 686
 687                                         u = u2cp(codepages[from].table[j].u, to);
 688                                         if (u) table[i].u.str = u;
 689                                         break;
 690                                 }
 691                         }
 692                 }
 693         }
 694
 695         return table;
 696 }
 697
 698 static inline int
 699 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 700 {
 701         while (l2) {
 702                 if (*s1 > *s2) return 1;
 703                 if (*s1 < *s2) return -1;
 704                 s1++;
 705                 s2++;
 706                 l2--;
 707         }
 708
 709         return *s2 ? -1 : 0;
 710 }
 711
/* Entity cache debugging: change `#if 0' to `#if 1' to define
 * DEBUG_ENTITY_CACHE, which enables the fprintf(stderr, ...) tracing in
 * get_entity_string(). */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
 718
/* One slot of the lookup cache kept by get_entity_string(). */
struct entity_cache {
        unsigned int hits;      /* Set to 1 on insertion, incremented on each cache hit. */
        int strlen;             /* Length of the entity name stored in @str. */
        int encoding;           /* Codepage the cached @result was produced for. */
        unsigned char *result;  /* Converted string; NULL if the lookup gave nothing. */
        unsigned char str[20]; /* Suffice in any case. */
};
 726
 727 static int
 728 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 729 {
 730         if (a->hits == b->hits) return 0;
 731         if (a->hits > b->hits) return -1;
 732         else return 1;
 733 }
 734
 735 static int
 736 compare_entities(const void *key_, const void *element_)
 737 {
 738         struct string *key = (struct string *) key_;
 739         struct entity *element = (struct entity *) element_;
 740         int length = key->length;
 741         unsigned char *first = key->source;
 742         unsigned char *second = element->s;
 743
 744         return xxstrcmp(first, second, length);
 745 }
 746
 747 unsigned char *
 748 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 749 {
 750 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 751 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 752                                    will go in [0] table */
 753         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 754         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 755         static int first_time = 1;
 756         unsigned int slen = 0;
 757         unsigned char *result = NULL;
 758
 759         if (strlen <= 0) return NULL;
 760
 761 #ifdef CONFIG_UTF_8
 762         /* TODO: caching UTF-8 */
 763         encoding &= ~SYSTEM_CHARSET_FLAG;
 764         if (codepages[encoding].table == table_utf_8)
 765                 goto skip;
 766 #endif /* CONFIG_UTF_8 */
 767
 768         if (first_time) {
 769                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 770                 first_time = 0;
 771         }
 772
 773         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 774          * + google + slashdot + websites that result from a search for test on google,
 775          * + various ones) show a quite impressive improvment:
 776          * Top ten is:
 777          * 0: hits=2459 l=4 st='nbsp'
 778          * 1: hits=2152 l=6 st='eacute'
 779          * 2: hits=235 l=6 st='egrave'
 780          * 3: hits=136 l=6 st='agrave'
 781          * 4: hits=100 l=3 st='amp'
 782          * 5: hits=40 l=5 st='laquo'
 783          * 6: hits=8 l=4 st='copy'
 784          * 7: hits=5 l=2 st='gt'
 785          * 8: hits=2 l=2 st='lt'
 786          * 9: hits=1 l=6 st='middot'
 787          *
 788          * Most of the time cache hit ratio is near 95%.
 789          *
 790          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 791          * count is kept < 2 (worst case 1.58). Not so bad ;)
 792          *
 793          * --Zas */
 794
 795         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 796         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 797
 798         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
 799                 int i;
 800
 801                 for (i = 0; i < nb_entity_cache[slen]; i++) {
 802                         if (entity_cache[slen][i].encoding == encoding
 803                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
 804 #ifdef DEBUG_ENTITY_CACHE
 805                                 static double total_iter = 0;
 806                                 static unsigned long hit_count = 0;
 807
 808                                 total_iter += i + 1;
 809                                 hit_count++;
 810                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
 811 #endif
 812                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
 813                                         entity_cache[slen][i].hits++;
 814                                 return entity_cache[slen][i].result;
 815                         }
 816                 }
 817 #ifdef DEBUG_ENTITY_CACHE
 818                 fprintf(stderr, "miss\n");
 819 #endif
 820         }
 821 #ifdef CONFIG_UTF_8
 822 skip:
 823 #endif /* CONFIG_UTF_8 */
 824         if (*str == '#') { /* Numeric entity. */
 825                 int l = (int) strlen;
 826                 unsigned char *st = (unsigned char *) str;
 827                 unicode_val_T n = 0;
 828
 829                 if (l == 1) goto end; /* &#; ? */
 830                 st++, l--;
 831                 if ((*st | 32) == 'x') { /* Hexadecimal */
 832
 833                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
 834                         st++, l--;
 835                         do {
 836                                 unsigned char c = (*(st++) | 32);
 837
 838                                 if (isdigit(c))
 839                                         n = (n << 4) | (c - '0');
 840                                 else if (isxdigit(c))
 841                                         n = (n << 4) | (c - 'a' + 10);
 842                                 else
 843                                         goto end; /* Bad char. */
 844                         } while (--l);
 845                 } else { /* Decimal */
 846                         if (l > 10) goto end; /* 4294967295 max. */
 847                         do {
 848                                 unsigned char c = *(st++);
 849
 850                                 if (isdigit(c))
 851                                         n = n * 10 + c - '0';
 852                                 else
 853                                         goto end; /* Bad char. */
 854                                 /* Limit to 0xFFFFFFFF. */
 855                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
 856                                         goto end;
 857                         } while (--l);
 858                 }
 859
 860                 result = u2cp(n, encoding);
 861
 862 #ifdef DEBUG_ENTITY_CACHE
 863                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
 864 #endif
 865         } else { /* Text entity. */
 866                 struct string key = INIT_STRING((unsigned char *) str, strlen);
 867                 struct entity *element = bsearch((void *) &key, entities,
 868                                                  N_ENTITIES,
 869                                                  sizeof(*element),
 870                                                  compare_entities);
 871
 872                 if (element) result = u2cp(element->c, encoding);
 873         }
 874
 875 #ifdef CONFIG_UTF_8
 876         if (codepages[encoding].table == table_utf_8) {
 877                 return result;
 878         }
 879 #endif /* CONFIG_UTF_8 */
 880 end:
 881         /* Take care of potential buffer overflow. */
 882         if (strlen < sizeof(entity_cache[slen][0].str)) {
 883                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
 884
 885                 /* Copy new entry to cache. */
 886                 ece->hits = 1;
 887                 ece->strlen = strlen;
 888                 ece->encoding = encoding;
 889                 ece->result = result;
 890                 memcpy(ece->str, str, strlen);
 891                 ece->str[strlen] = '\0';
 892
 893                 /* Increment number of cache entries if possible. */
 894                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
 895
 896 #ifdef DEBUG_ENTITY_CACHE
 897                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
 898                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
 899
 900 #endif
 901
 902                 /* Sort entries by hit order. */
 903                 if (nb_entity_cache[slen] > 1)
 904                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
 905                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
 906
 907 #ifdef DEBUG_ENTITY_CACHE
 908         {
 909                 unsigned int i;
 910
 911                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
 912                 for (i = 0; i < nb_entity_cache[slen] ; i++)
 913                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
 914                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
 915                                 entity_cache[slen][i].str);
 916                 fprintf(stderr, "-----------------\n");
 917         }
 918 #endif
 919         }
 920         return result;
 921 }
 922
 923 unsigned char *
 924 convert_string(struct conv_table *convert_table,
 925                unsigned char *chars, int charslen, int cp,
 926                enum convert_string_mode mode, int *length,
 927                void (*callback)(void *data, unsigned char *buf, int buflen),
 928                void *callback_data)
 929 {
 930         unsigned char *buffer;
 931         int bufferpos = 0;
 932         int charspos = 0;
 933
 934         if (!convert_table && !memchr(chars, '&', charslen)) {
 935                 if (callback) {
 936                         if (charslen) callback(callback_data, chars, charslen);
 937                         return NULL;
 938                 } else {
 939                         return memacpy(chars, charslen);
 940                 }
 941         }
 942
 943         /* Buffer allocation */
 944
 945         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
 946         if (!buffer) return NULL;
 947
 948         /* Iterate ;-) */
 949
 950         while (charspos < charslen) {
 951                 unsigned char *translit;
 952
 953 #define PUTC do { \
 954                 buffer[bufferpos++] = chars[charspos++]; \
 955                 translit = ""; \
 956                 goto flush; \
 957         } while (0)
 958
 959                 if (chars[charspos] != '&') {
 960                         struct conv_table *t;
 961                         int i;
 962
 963                         if (chars[charspos] < 128 || !convert_table) PUTC;
 964
 965                         t = convert_table;
 966                         i = charspos;
 967
 968                         while (t[chars[i]].t) {
 969                                 t = t[chars[i++]].u.tbl;
 970                                 if (i >= charslen) PUTC;
 971                         }
 972
 973                         translit = t[chars[i]].u.str;
 974                         charspos = i + 1;
 975
 976                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
 977                         PUTC;
 978
 979                 } else {
 980                         int start = charspos + 1;
 981                         int i = start;
 982
 983                         while (i < charslen
 984                                && (isasciialpha(chars[i])
 985                                    || isdigit(chars[i])
 986                                    || (chars[i] == '#')))
 987                                 i++;
 988
 989                         /* This prevents bug 213: we were expanding "entities"
 990                          * in URL query strings. */
 991                         /* XXX: But this disables &nbsp&nbsp usage, which
 992                          * appears to be relatively common! --pasky */
 993                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
 994                             && i > start
 995                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
 996                                 translit = get_entity_string(&chars[start], i - start,
 997                                                       cp);
 998                                 if (chars[i] != ';') {
 999                                         /* Eat &nbsp &nbsp<foo> happily, but
1000                                          * pull back from the character after
1001                                          * entity string if it is not the valid
1002                                          * terminator. */
1003                                         i--;
1004                                 }
1005
1006                                 if (!translit) PUTC;
1007                                 charspos = i + (i < charslen);
1008                         } else PUTC;
1009                 }
1010
1011                 if (!translit[0]) continue;
1012
1013                 if (!translit[1]) {
1014                         buffer[bufferpos++] = translit[0];
1015                         translit = "";
1016                         goto flush;
1017                 }
1018
1019                 while (*translit) {
1020                         unsigned char *new;
1021
1022                         buffer[bufferpos++] = *(translit++);
1023 flush:
1024                         if (bufferpos & (ALLOC_GR - 1)) continue;
1025
1026                         if (callback) {
1027                                 buffer[bufferpos] = 0;
1028                                 callback(callback_data, buffer, bufferpos);
1029                                 bufferpos = 0;
1030                         } else {
1031                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1032                                 if (!new) {
1033                                         mem_free(buffer);
1034                                         return NULL;
1035                                 }
1036                                 buffer = new;
1037                         }
1038                 }
1039 #undef PUTC
1040         }
1041
1042         /* Say bye */
1043
1044         buffer[bufferpos] = 0;
1045         if (length) *length = bufferpos;
1046
1047         if (callback) {
1048                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1049                 mem_free(buffer);
1050                 return NULL;
1051         } else {
1052                 return buffer;
1053         }
1054 }
1055
1056
1057 #ifndef USE_FASTFIND
1058 int
1059 get_cp_index(unsigned char *name)
1060 {
1061         int i, a;
1062         int syscp = 0;
1063
1064         if (!strcasecmp(name, "System")) {
1065 #if HAVE_LANGINFO_CODESET
1066                 name = nl_langinfo(CODESET);
1067                 syscp = SYSTEM_CHARSET_FLAG;
1068 #else
1069                 name = "us-ascii";
1070 #endif
1071         }
1072
1073         for (i = 0; codepages[i].name; i++) {
1074                 for (a = 0; codepages[i].aliases[a]; a++) {
1075                         /* In the past, we looked for the longest substring
1076                          * in all the names; it is way too expensive, though:
1077                          *
1078                          *   %   cumulative   self              self     total
1079                          *  time   seconds   seconds    calls  us/call  us/call  name
1080                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1081                          *
1082                          * Anything called from redraw_screen() is in fact
1083                          * relatively expensive, even if it's called just
1084                          * once. So we will do a simple strcasecmp() here.
1085                          */
1086
1087                         if (!strcasecmp(name, codepages[i].aliases[a]))
1088                                 return i | syscp;
1089                 }
1090         }
1091
1092         if (syscp) {
1093                 return get_cp_index("us-ascii") | syscp;
1094         } else {
1095                 return -1;
1096         }
1097 }
1098
1099 #else
1100
/* Cursor used to enumerate charset names: @i_name selects the entry in
 * codepages[], @i_alias the alias within that entry. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration cursor to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
        i_alias = 0;
        i_name = 0;
}
1111
1112 /* Returns a pointer to a struct that contains current key and data pointers
1113  * and increment internal pointer.  It returns NULL when key is NULL. */
1114 struct fastfind_key_value *
1115 charsets_list_next(void)
1116 {
1117         static struct fastfind_key_value kv;
1118
1119         if (!codepages[i_name].name) return NULL;
1120
1121         kv.key = codepages[i_name].aliases[i_alias];
1122         kv.data = &codepages[i_name];
1123
1124         if (codepages[i_name].aliases[i_alias + 1])
1125                 i_alias++;
1126         else {
1127                 i_name++;
1128                 i_alias = 0;
1129         }
1130
1131         return &kv;
1132 }
1133
1134 static struct fastfind_index ff_charsets_index
1135         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1136
1137 /* It searchs for a charset named @name or one of its aliases and
1138  * returns index for it or -1 if not found. */
1139 int
1140 get_cp_index(unsigned char *name)
1141 {
1142         struct codepage_desc *codepage;
1143         int syscp = 0;
1144
1145         if (!strcasecmp(name, "System")) {
1146 #if HAVE_LANGINFO_CODESET
1147                 name = nl_langinfo(CODESET);
1148                 syscp = SYSTEM_CHARSET_FLAG;
1149 #else
1150                 name = "us-ascii";
1151 #endif
1152         }
1153
1154         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1155         if (codepage) {
1156                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1157                 return (codepage - codepages) | syscp;
1158
1159         } else if (syscp) {
1160                 return get_cp_index("us-ascii") | syscp;
1161
1162         } else {
1163                 return -1;
1164         }
1165 }
1166
1167 #endif /* USE_FASTFIND */
1168
/* Set up the charset name lookup.  With USE_FASTFIND this passes the
 * charset index to fastfind_index() with FF_COMPRESS; otherwise there is
 * nothing to prepare and the function does nothing. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1176
/* Tear down the charset name lookup.  With USE_FASTFIND this passes the
 * charset index to fastfind_done(); otherwise the function does nothing. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1184
1185 unsigned char *
1186 get_cp_name(int cp_index)
1187 {
1188         if (cp_index < 0) return "none";
1189         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1190
1191         return codepages[cp_index].name;
1192 }
1193
1194 unsigned char *
1195 get_cp_mime_name(int cp_index)
1196 {
1197         if (cp_index < 0) return "none";
1198         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1199         if (!codepages[cp_index].aliases) return NULL;
1200
1201         return codepages[cp_index].aliases[0];
1202 }
1203
1204 int
1205 is_cp_utf8(int cp_index)
1206 {
1207         cp_index &= ~SYSTEM_CHARSET_FLAG;
1208         return codepages[cp_index].table == table_utf_8;
1209 }