src/intl/charsets.c

/* Charset converter */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #if HAVE_LANGINFO_CODESET
   8 #include <langinfo.h>
   9 #endif
  10
  11 #include <ctype.h>
  12 #include <stdlib.h>
  13
  14 #include "elinks.h"
  15
  16 #include "document/options.h"
  17 #include "intl/charsets.h"
  18 #include "util/conv.h"
  19 #include "util/error.h"
  20 #include "util/fastfind.h"
  21 #include "util/memory.h"
  22 #include "util/string.h"
  23
  24
/* Fix namespace clash on MacOS.  From this line on the preprocessor rewrites
 * every use of the identifier `table' to `table_elinks'; that covers the
 * `table' member of struct codepage_desc, the files included after this
 * point and the global conversion table defined in this file. */
#define table table_elinks
  27
/* One row of a codepage mapping: the 8-bit character @c and the Unicode
 * value @u paired with it.  The lookup loops in this file walk an array of
 * these rows and stop at the first row whose @c is zero. */
struct table_entry {
	unsigned char c;
	unicode_val_T u;
};
  32
/* Description of one codepage known to this file. */
struct codepage_desc {
	/* Name handed out by get_cp_name(); a NULL name ends the codepages
	 * list as far as the loops in this file are concerned. */
	unsigned char *name;
	/* NULL-terminated list of names get_cp_index() matches against; the
	 * first one is what get_cp_mime_name() returns. */
	unsigned char **aliases;
	/* Character <-> Unicode rows, terminated by a row with c == 0. */
	struct table_entry *table;
};
  38
  39 #include "intl/codepage.inc"
  40 #include "intl/uni_7b.inc"
  41 #include "intl/entity.inc"
  42
  43
/* One-character strings indexed by byte value: strings[b] is a
 * NUL-terminated string made of the single byte b.  The conversion code
 * returns pointers into this array instead of allocating one-byte strings.
 *
 * NOTE(review): the entries at indices 027 and 037 hold "\033" rather than
 * their own value.  This may be deliberate or a historical typo -- confirm
 * before changing it, since u2cp_() and the translation tables hand these
 * entries out unchanged. */
static char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
  78
  79 static void
  80 free_translation_table(struct conv_table *p)
  81 {
  82         int i;
  83
  84         for (i = 0; i < 256; i++)
  85                 if (p[i].t)
  86                         free_translation_table(p[i].u.tbl);
  87
  88         mem_free(p);
  89 }
  90
  91 static unsigned char *no_str = "*";
  92
  93 static void
  94 new_translation_table(struct conv_table *p)
  95 {
  96         int i;
  97
  98         for (i = 0; i < 256; i++)
  99                 if (p[i].t)
 100                         free_translation_table(p[i].u.tbl);
 101         for (i = 0; i < 128; i++) {
 102                 p[i].t = 0;
 103                 p[i].u.str = strings[i];
 104         }
 105         for (; i < 256; i++) {
 106                 p[i].t = 0;
 107                 p[i].u.str = no_str;
 108         }
 109 }
 110
 111 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 112 {                                                                                       \
 113         long _s = 0, _e = (entries) - 1;                                                \
 114                                                                                         \
 115         while (_s <= _e || !((result) = -1)) {                                          \
 116                 long _m = (_s + _e) / 2;                                                \
 117                                                                                         \
 118                 if ((table)[_m].entry == (key)) {                                       \
 119                         (result) = _m;                                                  \
 120                         break;                                                          \
 121                 }                                                                       \
 122                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 123                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 124         }                                                                               \
 125 }                                                                                       \
 126
/* Substitutes for the code points 0x80..0x9f, indexed by (u - 0x80).
 * u2cp_() translates the value found here instead of the original code
 * point, and returns NULL when the entry is zero.
 * NOTE(review): the non-ASCII values look like the Windows-1252 meaning of
 * these bytes, with several replaced by plain ASCII look-alikes -- confirm
 * against the intended source before editing. */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
 133
 134 #define SYSTEM_CHARSET_FLAG 128
 135
 136 unsigned char *
 137 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 138 {
 139         int j;
 140         int s;
 141
 142         if (u < 128) return strings[u];
 143         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 144         if (u == 0xa0) return no_nbsp_hack ? " " : NBSP_CHAR_STRING;
 145         if (u == 0xad) return "";
 146
 147         if (u < 0xa0) {
 148                 unicode_val_T strange = strange_chars[u - 0x80];
 149
 150                 if (!strange) return NULL;
 151                 return u2cp_(strange, to, no_nbsp_hack);
 152         }
 153
 154         to &= ~SYSTEM_CHARSET_FLAG;
 155
 156         for (j = 0; codepages[to].table[j].c; j++)
 157                 if (codepages[to].table[j].u == u)
 158                         return strings[codepages[to].table[j].c];
 159
 160         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 161         if (s != -1) return unicode_7b[s].s;
 162
 163         return no_str;
 164 }
 165
 166 static unsigned char utf_buffer[7];
 167
 168 static unsigned char *
 169 encode_utf_8(unicode_val_T u)
 170 {
 171         memset(utf_buffer, 0, 7);
 172
 173         if (u < 0x80)
 174                 utf_buffer[0] = u;
 175         else if (u < 0x800)
 176                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 177                 utf_buffer[1] = 0x80 | (u & 0x3f);
 178         else if (u < 0x10000)
 179                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 180                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 181                 utf_buffer[2] = 0x80 | (u & 0x3f);
 182         else if (u < 0x200000)
 183                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 184                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 185                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 186                 utf_buffer[3] = 0x80 | (u & 0x3f);
 187         else if (u < 0x4000000)
 188                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 189                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 190                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 191                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 192                 utf_buffer[4] = 0x80 | (u & 0x3f);
 193         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 194                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 195                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 196                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 197                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 198                 utf_buffer[5] = 0x80 | (u & 0x3f);
 199
 200         return utf_buffer;
 201 }
 202
 203 /* This slow and ugly code is used by the terminal utf_8_io */
 204 unsigned char *
 205 cp2utf_8(int from, int c)
 206 {
 207         int j;
 208
 209         from &= ~SYSTEM_CHARSET_FLAG;
 210
 211         if (codepages[from].table == table_utf_8 || c < 128)
 212                 return strings[c];
 213
 214         for (j = 0; codepages[from].table[j].c; j++)
 215                 if (codepages[from].table[j].c == c)
 216                         return encode_utf_8(codepages[from].table[j].u);
 217
 218         return encode_utf_8(UCS_NO_CHAR);
 219 }
 220
 221 static void
 222 add_utf_8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 223 {
 224         unsigned char *p = encode_utf_8(u);
 225
 226         while (p[1]) {
 227                 if (ct[*p].t) ct = ct[*p].u.tbl;
 228                 else {
 229                         struct conv_table *nct;
 230
 231                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 232                         if_assert_failed return;
 233
 234                         nct = mem_calloc(256, sizeof(*nct));
 235                         if (!nct) return;
 236                         new_translation_table(nct);
 237                         ct[*p].t = 1;
 238                         ct[*p].u.tbl = nct;
 239                         ct = nct;
 240                 }
 241                 p++;
 242         }
 243
 244         assertm(!ct[*p].t, "bad utf encoding #2");
 245         if_assert_failed return;
 246
 247         if (ct[*p].u.str == no_str)
 248                 ct[*p].u.str = str;
 249 }
 250
 251 struct conv_table utf_table[256];
 252 int utf_table_init = 1;
 253
 254 static void
 255 free_utf_table(void)
 256 {
 257         int i;
 258
 259         for (i = 128; i < 256; i++)
 260                 mem_free(utf_table[i].u.str);
 261 }
 262
 263 static struct conv_table *
 264 get_translation_table_to_utf_8(int from)
 265 {
 266         int i;
 267         static int lfr = -1;
 268
 269         if (from == -1) return NULL;
 270         from &= ~SYSTEM_CHARSET_FLAG;
 271         if (from == lfr) return utf_table;
 272         if (utf_table_init)
 273                 memset(utf_table, 0, sizeof(utf_table)),
 274                 utf_table_init = 0;
 275         else
 276                 free_utf_table();
 277
 278         for (i = 0; i < 128; i++)
 279                 utf_table[i].u.str = strings[i];
 280
 281         if (codepages[from].table == table_utf_8) {
 282                 for (i = 128; i < 256; i++)
 283                         utf_table[i].u.str = stracpy(strings[i]);
 284                 return utf_table;
 285         }
 286
 287         for (i = 128; i < 256; i++)
 288                 utf_table[i].u.str = NULL;
 289
 290         for (i = 0; codepages[from].table[i].c; i++) {
 291                 unicode_val_T u = codepages[from].table[i].u;
 292
 293                 if (!utf_table[codepages[from].table[i].c].u.str)
 294                         utf_table[codepages[from].table[i].c].u.str =
 295                                 stracpy(encode_utf_8(u));
 296         }
 297
 298         for (i = 128; i < 256; i++)
 299                 if (!utf_table[i].u.str)
 300                         utf_table[i].u.str = stracpy(no_str);
 301
 302         return utf_table;
 303 }
 304
 305 struct conv_table table[256];
 306 static int first = 1;
 307
 308 void
 309 free_conv_table(void)
 310 {
 311         if (!utf_table_init) free_utf_table();
 312         if (first) {
 313                 memset(table, 0, sizeof(table));
 314                 first = 0;
 315         }
 316         new_translation_table(table);
 317 }
 318
 319
 320 struct conv_table *
 321 get_translation_table(int from, int to)
 322 {
 323         static int lfr = -1;
 324         static int lto = -1;
 325
 326         from &= ~SYSTEM_CHARSET_FLAG;
 327         to &= ~SYSTEM_CHARSET_FLAG;
 328         if (first) {
 329                 memset(table, 0, sizeof(table));
 330                 first = 0;
 331         }
 332         if (/*from == to ||*/ from == -1 || to == -1)
 333                 return NULL;
 334         if (codepages[to].table == table_utf_8)
 335                 return get_translation_table_to_utf_8(from);
 336         if (from == lfr && to == lto)
 337                 return table;
 338         lfr = from;
 339         lto = to;
 340         new_translation_table(table);
 341
 342         if (codepages[from].table == table_utf_8) {
 343                 int i;
 344
 345                 for (i = 0; codepages[to].table[i].c; i++)
 346                         add_utf_8(table, codepages[to].table[i].u,
 347                                   strings[codepages[to].table[i].c]);
 348
 349                 for (i = 0; unicode_7b[i].x != -1; i++)
 350                         if (unicode_7b[i].x >= 0x80)
 351                                 add_utf_8(table, unicode_7b[i].x,
 352                                           unicode_7b[i].s);
 353
 354         } else {
 355                 int i;
 356
 357                 for (i = 128; i < 256; i++) {
 358                         int j;
 359
 360                         for (j = 0; codepages[from].table[j].c; j++) {
 361                                 if (codepages[from].table[j].c == i) {
 362                                         unsigned char *u;
 363
 364                                         u = u2cp(codepages[from].table[j].u, to);
 365                                         if (u) table[i].u.str = u;
 366                                         break;
 367                                 }
 368                         }
 369                 }
 370         }
 371
 372         return table;
 373 }
 374
 375 static inline int
 376 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 377 {
 378         while (l2) {
 379                 if (*s1 > *s2) return 1;
 380                 if (*s1 < *s2) return -1;
 381                 s1++;
 382                 s2++;
 383                 l2--;
 384         }
 385
 386         return *s2 ? -1 : 0;
 387 }
 388
/* Entity cache debugging.  Change `#if 0' to `#if 1' to define
 * DEBUG_ENTITY_CACHE, which makes get_entity_string() report cache hits,
 * misses and the cache contents on stderr. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
 395
 396 struct entity_cache {
 397         unsigned int hits;
 398         int strlen;
 399         int encoding;
 400         unsigned char *result;
 401         unsigned char str[20]; /* Suffice in any case. */
 402 };
 403
 404 static int
 405 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 406 {
 407         if (a->hits == b->hits) return 0;
 408         if (a->hits > b->hits) return -1;
 409         else return 1;
 410 }
 411
 412 static int
 413 compare_entities(const void *key_, const void *element_)
 414 {
 415         struct string *key = (struct string *) key_;
 416         struct entity *element = (struct entity *) element_;
 417         int length = key->length;
 418         unsigned char *first = key->source;
 419         unsigned char *second = element->s;
 420
 421         return xxstrcmp(first, second, length);
 422 }
 423
 424 unsigned char *
 425 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 426 {
 427 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 428 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 429                                    will go in [0] table */
 430         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 431         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 432         static int first_time = 1;
 433         unsigned int slen;
 434         unsigned char *result = NULL;
 435
 436         if (strlen <= 0) return NULL;
 437
 438         if (first_time) {
 439                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 440                 first_time = 0;
 441         }
 442
 443         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 444          * + google + slashdot + websites that result from a search for test on google,
 445          * + various ones) show a quite impressive improvment:
 446          * Top ten is:
 447          * 0: hits=2459 l=4 st='nbsp'
 448          * 1: hits=2152 l=6 st='eacute'
 449          * 2: hits=235 l=6 st='egrave'
 450          * 3: hits=136 l=6 st='agrave'
 451          * 4: hits=100 l=3 st='amp'
 452          * 5: hits=40 l=5 st='laquo'
 453          * 6: hits=8 l=4 st='copy'
 454          * 7: hits=5 l=2 st='gt'
 455          * 8: hits=2 l=2 st='lt'
 456          * 9: hits=1 l=6 st='middot'
 457          *
 458          * Most of the time cache hit ratio is near 95%.
 459          *
 460          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 461          * count is kept < 2 (worst case 1.58). Not so bad ;)
 462          *
 463          * --Zas */
 464
 465         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 466         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 467
 468         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
 469                 int i;
 470
 471                 for (i = 0; i < nb_entity_cache[slen]; i++) {
 472                         if (entity_cache[slen][i].encoding == encoding
 473                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
 474 #ifdef DEBUG_ENTITY_CACHE
 475                                 static double total_iter = 0;
 476                                 static unsigned long hit_count = 0;
 477
 478                                 total_iter += i + 1;
 479                                 hit_count++;
 480                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
 481 #endif
 482                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
 483                                         entity_cache[slen][i].hits++;
 484                                 return entity_cache[slen][i].result;
 485                         }
 486                 }
 487 #ifdef DEBUG_ENTITY_CACHE
 488                 fprintf(stderr, "miss\n");
 489 #endif
 490         }
 491
 492         if (*str == '#') { /* Numeric entity. */
 493                 int l = (int) strlen;
 494                 unsigned char *st = (unsigned char *) str;
 495                 unicode_val_T n = 0;
 496
 497                 if (l == 1) goto end; /* &#; ? */
 498                 st++, l--;
 499                 if ((*st | 32) == 'x') { /* Hexadecimal */
 500
 501                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
 502                         st++, l--;
 503                         do {
 504                                 unsigned char c = (*(st++) | 32);
 505
 506                                 if (isdigit(c))
 507                                         n = (n << 4) | (c - '0');
 508                                 else if (isxdigit(c))
 509                                         n = (n << 4) | (c - 'a' + 10);
 510                                 else
 511                                         goto end; /* Bad char. */
 512                         } while (--l);
 513                 } else { /* Decimal */
 514                         if (l > 10) goto end; /* 4294967295 max. */
 515                         do {
 516                                 unsigned char c = *(st++);
 517
 518                                 if (isdigit(c))
 519                                         n = n * 10 + c - '0';
 520                                 else
 521                                         goto end; /* Bad char. */
 522                                 /* Limit to 0xFFFFFFFF. */
 523                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
 524                                         goto end;
 525                         } while (--l);
 526                 }
 527
 528                 result = u2cp(n, encoding);
 529
 530 #ifdef DEBUG_ENTITY_CACHE
 531                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
 532 #endif
 533         } else { /* Text entity. */
 534                 struct string key = INIT_STRING((unsigned char *) str, strlen);
 535                 struct entity *element = bsearch((void *) &key, entities,
 536                                                  N_ENTITIES,
 537                                                  sizeof(*element),
 538                                                  compare_entities);
 539
 540                 if (element) result = u2cp(element->c, encoding);
 541         }
 542
 543 end:
 544         /* Take care of potential buffer overflow. */
 545         if (strlen < sizeof(entity_cache[slen][0].str)) {
 546                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
 547
 548                 /* Copy new entry to cache. */
 549                 ece->hits = 1;
 550                 ece->strlen = strlen;
 551                 ece->encoding = encoding;
 552                 ece->result = result;
 553                 memcpy(ece->str, str, strlen);
 554                 ece->str[strlen] = '\0';
 555
 556                 /* Increment number of cache entries if possible. */
 557                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
 558
 559 #ifdef DEBUG_ENTITY_CACHE
 560                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
 561                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
 562
 563 #endif
 564
 565                 /* Sort entries by hit order. */
 566                 if (nb_entity_cache[slen] > 1)
 567                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
 568                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
 569
 570 #ifdef DEBUG_ENTITY_CACHE
 571         {
 572                 unsigned int i;
 573
 574                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
 575                 for (i = 0; i < nb_entity_cache[slen] ; i++)
 576                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
 577                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
 578                                 entity_cache[slen][i].str);
 579                 fprintf(stderr, "-----------------\n");
 580         }
 581 #endif
 582         }
 583         return result;
 584 }
 585
 586 unsigned char *
 587 convert_string(struct conv_table *convert_table,
 588                unsigned char *chars, int charslen, int cp,
 589                enum convert_string_mode mode, int *length,
 590                void (*callback)(void *data, unsigned char *buf, int buflen),
 591                void *callback_data)
 592 {
 593         unsigned char *buffer;
 594         int bufferpos = 0;
 595         int charspos = 0;
 596
 597         if (!convert_table && !memchr(chars, '&', charslen)) {
 598                 if (callback) {
 599                         if (charslen) callback(callback_data, chars, charslen);
 600                         return NULL;
 601                 } else {
 602                         return memacpy(chars, charslen);
 603                 }
 604         }
 605
 606         /* Buffer allocation */
 607
 608         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
 609         if (!buffer) return NULL;
 610
 611         /* Iterate ;-) */
 612
 613         while (charspos < charslen) {
 614                 unsigned char *translit;
 615
 616 #define PUTC do { \
 617                 buffer[bufferpos++] = chars[charspos++]; \
 618                 translit = ""; \
 619                 goto flush; \
 620         } while (0)
 621
 622                 if (chars[charspos] != '&') {
 623                         struct conv_table *t;
 624                         int i;
 625
 626                         if (chars[charspos] < 128 || !convert_table) PUTC;
 627
 628                         t = convert_table;
 629                         i = charspos;
 630
 631                         while (t[chars[i]].t) {
 632                                 t = t[chars[i++]].u.tbl;
 633                                 if (i >= charslen) PUTC;
 634                         }
 635
 636                         translit = t[chars[i]].u.str;
 637                         charspos = i + 1;
 638
 639                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
 640                         PUTC;
 641
 642                 } else {
 643                         int start = charspos + 1;
 644                         int i = start;
 645
 646                         while (i < charslen
 647                                && (isasciialpha(chars[i])
 648                                    || isdigit(chars[i])
 649                                    || (chars[i] == '#')))
 650                                 i++;
 651
 652                         /* This prevents bug 213: we were expanding "entities"
 653                          * in URL query strings. */
 654                         /* XXX: But this disables &nbsp&nbsp usage, which
 655                          * appears to be relatively common! --pasky */
 656                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
 657                             && i > start
 658                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
 659                                 translit = get_entity_string(&chars[start], i - start,
 660                                                       cp);
 661                                 if (chars[i] != ';') {
 662                                         /* Eat &nbsp &nbsp<foo> happily, but
 663                                          * pull back from the character after
 664                                          * entity string if it is not the valid
 665                                          * terminator. */
 666                                         i--;
 667                                 }
 668
 669                                 if (!translit) PUTC;
 670                                 charspos = i + (i < charslen);
 671                         } else PUTC;
 672                 }
 673
 674                 if (!translit[0]) continue;
 675
 676                 if (!translit[1]) {
 677                         buffer[bufferpos++] = translit[0];
 678                         translit = "";
 679                         goto flush;
 680                 }
 681
 682                 while (*translit) {
 683                         unsigned char *new;
 684
 685                         buffer[bufferpos++] = *(translit++);
 686 flush:
 687                         if (bufferpos & (ALLOC_GR - 1)) continue;
 688
 689                         if (callback) {
 690                                 buffer[bufferpos] = 0;
 691                                 callback(callback_data, buffer, bufferpos);
 692                                 bufferpos = 0;
 693                         } else {
 694                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
 695                                 if (!new) {
 696                                         mem_free(buffer);
 697                                         return NULL;
 698                                 }
 699                                 buffer = new;
 700                         }
 701                 }
 702 #undef PUTC
 703         }
 704
 705         /* Say bye */
 706
 707         buffer[bufferpos] = 0;
 708         if (length) *length = bufferpos;
 709
 710         if (callback) {
 711                 if (bufferpos) callback(callback_data, buffer, bufferpos);
 712                 mem_free(buffer);
 713                 return NULL;
 714         } else {
 715                 return buffer;
 716         }
 717 }
 718
 719
 720 #ifndef USE_FASTFIND
 721 int
 722 get_cp_index(unsigned char *name)
 723 {
 724         int i, a;
 725         int syscp = 0;
 726
 727         if (!strcasecmp(name, "System")) {
 728 #if HAVE_LANGINFO_CODESET
 729                 name = nl_langinfo(CODESET);
 730                 syscp = SYSTEM_CHARSET_FLAG;
 731 #else
 732                 name = "us-ascii";
 733 #endif
 734         }
 735
 736         for (i = 0; codepages[i].name; i++) {
 737                 for (a = 0; codepages[i].aliases[a]; a++) {
 738                         /* In the past, we looked for the longest substring
 739                          * in all the names; it is way too expensive, though:
 740                          *
 741                          *   %   cumulative   self              self     total
 742                          *  time   seconds   seconds    calls  us/call  us/call  name
 743                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
 744                          *
 745                          * Anything called from redraw_screen() is in fact
 746                          * relatively expensive, even if it's called just
 747                          * once. So we will do a simple strcasecmp() here.
 748                          */
 749
 750                         if (!strcasecmp(name, codepages[i].aliases[a]))
 751                                 return i | syscp;
 752                 }
 753         }
 754
 755         if (syscp) {
 756                 return get_cp_index("us-ascii") | syscp;
 757         } else {
 758                 return -1;
 759         }
 760 }
 761
 762 #else
 763
 764 static unsigned int i_name = 0;
 765 static unsigned int i_alias = 0;
 766
 767 /* Reset internal list pointer */
 768 void
 769 charsets_list_reset(void)
 770 {
 771         i_name = 0;
 772         i_alias = 0;
 773 }
 774
 775 /* Returns a pointer to a struct that contains current key and data pointers
 776  * and increment internal pointer.  It returns NULL when key is NULL. */
 777 struct fastfind_key_value *
 778 charsets_list_next(void)
 779 {
 780         static struct fastfind_key_value kv;
 781
 782         if (!codepages[i_name].name) return NULL;
 783
 784         kv.key = codepages[i_name].aliases[i_alias];
 785         kv.data = &codepages[i_name];
 786
 787         if (codepages[i_name].aliases[i_alias + 1])
 788                 i_alias++;
 789         else {
 790                 i_name++;
 791                 i_alias = 0;
 792         }
 793
 794         return &kv;
 795 }
 796
/* Fastfind index over all codepage aliases.  Its keys are supplied by the
 * charsets_list_reset()/charsets_list_next() pair; it is built in
 * init_charsets_lookup(), queried by get_cp_index() and released in
 * free_charsets_lookup(). */
static struct fastfind_index ff_charsets_index
	= INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
 799
 800 /* It searchs for a charset named @name or one of its aliases and
 801  * returns index for it or -1 if not found. */
 802 int
 803 get_cp_index(unsigned char *name)
 804 {
 805         struct codepage_desc *codepage;
 806         int syscp = 0;
 807
 808         if (!strcasecmp(name, "System")) {
 809 #if HAVE_LANGINFO_CODESET
 810                 name = nl_langinfo(CODESET);
 811                 syscp = SYSTEM_CHARSET_FLAG;
 812 #else
 813                 name = "us-ascii";
 814 #endif
 815         }
 816
 817         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
 818         if (codepage) {
 819                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
 820                 return (codepage - codepages) | syscp;
 821
 822         } else if (syscp) {
 823                 return get_cp_index("us-ascii") | syscp;
 824
 825         } else {
 826                 return -1;
 827         }
 828 }
 829
 830 #endif /* USE_FASTFIND */
 831
 832 void
 833 init_charsets_lookup(void)
 834 {
 835 #ifdef USE_FASTFIND
 836         fastfind_index(&ff_charsets_index, FF_COMPRESS);
 837 #endif
 838 }
 839
 840 void
 841 free_charsets_lookup(void)
 842 {
 843 #ifdef USE_FASTFIND
 844         fastfind_done(&ff_charsets_index);
 845 #endif
 846 }
 847
 848 unsigned char *
 849 get_cp_name(int cp_index)
 850 {
 851         if (cp_index < 0) return "none";
 852         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
 853
 854         return codepages[cp_index].name;
 855 }
 856
 857 unsigned char *
 858 get_cp_mime_name(int cp_index)
 859 {
 860         if (cp_index < 0) return "none";
 861         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
 862         if (!codepages[cp_index].aliases) return NULL;
 863
 864         return codepages[cp_index].aliases[0];
 865 }
 866
 867 int
 868 is_cp_special(int cp_index)
 869 {
 870         cp_index &= ~SYSTEM_CHARSET_FLAG;
 871         return codepages[cp_index].table == table_utf_8;
 872 }