src/intl/charsets.c

   1 /* Charsets convertor */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE /* strcasecmp() */
   5 #endif
   6
   7 #ifdef HAVE_CONFIG_H
   8 #include "config.h"
   9 #endif
  10
  11 #if HAVE_LANGINFO_CODESET
  12 #include <langinfo.h>
  13 #endif
  14
  15 #include <ctype.h>
  16 #include <stdlib.h>
  17 #if HAVE_WCTYPE_H
  18 #include <wctype.h>
  19 #endif
  20
  21 #include "elinks.h"
  22
  23 #include "document/options.h"
  24 #include "intl/charsets.h"
  25 #include "util/conv.h"
  26 #include "util/error.h"
  27 #include "util/fastfind.h"
  28 #include "util/memory.h"
  29 #include "util/string.h"
  30
  31
  32 /* Fix namespace clash on MacOS. */
  33 #define table table_elinks
  34
/* One (codepage byte, Unicode character) pair.  Arrays of these are
 * scanned by the code in this file until an entry whose @c is zero is
 * reached, so a zero @c acts as the end-of-list marker.  */
struct table_entry {
        /* The byte in the codepage; also used as an index into strings[].  */
        unsigned char c;
        /* The Unicode character that corresponds to @c.
         *
         * This should in principle be unicode_val_T, but because all
         * the values currently in codepage.inc fit in 16 bits, we can
         * as well use uint16_t and halve sizeof(struct table_entry)
         * from 8 bytes to 4.  Should other characters ever be needed,
         * unicode_val_T u : 24 might be a possibility, although it
         * seems a little unportable as bitfields are in principle
         * restricted to int, which may be 16-bit.  */
        uint16_t u;
};
  46
/* Description of one single-byte codepage (or of UTF-8, which is
 * recognized by its @aliases pointer; see is_cp_ptr_utf8()).  */
struct codepage_desc {
        /* Name of the codepage.  */
        unsigned char *name;
        /* Alternative names of the codepage.
         * NOTE(review): presumably a NULL-terminated list defined in
         * codepage.inc -- confirm there.  */
        unsigned char *const *aliases;

        /* The Unicode mappings of codepage bytes 0x80...0xFF.
         * (0x00...0x7F are assumed to be ASCII in all codepages.)
         * Because all current values fit in 16 bits, we store them as
         * uint16_t rather than unicode_val_T.  If the codepage does
         * not use some byte, then @highhalf maps that byte to 0xFFFF,
         * which C code converts to UCS_REPLACEMENT_CHARACTER where
         * appropriate.  (U+FFFF is reserved and will never be
         * assigned as a character.)  */
        const uint16_t *highhalf;

        /* If some byte in the codepage corresponds to multiple Unicode
         * characters, then the preferred character is in @highhalf
         * above, and the rest are listed here in @table, terminated
         * by an entry whose c member is zero.  This table is not used
         * for translating from the codepage to Unicode.
         *
         * Note that "table" is #defined to "table_elinks" earlier in
         * this file, so that is the real name of this member after
         * preprocessing.  */
        const struct table_entry *table;
};
  67
  68 #include "intl/codepage.inc"
  69 #include "intl/uni_7b.inc"
  70 #include "intl/entity.inc"
  71
  72
/* One-character strings indexed by byte value: strings[b] is a
 * NUL-terminated string whose only character is the byte b.  The
 * conversion functions in this file return pointers into this table
 * so that they never have to allocate memory for single-byte results.
 *
 * NOTE(review): the entries at indexes 027 and 037 hold "\033"
 * instead of their own byte value.  This looks deliberate or at least
 * historic; confirm before "fixing" it.  */
static char strings[256][2] = {
        "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
        "\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
        "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
        "\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
        "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
        "\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
        "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
        "\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
        "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
        "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
        "\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
        "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
        "\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
        "\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
        "\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
        "\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
        "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
        "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
        "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
        "\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
        "\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
        "\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
        "\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
        "\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
        "\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
        "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
        "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
        "\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
        "\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
        "\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
        "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
 107
 108 static void
 109 free_translation_table(struct conv_table *p)
 110 {
 111         int i;
 112
 113         for (i = 0; i < 256; i++)
 114                 if (p[i].t)
 115                         free_translation_table(p[i].u.tbl);
 116
 117         mem_free(p);
 118 }
 119
 120 static unsigned char *no_str = "*";
 121
 122 static void
 123 new_translation_table(struct conv_table *p)
 124 {
 125         int i;
 126
 127         for (i = 0; i < 256; i++)
 128                 if (p[i].t)
 129                         free_translation_table(p[i].u.tbl);
 130         for (i = 0; i < 128; i++) {
 131                 p[i].t = 0;
 132                 p[i].u.str = strings[i];
 133         }
 134         for (; i < 256; i++) {
 135                 p[i].t = 0;
 136                 p[i].u.str = no_str;
 137         }
 138 }
 139
 140 #define BIN_SEARCH(table, entry, entries, key, result)                                  \
 141 {                                                                                       \
 142         long _s = 0, _e = (entries) - 1;                                                \
 143                                                                                         \
 144         while (_s <= _e || !((result) = -1)) {                                          \
 145                 long _m = (_s + _e) / 2;                                                \
 146                                                                                         \
 147                 if ((table)[_m].entry == (key)) {                                       \
 148                         (result) = _m;                                                  \
 149                         break;                                                          \
 150                 }                                                                       \
 151                 if ((table)[_m].entry > (key)) _e = _m - 1;                             \
 152                 if ((table)[_m].entry < (key)) _s = _m + 1;                             \
 153         }                                                                               \
 154 }                                                                                       \
 155
/* Substitutes that u2cp_() uses for the code points 0x80...0x9F:
 * entry [u - 0x80] is the Unicode character that is converted in
 * place of u.  A zero entry means there is no substitute, in which
 * case u2cp_() returns NULL.  */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};

/* Flag bit that may be set in codepage numbers.  The functions in this
 * file clear it before using a codepage number as an index into
 * codepages[].  */
#define SYSTEM_CHARSET_FLAG 128
/* True if @cp_ptr (a pointer to struct codepage_desc) describes UTF-8,
 * which is recognized by its alias list being aliases_utf8.  */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
 165
 166 unsigned char *
 167 u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
 168 {
 169         int j;
 170         int s;
 171
 172         if (u < 128) return strings[u];
 173
 174         to &= ~SYSTEM_CHARSET_FLAG;
 175
 176 #ifdef CONFIG_UTF8
 177         if (is_cp_ptr_utf8(&codepages[to]))
 178                 return encode_utf8(u);
 179 #endif /* CONFIG_UTF8 */
 180
 181         /* To mark non breaking spaces, we use a special char NBSP_CHAR. */
 182         if (u == 0xa0) {
 183                 if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
 184                 else /* NBSP_MODE_ASCII */ return " ";
 185         }
 186         if (u == 0xad) return "";
 187
 188         if (u < 0xa0) {
 189                 unicode_val_T strange = strange_chars[u - 0x80];
 190
 191                 if (!strange) return NULL;
 192                 return u2cp_(strange, to, nbsp_mode);
 193         }
 194
 195         if (u < 0xFFFF)
 196                 for (j = 0; j < 0x80; j++)
 197                         if (codepages[to].highhalf[j] == u)
 198                                 return strings[0x80 + j];
 199         for (j = 0; codepages[to].table[j].c; j++)
 200                 if (codepages[to].table[j].u == u)
 201                         return strings[codepages[to].table[j].c];
 202
 203         BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
 204         if (s != -1) return unicode_7b[s].s;
 205
 206         return no_str;
 207 }
 208
 209 static unsigned char utf_buffer[7];
 210
 211 #ifdef CONFIG_UTF8
 212 inline unsigned char *
 213 encode_utf8(unicode_val_T u)
 214 #else
 215 static unsigned char *
 216 encode_utf8(unicode_val_T u)
 217 #endif /* CONFIG_UTF8 */
 218 {
 219         memset(utf_buffer, 0, 7);
 220
 221         if (u < 0x80)
 222                 utf_buffer[0] = u;
 223         else if (u < 0x800)
 224                 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
 225                 utf_buffer[1] = 0x80 | (u & 0x3f);
 226         else if (u < 0x10000)
 227                 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
 228                 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
 229                 utf_buffer[2] = 0x80 | (u & 0x3f);
 230         else if (u < 0x200000)
 231                 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
 232                 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
 233                 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
 234                 utf_buffer[3] = 0x80 | (u & 0x3f);
 235         else if (u < 0x4000000)
 236                 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
 237                 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
 238                 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
 239                 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
 240                 utf_buffer[4] = 0x80 | (u & 0x3f);
 241         else    utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
 242                 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
 243                 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
 244                 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
 245                 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
 246                 utf_buffer[5] = 0x80 | (u & 0x3f);
 247
 248         return utf_buffer;
 249 }
 250
 251 #ifdef CONFIG_UTF8
 252 /* Number of bytes utf8 character indexed by first byte. Illegal bytes are
 253  * equal ones and handled different. */
 254 static char utf8char_len_tab[256] = {
 255         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 256         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 257         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 258         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 259         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 260         1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
 261         2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
 262         3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
 263 };
 264
 265 inline int utf8charlen(const unsigned char *p)
 266 {
 267         return p ? utf8char_len_tab[*p] : 0;
 268 }
 269
 270 inline int
 271 strlen_utf8(unsigned char **str)
 272 {
 273         unsigned char *s = *str;
 274         unsigned char *end = strchr(s, '\0');
 275         int x;
 276         int len;
 277
 278         for (x = 0;; x++, s += len) {
 279                 len = utf8charlen(s);
 280                 if (s + len > end) break;
 281         }
 282         *str = s;
 283         return x;
 284 }
 285
 286 #define utf8_issingle(p) (((p) & 0x80) == 0)
 287 #define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
 288
 289 /* Start from @current and move back to @pos char. This pointer return. The
 290  * most left pointer is @start. */
 291 inline unsigned char *
 292 utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
 293 {
 294         if (current == NULL || start == NULL || pos < 0)
 295                 return NULL;
 296         while (pos > 0 && current != start) {
 297                 current--;
 298                 if (utf8_islead(*current))
 299                         pos--;
 300         }
 301         return current;
 302 }
 303
 304 /* Count number of standard terminal cells needed for displaying UTF-8
 305  * character. */
 306 int
 307 utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
 308 {
 309         unicode_val_T u;
 310
 311         if (end == NULL)
 312                 end = strchr(utf8_char, '\0');
 313
 314         if(!utf8_char || !end)
 315                 return -1;
 316
 317         u = utf8_to_unicode(&utf8_char, end);
 318
 319         return unicode_to_cell(u);
 320 }
 321
 322 /* Count number of standard terminal cells needed for displaying string
 323  * with UTF-8 characters. */
 324 int
 325 utf8_ptr2cells(unsigned char *string, unsigned char *end)
 326 {
 327         int charlen, cell, cells = 0;
 328
 329         if (end == NULL)
 330                 end = strchr(string, '\0');
 331
 332         if(!string || !end)
 333                 return -1;
 334
 335         do {
 336                 charlen = utf8charlen(string);
 337                 if (string + charlen > end)
 338                         break;
 339
 340                 cell = utf8_char2cells(string, end);
 341                 if  (cell < 0)
 342                         return -1;
 343
 344                 cells += cell;
 345                 string += charlen;
 346         } while (1);
 347
 348         return cells;
 349 }
 350
 351 /* Count number of characters in string. */
 352 int
 353 utf8_ptr2chars(unsigned char *string, unsigned char *end)
 354 {
 355         int charlen, chars = 0;
 356
 357         if (end == NULL)
 358                 end = strchr(string, '\0');
 359
 360         if(!string || !end)
 361                 return -1;
 362
 363         do {
 364                 charlen = utf8charlen(string);
 365                 if (string + charlen > end)
 366                         break;
 367
 368                 chars++;
 369                 string += charlen;
 370         } while (1);
 371
 372         return chars;
 373 }
 374
 375 /*
 376  * Count number of bytes from begining of the string needed for displaying
 377  * specified number of cells.
 378  */
 379 int
 380 utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
 381 {
 382         unsigned int bytes = 0, cells = 0;
 383
 384         assert(max_cells>=0);
 385
 386         if (end == NULL)
 387                 end = strchr(string, '\0');
 388
 389         if(!string || !end)
 390                 return -1;
 391
 392         do {
 393                 int cell = utf8_char2cells(&string[bytes], end);
 394                 if (cell < 0)
 395                         return -1;
 396
 397                 cells += cell;
 398                 if (cells > max_cells)
 399                         break;
 400
 401                 bytes += utf8charlen(&string[bytes]);
 402
 403                 if (string + bytes > end) {
 404                         bytes = end - string;
 405                         break;
 406                 }
 407         } while(1);
 408
 409         return bytes;
 410 }
 411
 412 /* Take @max steps forward from @string in the specified @way, but
 413  * not going past @end.  Return the resulting address.  Store the
 414  * number of steps taken to *@count, unless @count is NULL.
 415  *
 416  * This assumes the text is valid UTF-8, and @string and @end point to
 417  * character boundaries.  If not, it doesn't crash but the results may
 418  * be inconsistent.
 419  *
 420  * This function can do some of the same jobs as utf8charlen(),
 421  * utf8_cells2bytes(), and strlen_utf8().  */
 422 unsigned char *
 423 utf8_step_forward(unsigned char *string, unsigned char *end,
 424                   int max, enum utf8_step way, int *count)
 425 {
 426         int steps = 0;
 427         unsigned char *current = string;
 428
 429         assert(string);
 430         assert(max >= 0);
 431         if_assert_failed goto invalid_arg;
 432         if (end == NULL)
 433                 end = strchr(string, '\0');
 434
 435         switch (way) {
 436         case utf8_step_characters:
 437                 while (steps < max && current < end) {
 438                         ++current;
 439                         if (utf8_islead(*current))
 440                                 ++steps;
 441                 }
 442                 break;
 443
 444         case utf8_step_cells_fewer:
 445         case utf8_step_cells_more:
 446                 while (steps < max) {
 447                         unicode_val_T u;
 448                         unsigned char *prev = current;
 449                         int width;
 450
 451                         u = utf8_to_unicode(&current, end);
 452                         if (u == UCS_NO_CHAR) {
 453                                 /* Assume the incomplete sequence
 454                                  * costs one cell.  */
 455                                 current = end;
 456                                 ++steps;
 457                                 break;
 458                         }
 459
 460                         width = unicode_to_cell(u);
 461                         if (way == utf8_step_cells_fewer
 462                             && steps + width > max) {
 463                                 /* Back off.  */
 464                                 current = prev;
 465                                 break;
 466                         }
 467                         steps += width;
 468                 }
 469                 break;
 470
 471         default:
 472                 INTERNAL("impossible enum utf8_step");
 473         }
 474
 475 invalid_arg:
 476         if (count)
 477                 *count = steps;
 478         return current;
 479 }
 480
 481 /* Take @max steps backward from @string in the specified @way, but
 482  * not going past @start.  Return the resulting address.  Store the
 483  * number of steps taken to *@count, unless @count is NULL.
 484  *
 485  * This assumes the text is valid UTF-8, and @string and @start point
 486  * to character boundaries.  If not, it doesn't crash but the results
 487  * may be inconsistent.
 488  *
 489  * This function can do some of the same jobs as utf8_prevchar().  */
 490 unsigned char *
 491 utf8_step_backward(unsigned char *string, unsigned char *start,
 492                    int max, enum utf8_step way, int *count)
 493 {
 494         int steps = 0;
 495         unsigned char *current = string;
 496
 497         assert(string);
 498         assert(start);
 499         assert(max >= 0);
 500         if_assert_failed goto invalid_arg;
 501
 502         switch (way) {
 503         case utf8_step_characters:
 504                 while (steps < max && current > start) {
 505                         --current;
 506                         if (utf8_islead(*current))
 507                                 ++steps;
 508                 }
 509                 break;
 510
 511         case utf8_step_cells_fewer:
 512         case utf8_step_cells_more:
 513                 while (steps < max) {
 514                         unsigned char *prev = current;
 515                         unsigned char *look;
 516                         unicode_val_T u;
 517                         int width;
 518
 519                         if (current <= start)
 520                                 break;
 521                         do {
 522                                 --current;
 523                         } while (current > start && !utf8_islead(*current));
 524
 525                         look = current;
 526                         u = utf8_to_unicode(&look, prev);
 527                         if (u == UCS_NO_CHAR) {
 528                                 /* Assume the incomplete sequence
 529                                  * costs one cell.  */
 530                                 width = 1;
 531                         } else
 532                                 width = unicode_to_cell(u);
 533
 534                         if (way == utf8_step_cells_fewer
 535                             && steps + width > max) {
 536                                 /* Back off.  */
 537                                 current = prev;
 538                                 break;
 539                         }
 540                         steps += width;
 541                 }
 542                 break;
 543
 544         default:
 545                 INTERNAL("impossible enum utf8_step");
 546         }
 547
 548 invalid_arg:
 549         if (count)
 550                 *count = steps;
 551         return current;
 552 }
 553
 554 /*
 555  * Find out number of standard terminal collumns needed for displaying symbol
 556  * (glyph) which represents Unicode character c.
 557  *
 558  * TODO: Use wcwidth when it is available. This seems to require:
 559  * - Make the configure script check whether <wchar.h> and wcwidth exist.
 560  * - Define _XOPEN_SOURCE and include <wchar.h>.
 561  * - Test that __STDC_ISO_10646__ is defined.  (This macro means wchar_t
 562  *   matches ISO 10646 in all locales.)
 563  * However, these do not suffice, because wcwidth depends on LC_CTYPE
 564  * in glibc-2.3.6.  For instance, wcwidth(0xff20) is -1 when LC_CTYPE
 565  * is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
 566  * <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
 567  * U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
 568  * character is apparently not supported in all locales.  Why is that?
 569  * - Perhaps there is standardese that requires supported characters
 570  *   to be convertable to multibyte form.  Then ELinks could just pick
 571  *   some UTF-8 locale for its wcwidth purposes.
 572  * - Perhaps wcwidth can even return different nonnegative values for
 573  *   the same ISO 10646 character in different locales.  Then ELinks
 574  *   would have to set LC_CTYPE to match at least the terminal's
 575  *   charset (which may differ from the LC_CTYPE environment variable,
 576  *   especially when the master process is serving a slave terminal).
 577  *   But there is no guarantee that the libc supports all the same
 578  *   charsets as ELinks does.
 579  * For now, it seems safest to avoid the potentially locale-dependent
 580  * libc version of wcwidth, and instead use a hardcoded mapping.
 581  *
 582  * @return      2 for double-width glyph, 1 for others.
 583  *              TODO: May be extended to return 0 for zero-width glyphs
 584  *              (like composing, maybe unprintable too).
 585  */
 586 inline int
 587 unicode_to_cell(unicode_val_T c)
 588 {
 589         if (c >= 0x1100
 590                 && (c <= 0x115f                 /* Hangul Jamo */
 591                 || c == 0x2329
 592                 || c == 0x232a
 593                 || (c >= 0x2e80 && c <= 0xa4cf
 594                         && c != 0x303f)         /* CJK ... Yi */
 595                 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
 596                 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
 597                                                                 Ideographs */
 598                 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
 599                 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
 600                 || (c >= 0xffe0 && c <= 0xffe6)
 601                 || (c >= 0x20000 && c <= 0x2fffd)
 602                 || (c >= 0x30000 && c <= 0x3fffd)))
 603                 return 2;
 604
 605         return 1;
 606 }
 607
 608 /* Fold the case of a Unicode character, so that hotkeys in labels can
 609  * be compared case-insensitively.  It is unspecified whether the
 610  * result will be in upper or lower case.  */
 611 unicode_val_T
 612 unicode_fold_label_case(unicode_val_T c)
 613 {
 614 #if __STDC_ISO_10646__ && HAVE_WCTYPE_H
 615         return towlower(c);
 616 #else  /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 617         /* For now, this supports only ASCII.  It would be possible to
 618          * use code generated from CaseFolding.txt of Unicode if the
 619          * acknowledgements required by http://www.unicode.org/copyright.html
 620          * were added to associated documentation of ELinks.  */
 621         if (c >= 0x41 && c <= 0x5A)
 622                 return c + 0x20;
 623         else
 624                 return c;
 625 #endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
 626 }
 627
 628 inline unicode_val_T
 629 utf8_to_unicode(unsigned char **string, unsigned char *end)
 630 {
 631         unsigned char *str = *string;
 632         unicode_val_T u;
 633         int length;
 634
 635         length = utf8char_len_tab[str[0]];
 636
 637         if (str + length > end) {
 638                 return UCS_NO_CHAR;
 639         }
 640
 641         switch (length) {
 642                 case 1:
 643                         u = str[0];
 644                         break;
 645                 case 2:
 646                         u = (str[0] & 0x1f) << 6;
 647                         u += (str[1] & 0x3f);
 648                         break;
 649                 case 3:
 650                         u = (str[0] & 0x0f) << 12;
 651                         u += ((str[1] & 0x3f) << 6);
 652                         u += (str[2] & 0x3f);
 653                         break;
 654                 case 4:
 655                         u = (str[0] & 0x0f) << 18;
 656                         u += ((str[1] & 0x3f) << 12);
 657                         u += ((str[2] & 0x3f) << 6);
 658                         u += (str[3] & 0x3f);
 659                         break;
 660                 case 5:
 661                         u = (str[0] & 0x0f) << 24;
 662                         u += ((str[1] & 0x3f) << 18);
 663                         u += ((str[2] & 0x3f) << 12);
 664                         u += ((str[3] & 0x3f) << 6);
 665                         u += (str[4] & 0x3f);
 666                         break;
 667                 case 6:
 668                 default:
 669                         u = (str[0] & 0x01) << 30;
 670                         u += ((str[1] & 0x3f) << 24);
 671                         u += ((str[2] & 0x3f) << 18);
 672                         u += ((str[3] & 0x3f) << 12);
 673                         u += ((str[4] & 0x3f) << 6);
 674                         u += (str[5] & 0x3f);
 675                         break;
 676         }
 677         *string = str + length;
 678         return u;
 679 }
 680 #endif /* CONFIG_UTF8 */
 681
 682 /* The common part of cp2u and cp2utf_8.  */
 683 static unicode_val_T
 684 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 685 {
 686         unicode_val_T u = from->highhalf[c - 0x80];
 687
 688         if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
 689         return u;
 690 }
 691
 692 /* Used for converting input from the terminal.  */
 693 unicode_val_T
 694 cp2u(int from, unsigned char c)
 695 {
 696         from &= ~SYSTEM_CHARSET_FLAG;
 697
 698         /* UTF-8 is a multibyte codepage and cannot be handled with
 699          * this function.  */
 700         assert(!is_cp_ptr_utf8(&codepages[from]));
 701         if_assert_failed return UCS_REPLACEMENT_CHARACTER;
 702
 703         if (c < 0x80) return c;
 704         else return cp2u_shared(&codepages[from], c);
 705 }
 706
 707 /* This slow and ugly code is used by the terminal utf_8_io */
 708 unsigned char *
 709 cp2utf8(int from, int c)
 710 {
 711         from &= ~SYSTEM_CHARSET_FLAG;
 712
 713         if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
 714                 return strings[c];
 715
 716         return encode_utf8(cp2u_shared(&codepages[from], c));
 717 }
 718
 719 #ifdef CONFIG_UTF8
 720 unicode_val_T
 721 cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
 722 {
 723         unicode_val_T ret;
 724
 725         if (is_cp_utf8(codepage))
 726                 return utf8_to_unicode(string, end);
 727
 728         if (*string >= end)
 729                 return UCS_NO_CHAR;
 730
 731         ret = cp2u(codepage, **string);
 732         ++*string;
 733         return ret;
 734 }
 735 #endif  /* CONFIG_UTF8 */
 736
 737
 738 static void
 739 add_utf8(struct conv_table *ct, unicode_val_T u, unsigned char *str)
 740 {
 741         unsigned char *p = encode_utf8(u);
 742
 743         while (p[1]) {
 744                 if (ct[*p].t) ct = ct[*p].u.tbl;
 745                 else {
 746                         struct conv_table *nct;
 747
 748                         assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
 749                         if_assert_failed return;
 750
 751                         nct = mem_calloc(256, sizeof(*nct));
 752                         if (!nct) return;
 753                         new_translation_table(nct);
 754                         ct[*p].t = 1;
 755                         ct[*p].u.tbl = nct;
 756                         ct = nct;
 757                 }
 758                 p++;
 759         }
 760
 761         assertm(!ct[*p].t, "bad utf encoding #2");
 762         if_assert_failed return;
 763
 764         if (ct[*p].u.str == no_str)
 765                 ct[*p].u.str = str;
 766 }
 767
 768 struct conv_table utf_table[256];
 769 int utf_table_init = 1;
 770
 771 static void
 772 free_utf_table(void)
 773 {
 774         int i;
 775
 776         for (i = 128; i < 256; i++)
 777                 mem_free(utf_table[i].u.str);
 778 }
 779
 780 static struct conv_table *
 781 get_translation_table_to_utf8(int from)
 782 {
 783         int i;
 784         static int lfr = -1;
 785
 786         if (from == -1) return NULL;
 787         from &= ~SYSTEM_CHARSET_FLAG;
 788         if (from == lfr) return utf_table;
 789         lfr = from;
 790         if (utf_table_init)
 791                 memset(utf_table, 0, sizeof(utf_table)),
 792                 utf_table_init = 0;
 793         else
 794                 free_utf_table();
 795
 796         for (i = 0; i < 128; i++)
 797                 utf_table[i].u.str = strings[i];
 798
 799         if (is_cp_ptr_utf8(&codepages[from])) {
 800                 for (i = 128; i < 256; i++)
 801                         utf_table[i].u.str = stracpy(strings[i]);
 802                 return utf_table;
 803         }
 804
 805         for (i = 128; i < 256; i++) {
 806                 unicode_val_T u = codepages[from].highhalf[i - 0x80];
 807
 808                 if (u == 0xFFFF)
 809                         utf_table[i].u.str = NULL;
 810                 else
 811                         utf_table[i].u.str = stracpy(encode_utf8(u));
 812         }
 813
 814         for (i = 0; codepages[from].table[i].c; i++) {
 815                 unicode_val_T u = codepages[from].table[i].u;
 816
 817                 if (!utf_table[codepages[from].table[i].c].u.str)
 818                         utf_table[codepages[from].table[i].c].u.str =
 819                                 stracpy(encode_utf8(u));
 820         }
 821
 822         for (i = 128; i < 256; i++)
 823                 if (!utf_table[i].u.str)
 824                         utf_table[i].u.str = stracpy(no_str);
 825
 826         return utf_table;
 827 }
 828
 829 struct conv_table table[256];
 830 static int first = 1;
 831
 832 void
 833 free_conv_table(void)
 834 {
 835         if (!utf_table_init) free_utf_table();
 836         if (first) {
 837                 memset(table, 0, sizeof(table));
 838                 first = 0;
 839         }
 840         new_translation_table(table);
 841 }
 842
 843
 844 struct conv_table *
 845 get_translation_table(int from, int to)
 846 {
 847         static int lfr = -1;
 848         static int lto = -1;
 849
 850         from &= ~SYSTEM_CHARSET_FLAG;
 851         to &= ~SYSTEM_CHARSET_FLAG;
 852         if (first) {
 853                 memset(table, 0, sizeof(table));
 854                 first = 0;
 855         }
 856         if (/*from == to ||*/ from == -1 || to == -1)
 857                 return NULL;
 858         if (is_cp_ptr_utf8(&codepages[to]))
 859                 return get_translation_table_to_utf8(from);
 860         if (from == lfr && to == lto)
 861                 return table;
 862         lfr = from;
 863         lto = to;
 864         new_translation_table(table);
 865
 866         if (is_cp_ptr_utf8(&codepages[from])) {
 867                 int i;
 868
 869                 for (i = 0x80; i <= 0xFF; i++)
 870                         if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
 871                                 add_utf8(table,
 872                                          codepages[to].highhalf[i - 0x80],
 873                                          strings[i]);
 874
 875                 for (i = 0; codepages[to].table[i].c; i++)
 876                         add_utf8(table, codepages[to].table[i].u,
 877                                  strings[codepages[to].table[i].c]);
 878
 879                 for (i = 0; unicode_7b[i].x != -1; i++)
 880                         if (unicode_7b[i].x >= 0x80)
 881                                 add_utf8(table, unicode_7b[i].x,
 882                                          unicode_7b[i].s);
 883
 884         } else {
 885                 int i;
 886
 887                 for (i = 128; i < 256; i++) {
 888                         if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
 889                                 unsigned char *u;
 890
 891                                 u = u2cp(codepages[from].highhalf[i - 0x80], to);
 892                                 if (u) table[i].u.str = u;
 893                         }
 894                 }
 895         }
 896
 897         return table;
 898 }
 899
 900 static inline int
 901 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
 902 {
 903         while (l2) {
 904                 if (*s1 > *s2) return 1;
 905                 if (*s1 < *s2) return -1;
 906                 s1++;
 907                 s2++;
 908                 l2--;
 909         }
 910
 911         return *s2 ? -1 : 0;
 912 }
 913
 914 /* Entity cache debugging purpose. */
 915 #if 0
 916 #define DEBUG_ENTITY_CACHE
 917 #else
 918 #undef DEBUG_ENTITY_CACHE
 919 #endif
 920
 921 struct entity_cache {
 922         unsigned int hits;
 923         int strlen;
 924         int encoding;
 925         unsigned char *result;
 926         unsigned char str[20]; /* Suffice in any case. */
 927 };
 928
 929 static int
 930 hits_cmp(struct entity_cache *a, struct entity_cache *b)
 931 {
 932         if (a->hits == b->hits) return 0;
 933         if (a->hits > b->hits) return -1;
 934         else return 1;
 935 }
 936
 937 static int
 938 compare_entities(const void *key_, const void *element_)
 939 {
 940         struct string *key = (struct string *) key_;
 941         struct entity *element = (struct entity *) element_;
 942         int length = key->length;
 943         unsigned char *first = key->source;
 944         unsigned char *second = element->s;
 945
 946         return xxstrcmp(first, second, length);
 947 }
 948
 949 unsigned char *
 950 get_entity_string(const unsigned char *str, const int strlen, int encoding)
 951 {
 952 #define ENTITY_CACHE_SIZE 10    /* 10 seems a good value. */
 953 #define ENTITY_CACHE_MAXLEN 9   /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
 954                                    will go in [0] table */
 955         static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
 956         static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
 957         static int first_time = 1;
 958         unsigned int slen = 0;
 959         unsigned char *result = NULL;
 960
 961         if (strlen <= 0) return NULL;
 962
 963 #ifdef CONFIG_UTF8
 964         /* TODO: caching UTF-8 */
 965         encoding &= ~SYSTEM_CHARSET_FLAG;
 966         if (is_cp_ptr_utf8(&codepages[encoding]))
 967                 goto skip;
 968 #endif /* CONFIG_UTF8 */
 969
 970         if (first_time) {
 971                 memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
 972                 first_time = 0;
 973         }
 974
 975         /* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
 976          * + google + slashdot + websites that result from a search for test on google,
 977          * + various ones) show a quite impressive improvment:
 978          * Top ten is:
 979          * 0: hits=2459 l=4 st='nbsp'
 980          * 1: hits=2152 l=6 st='eacute'
 981          * 2: hits=235 l=6 st='egrave'
 982          * 3: hits=136 l=6 st='agrave'
 983          * 4: hits=100 l=3 st='amp'
 984          * 5: hits=40 l=5 st='laquo'
 985          * 6: hits=8 l=4 st='copy'
 986          * 7: hits=5 l=2 st='gt'
 987          * 8: hits=2 l=2 st='lt'
 988          * 9: hits=1 l=6 st='middot'
 989          *
 990          * Most of the time cache hit ratio is near 95%.
 991          *
 992          * A long test shows: 15186 hits vs. 24 misses and mean iteration
 993          * count is kept < 2 (worst case 1.58). Not so bad ;)
 994          *
 995          * --Zas */
 996
 997         /* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
 998         slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
 999
1000         if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
1001                 int i;
1002
1003                 for (i = 0; i < nb_entity_cache[slen]; i++) {
1004                         if (entity_cache[slen][i].encoding == encoding
1005                             && !memcmp(str, entity_cache[slen][i].str, strlen)) {
1006 #ifdef DEBUG_ENTITY_CACHE
1007                                 static double total_iter = 0;
1008                                 static unsigned long hit_count = 0;
1009
1010                                 total_iter += i + 1;
1011                                 hit_count++;
1012                                 fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
1013 #endif
1014                                 if (entity_cache[slen][i].hits < (unsigned int) ~0)
1015                                         entity_cache[slen][i].hits++;
1016                                 return entity_cache[slen][i].result;
1017                         }
1018                 }
1019 #ifdef DEBUG_ENTITY_CACHE
1020                 fprintf(stderr, "miss\n");
1021 #endif
1022         }
1023 #ifdef CONFIG_UTF8
1024 skip:
1025 #endif /* CONFIG_UTF8 */
1026         if (*str == '#') { /* Numeric entity. */
1027                 int l = (int) strlen;
1028                 unsigned char *st = (unsigned char *) str;
1029                 unicode_val_T n = 0;
1030
1031                 if (l == 1) goto end; /* &#; ? */
1032                 st++, l--;
1033                 if ((*st | 32) == 'x') { /* Hexadecimal */
1034
1035                         if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
1036                         st++, l--;
1037                         do {
1038                                 unsigned char c = (*(st++) | 32);
1039
1040                                 if (isdigit(c))
1041                                         n = (n << 4) | (c - '0');
1042                                 else if (isxdigit(c))
1043                                         n = (n << 4) | (c - 'a' + 10);
1044                                 else
1045                                         goto end; /* Bad char. */
1046                         } while (--l);
1047                 } else { /* Decimal */
1048                         if (l > 10) goto end; /* 4294967295 max. */
1049                         do {
1050                                 unsigned char c = *(st++);
1051
1052                                 if (isdigit(c))
1053                                         n = n * 10 + c - '0';
1054                                 else
1055                                         goto end; /* Bad char. */
1056                                 /* Limit to 0xFFFFFFFF. */
1057                                 if (n >= (unicode_val_T) 0xFFFFFFFFu)
1058                                         goto end;
1059                         } while (--l);
1060                 }
1061
1062                 result = u2cp(n, encoding);
1063
1064 #ifdef DEBUG_ENTITY_CACHE
1065                 fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
1066 #endif
1067         } else { /* Text entity. */
1068                 struct string key = INIT_STRING((unsigned char *) str, strlen);
1069                 struct entity *element = bsearch((void *) &key, entities,
1070                                                  N_ENTITIES,
1071                                                  sizeof(*element),
1072                                                  compare_entities);
1073
1074                 if (element) result = u2cp(element->c, encoding);
1075         }
1076
1077 #ifdef CONFIG_UTF8
1078         if (is_cp_ptr_utf8(&codepages[encoding])) {
1079                 return result;
1080         }
1081 #endif /* CONFIG_UTF8 */
1082 end:
1083         /* Take care of potential buffer overflow. */
1084         if (strlen < sizeof(entity_cache[slen][0].str)) {
1085                 struct entity_cache *ece = &entity_cache[slen][nb_entity_cache[slen]];
1086
1087                 /* Copy new entry to cache. */
1088                 ece->hits = 1;
1089                 ece->strlen = strlen;
1090                 ece->encoding = encoding;
1091                 ece->result = result;
1092                 memcpy(ece->str, str, strlen);
1093                 ece->str[strlen] = '\0';
1094
1095                 /* Increment number of cache entries if possible. */
1096                 if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
1097
1098 #ifdef DEBUG_ENTITY_CACHE
1099                 fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
1100                                 entity_cache[slen][0].strlen, entity_cache[slen][0].str);
1101
1102 #endif
1103
1104                 /* Sort entries by hit order. */
1105                 if (nb_entity_cache[slen] > 1)
1106                         qsort(&entity_cache[slen][0], nb_entity_cache[slen],
1107                               sizeof(entity_cache[slen][0]), (void *) hits_cmp);
1108
1109 #ifdef DEBUG_ENTITY_CACHE
1110         {
1111                 unsigned int i;
1112
1113                 fprintf(stderr, "- Cache entries [%u] -\n", slen);
1114                 for (i = 0; i < nb_entity_cache[slen] ; i++)
1115                         fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
1116                                 entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
1117                                 entity_cache[slen][i].str);
1118                 fprintf(stderr, "-----------------\n");
1119         }
1120 #endif
1121         }
1122         return result;
1123 }
1124
1125 unsigned char *
1126 convert_string(struct conv_table *convert_table,
1127                unsigned char *chars, int charslen, int cp,
1128                enum convert_string_mode mode, int *length,
1129                void (*callback)(void *data, unsigned char *buf, int buflen),
1130                void *callback_data)
1131 {
1132         unsigned char *buffer;
1133         int bufferpos = 0;
1134         int charspos = 0;
1135
1136         if (!convert_table && !memchr(chars, '&', charslen)) {
1137                 if (callback) {
1138                         if (charslen) callback(callback_data, chars, charslen);
1139                         return NULL;
1140                 } else {
1141                         return memacpy(chars, charslen);
1142                 }
1143         }
1144
1145         /* Buffer allocation */
1146
1147         buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
1148         if (!buffer) return NULL;
1149
1150         /* Iterate ;-) */
1151
1152         while (charspos < charslen) {
1153                 unsigned char *translit;
1154
1155 #define PUTC do { \
1156                 buffer[bufferpos++] = chars[charspos++]; \
1157                 translit = ""; \
1158                 goto flush; \
1159         } while (0)
1160
1161                 if (chars[charspos] != '&') {
1162                         struct conv_table *t;
1163                         int i;
1164
1165                         if (chars[charspos] < 128 || !convert_table) PUTC;
1166
1167                         t = convert_table;
1168                         i = charspos;
1169
1170                         while (t[chars[i]].t) {
1171                                 t = t[chars[i++]].u.tbl;
1172                                 if (i >= charslen) PUTC;
1173                         }
1174
1175                         translit = t[chars[i]].u.str;
1176                         charspos = i + 1;
1177
1178                 } else if (mode == CSM_FORM || mode == CSM_NONE) {
1179                         PUTC;
1180
1181                 } else {
1182                         int start = charspos + 1;
1183                         int i = start;
1184
1185                         while (i < charslen
1186                                && (isasciialpha(chars[i])
1187                                    || isdigit(chars[i])
1188                                    || (chars[i] == '#')))
1189                                 i++;
1190
1191                         /* This prevents bug 213: we were expanding "entities"
1192                          * in URL query strings. */
1193                         /* XXX: But this disables &nbsp&nbsp usage, which
1194                          * appears to be relatively common! --pasky */
1195                         if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
1196                             && i > start
1197                             && !isasciialpha(chars[i]) && !isdigit(chars[i])) {
1198                                 translit = get_entity_string(&chars[start], i - start,
1199                                                       cp);
1200                                 if (chars[i] != ';') {
1201                                         /* Eat &nbsp &nbsp<foo> happily, but
1202                                          * pull back from the character after
1203                                          * entity string if it is not the valid
1204                                          * terminator. */
1205                                         i--;
1206                                 }
1207
1208                                 if (!translit) PUTC;
1209                                 charspos = i + (i < charslen);
1210                         } else PUTC;
1211                 }
1212
1213                 if (!translit[0]) continue;
1214
1215                 if (!translit[1]) {
1216                         buffer[bufferpos++] = translit[0];
1217                         translit = "";
1218                         goto flush;
1219                 }
1220
1221                 while (*translit) {
1222                         unsigned char *new;
1223
1224                         buffer[bufferpos++] = *(translit++);
1225 flush:
1226                         if (bufferpos & (ALLOC_GR - 1)) continue;
1227
1228                         if (callback) {
1229                                 buffer[bufferpos] = 0;
1230                                 callback(callback_data, buffer, bufferpos);
1231                                 bufferpos = 0;
1232                         } else {
1233                                 new = mem_realloc(buffer, bufferpos + ALLOC_GR);
1234                                 if (!new) {
1235                                         mem_free(buffer);
1236                                         return NULL;
1237                                 }
1238                                 buffer = new;
1239                         }
1240                 }
1241 #undef PUTC
1242         }
1243
1244         /* Say bye */
1245
1246         buffer[bufferpos] = 0;
1247         if (length) *length = bufferpos;
1248
1249         if (callback) {
1250                 if (bufferpos) callback(callback_data, buffer, bufferpos);
1251                 mem_free(buffer);
1252                 return NULL;
1253         } else {
1254                 return buffer;
1255         }
1256 }
1257
1258
1259 #ifndef USE_FASTFIND
1260 int
1261 get_cp_index(unsigned char *name)
1262 {
1263         int i, a;
1264         int syscp = 0;
1265
1266         if (!strcasecmp(name, "System")) {
1267 #if HAVE_LANGINFO_CODESET
1268                 name = nl_langinfo(CODESET);
1269                 syscp = SYSTEM_CHARSET_FLAG;
1270 #else
1271                 name = "us-ascii";
1272 #endif
1273         }
1274
1275         for (i = 0; codepages[i].name; i++) {
1276                 for (a = 0; codepages[i].aliases[a]; a++) {
1277                         /* In the past, we looked for the longest substring
1278                          * in all the names; it is way too expensive, though:
1279                          *
1280                          *   %   cumulative   self              self     total
1281                          *  time   seconds   seconds    calls  us/call  us/call  name
1282                          *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
1283                          *
1284                          * Anything called from redraw_screen() is in fact
1285                          * relatively expensive, even if it's called just
1286                          * once. So we will do a simple strcasecmp() here.
1287                          */
1288
1289                         if (!strcasecmp(name, codepages[i].aliases[a]))
1290                                 return i | syscp;
1291                 }
1292         }
1293
1294         if (syscp) {
1295                 return get_cp_index("us-ascii") | syscp;
1296         } else {
1297                 return -1;
1298         }
1299 }
1300
1301 #else
1302
/* Cursor of the charset enumeration: index into codepages[]... */
static unsigned int i_name = 0;
/* ...and index into that codepage's alias list. */
static unsigned int i_alias = 0;

/* Rewind the enumeration to the first alias of the first codepage. */
void
charsets_list_reset(void)
{
        i_alias = 0;
        i_name = 0;
}
1313
1314 /* Returns a pointer to a struct that contains current key and data pointers
1315  * and increment internal pointer.  It returns NULL when key is NULL. */
1316 struct fastfind_key_value *
1317 charsets_list_next(void)
1318 {
1319         static struct fastfind_key_value kv;
1320
1321         if (!codepages[i_name].name) return NULL;
1322
1323         kv.key = codepages[i_name].aliases[i_alias];
1324         kv.data = (void *) &codepages[i_name]; /* cast away const */
1325
1326         if (codepages[i_name].aliases[i_alias + 1])
1327                 i_alias++;
1328         else {
1329                 i_name++;
1330                 i_alias = 0;
1331         }
1332
1333         return &kv;
1334 }
1335
1336 static struct fastfind_index ff_charsets_index
1337         = INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
1338
1339 /* It searchs for a charset named @name or one of its aliases and
1340  * returns index for it or -1 if not found. */
1341 int
1342 get_cp_index(unsigned char *name)
1343 {
1344         const struct codepage_desc *codepage;
1345         int syscp = 0;
1346
1347         if (!strcasecmp(name, "System")) {
1348 #if HAVE_LANGINFO_CODESET
1349                 name = nl_langinfo(CODESET);
1350                 syscp = SYSTEM_CHARSET_FLAG;
1351 #else
1352                 name = "us-ascii";
1353 #endif
1354         }
1355
1356         codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
1357         if (codepage) {
1358                 assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
1359                 return (codepage - codepages) | syscp;
1360
1361         } else if (syscp) {
1362                 return get_cp_index("us-ascii") | syscp;
1363
1364         } else {
1365                 return -1;
1366         }
1367 }
1368
1369 #endif /* USE_FASTFIND */
1370
/* Build the charset name index.  Does nothing unless ELinks is compiled
 * with USE_FASTFIND. */
void
init_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif /* USE_FASTFIND */
}
1378
/* Dispose of the charset name index.  Does nothing unless ELinks is
 * compiled with USE_FASTFIND. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
        fastfind_done(&ff_charsets_index);
#endif /* USE_FASTFIND */
}
1386
1387 unsigned char *
1388 get_cp_name(int cp_index)
1389 {
1390         if (cp_index < 0) return "none";
1391         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1392
1393         return codepages[cp_index].name;
1394 }
1395
1396 unsigned char *
1397 get_cp_mime_name(int cp_index)
1398 {
1399         if (cp_index < 0) return "none";
1400         if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
1401         if (!codepages[cp_index].aliases) return NULL;
1402
1403         return codepages[cp_index].aliases[0];
1404 }
1405
1406 int
1407 is_cp_utf8(int cp_index)
1408 {
1409         cp_index &= ~SYSTEM_CHARSET_FLAG;
1410         return is_cp_ptr_utf8(&codepages[cp_index]);
1411 }