tools/wrc/utils.c

   1 /*
   2  * Utility routines
   3  *
   4  * Copyright 1998 Bertho A. Stultiens
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  19  */
  20
  21 #include "config.h"
  22
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  29
  30 #include "../tools.h"
  31 #include "wrc.h"
  32 #include "utils.h"
  33 #include "parser.h"
  34
  35 /* #define WANT_NEAR_INDICATION */
  36
  37 #ifdef WANT_NEAR_INDICATION
  38 void make_print(char *str)
  39 {
  40         while(*str)
  41         {
  42                 if(!isprint(*str))
  43                         *str = ' ';
  44                 str++;
  45         }
  46 }
  47 #endif
  48
  49 static void generic_msg(const char *s, const char *t, const char *n, va_list ap)
  50 {
  51         fprintf(stderr, "%s:%d:%d: %s: ", input_name ? input_name : "stdin", line_number, char_number, t);
  52         vfprintf(stderr, s, ap);
  53 #ifdef WANT_NEAR_INDICATION
  54         {
  55                 char *cpy;
  56                 if(n)
  57                 {
  58                         cpy = xstrdup(n);
  59                         make_print(cpy);
  60                         fprintf(stderr, " near '%s'", cpy);
  61                         free(cpy);
  62                 }
  63         }
  64 #endif
  65 }
  66
  67
  68 int parser_error(const char *s, ...)
  69 {
  70         va_list ap;
  71         va_start(ap, s);
  72         generic_msg(s, "Error", parser_text, ap);
  73         fputc( '\n', stderr );
  74         va_end(ap);
  75         exit(1);
  76         return 1;
  77 }
  78
  79 int parser_warning(const char *s, ...)
  80 {
  81         va_list ap;
  82         va_start(ap, s);
  83         generic_msg(s, "Warning", parser_text, ap);
  84         va_end(ap);
  85         return 0;
  86 }
  87
  88 void fatal_perror( const char *msg, ... )
  89 {
  90         va_list valist;
  91         va_start( valist, msg );
  92         fprintf(stderr, "Error: ");
  93         vfprintf( stderr, msg, valist );
  94         perror( " " );
  95         va_end( valist );
  96         exit(2);
  97 }
  98
  99 void error(const char *s, ...)
 100 {
 101         va_list ap;
 102         va_start(ap, s);
 103         fprintf(stderr, "Error: ");
 104         vfprintf(stderr, s, ap);
 105         va_end(ap);
 106         exit(2);
 107 }
 108
 109 void warning(const char *s, ...)
 110 {
 111         va_list ap;
 112         va_start(ap, s);
 113         fprintf(stderr, "Warning: ");
 114         vfprintf(stderr, s, ap);
 115         va_end(ap);
 116 }
 117
 118 void chat(const char *s, ...)
 119 {
 120         if(debuglevel & DEBUGLEVEL_CHAT)
 121         {
 122                 va_list ap;
 123                 va_start(ap, s);
 124                 fprintf(stderr, "FYI: ");
 125                 vfprintf(stderr, s, ap);
 126                 va_end(ap);
 127         }
 128 }
 129
 130 int compare_striA( const char *str1, const char *str2 )
 131 {
 132     for (;;)
 133     {
 134         /* only the A-Z range is case-insensitive */
 135         char ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : *str1;
 136         char ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 137         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 138         str1++;
 139         str2++;
 140     }
 141 }
 142
 143 int compare_striW( const WCHAR *str1, const WCHAR *str2 )
 144 {
 145     for (;;)
 146     {
 147         /* only the A-Z range is case-insensitive */
 148         WCHAR ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : *str1;
 149         WCHAR ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 150         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 151         str1++;
 152         str2++;
 153     }
 154 }
 155
 156 int compare_striAW( const char *str1, const WCHAR *str2 )
 157 {
 158     for (;;)
 159     {
 160         /* only the A-Z range is case-insensitive */
 161         WCHAR ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : (unsigned char)*str1;
 162         WCHAR ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 163         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 164         str1++;
 165         str2++;
 166     }
 167 }
 168
 169 /*
 170  *****************************************************************************
 171  * Function     : compare_name_id
 172  * Syntax       : int compare_name_id(const name_id_t *n1, const name_id_t *n2)
 173  * Input        :
 174  * Output       :
 175  * Description  :
 176  * Remarks      :
 177  *****************************************************************************
 178 */
 179 int compare_name_id(const name_id_t *n1, const name_id_t *n2)
 180 {
 181     if (n1->type != n2->type) return n1->type == name_ord ? 1 : -1;
 182     if (n1->type == name_ord) return n1->name.i_name - n2->name.i_name;
 183
 184     if (n1->name.s_name->type == str_char)
 185     {
 186         if (n2->name.s_name->type == str_char)
 187             return compare_striA(n1->name.s_name->str.cstr, n2->name.s_name->str.cstr);
 188         return compare_striAW(n1->name.s_name->str.cstr, n2->name.s_name->str.wstr);
 189     }
 190     else
 191     {
 192         if (n2->name.s_name->type == str_char)
 193             return -compare_striAW(n2->name.s_name->str.cstr, n1->name.s_name->str.wstr);
 194         return compare_striW(n1->name.s_name->str.wstr, n2->name.s_name->str.wstr);
 195     }
 196 }
 197
 198 #ifdef _WIN32
 199
 200 int is_valid_codepage(int id)
 201 {
 202     return IsValidCodePage( id );
 203 }
 204
 205 static WCHAR *codepage_to_unicode( int codepage, const char *src, int srclen, int *dstlen )
 206 {
 207     WCHAR *dst = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 208     DWORD ret = MultiByteToWideChar( codepage, MB_ERR_INVALID_CHARS, src, srclen, dst, srclen );
 209     if (!ret) return NULL;
 210     dst[ret] = 0;
 211     *dstlen = ret;
 212     return dst;
 213 }
 214
 215 #else  /* _WIN32 */
 216
/* Conversion tables of one codepage, as filled in by init_nls_info(). */
struct nls_info
{
    unsigned short  codepage;      /* codepage number; 0 marks an unused nlsinfo[] slot */
    unsigned short  unidef;        /* Unicode value produced for characters without a mapping */
    unsigned short  trans_unidef;  /* the one codepage character that legitimately maps to unidef */
    unsigned short *cp2uni;        /* 256 entries: byte value -> Unicode value */
    unsigned short *dbcs_offsets;  /* NULL for single-byte codepages; otherwise indexed by lead byte,
                                    * a non-zero entry is the offset (from dbcs_offsets) of the
                                    * table indexed by the trail byte */
};

/* cache of the codepages loaded so far; used slots are contiguous from index 0 */
static struct nls_info nlsinfo[128];
 227
 228 static void init_nls_info( struct nls_info *info, unsigned short *ptr )
 229 {
 230     unsigned short hdr_size = ptr[0];
 231
 232     info->codepage      = ptr[1];
 233     info->unidef        = ptr[4];
 234     info->trans_unidef  = ptr[6];
 235     ptr += hdr_size;
 236     info->cp2uni = ++ptr;
 237     ptr += 256;
 238     if (*ptr++) ptr += 256;  /* glyph table */
 239     info->dbcs_offsets  = *ptr ? ptr + 1 : NULL;
 240 }
 241
 242 static const struct nls_info *get_nls_info( unsigned int codepage )
 243 {
 244     unsigned short *data;
 245     char *path;
 246     unsigned int i;
 247     size_t size;
 248
 249     for (i = 0; i < ARRAY_SIZE(nlsinfo) && nlsinfo[i].codepage; i++)
 250         if (nlsinfo[i].codepage == codepage) return &nlsinfo[i];
 251
 252     assert( i < ARRAY_SIZE(nlsinfo) );
 253
 254     for (i = 0; nlsdirs[i]; i++)
 255     {
 256         path = strmake( "%s/c_%03u.nls", nlsdirs[i], codepage );
 257         if ((data = read_file( path, &size )))
 258         {
 259             free( path );
 260             init_nls_info( &nlsinfo[i], data );
 261             return &nlsinfo[i];
 262         }
 263         free( path );
 264     }
 265     return NULL;
 266 }
 267
 268 int is_valid_codepage(int cp)
 269 {
 270     return cp == CP_UTF8 || get_nls_info( cp );
 271 }
 272
 273 static WCHAR *codepage_to_unicode( int codepage, const char *src, int srclen, int *dstlen )
 274 {
 275     const struct nls_info *info = get_nls_info( codepage );
 276     unsigned int i;
 277     WCHAR dbch, *dst = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 278
 279     if (!info) error( "codepage %u not supported\n", codepage );
 280
 281     if (info->dbcs_offsets)
 282     {
 283         for (i = 0; srclen; i++, srclen--, src++)
 284         {
 285             unsigned short off = info->dbcs_offsets[(unsigned char)*src];
 286             if (off)
 287             {
 288                 if (srclen == 1) return NULL;
 289                 dbch = (src[0] << 8) | (unsigned char)src[1];
 290                 src++;
 291                 srclen--;
 292                 dst[i] = info->dbcs_offsets[off + (unsigned char)*src];
 293                 if (dst[i] == info->unidef && dbch != info->trans_unidef) return NULL;
 294             }
 295             else
 296             {
 297                 dst[i] = info->cp2uni[(unsigned char)*src];
 298                 if (dst[i] == info->unidef && *src != info->trans_unidef) return NULL;
 299             }
 300         }
 301     }
 302     else
 303     {
 304         for (i = 0; i < srclen; i++)
 305         {
 306             dst[i] = info->cp2uni[(unsigned char)src[i]];
 307             if (dst[i] == info->unidef && src[i] != info->trans_unidef) return NULL;
 308         }
 309     }
 310     dst[i] = 0;
 311     *dstlen = i;
 312     return dst;
 313 }
 314
 315 #endif  /* _WIN32 */
 316
 317 static WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen )
 318 {
 319     static const char utf8_length[128] =
 320     {
 321         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
 322         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
 323         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
 324         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
 325         0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
 326         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
 327         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
 328         3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
 329     };
 330     static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
 331
 332     const char *srcend = src + srclen;
 333     int len, res;
 334     WCHAR *ret, *dst;
 335
 336     dst = ret = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 337     while (src < srcend)
 338     {
 339         unsigned char ch = *src++;
 340         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
 341         {
 342             *dst++ = ch;
 343             continue;
 344         }
 345         len = utf8_length[ch - 0x80];
 346         if (len && src + len <= srcend)
 347         {
 348             res = ch & utf8_mask[len];
 349             switch (len)
 350             {
 351             case 3:
 352                 if ((ch = *src ^ 0x80) >= 0x40) break;
 353                 res = (res << 6) | ch;
 354                 src++;
 355                 if (res < 0x10) break;
 356             case 2:
 357                 if ((ch = *src ^ 0x80) >= 0x40) break;
 358                 res = (res << 6) | ch;
 359                 if (res >= 0x110000 >> 6) break;
 360                 src++;
 361                 if (res < 0x20) break;
 362                 if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break;
 363             case 1:
 364                 if ((ch = *src ^ 0x80) >= 0x40) break;
 365                 res = (res << 6) | ch;
 366                 src++;
 367                 if (res < 0x80) break;
 368                 if (res <= 0xffff) *dst++ = res;
 369                 else
 370                 {
 371                     res -= 0x10000;
 372                     *dst++ = 0xd800 | (res >> 10);
 373                     *dst++ = 0xdc00 | (res & 0x3ff);
 374                 }
 375                 continue;
 376             }
 377         }
 378         *dst++ = 0xfffd;
 379     }
 380     *dst = 0;
 381     *dstlen = dst - ret;
 382     return ret;
 383 }
 384
 385 static char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen )
 386 {
 387     char *ret, *dst;
 388
 389     dst = ret = xmalloc( srclen * 3 + 1 );
 390     for ( ; srclen; srclen--, src++)
 391     {
 392         unsigned int ch = *src;
 393
 394         if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
 395         {
 396             *dst++ = ch;
 397             continue;
 398         }
 399         if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
 400         {
 401             dst[1] = 0x80 | (ch & 0x3f);
 402             ch >>= 6;
 403             dst[0] = 0xc0 | ch;
 404             dst += 2;
 405             continue;
 406         }
 407         if (ch >= 0xd800 && ch <= 0xdbff && srclen > 1 && src[1] >= 0xdc00 && src[1] <= 0xdfff)
 408         {
 409             /* 0x10000-0x10ffff: 4 bytes */
 410             ch = 0x10000 + ((ch & 0x3ff) << 10) + (src[1] & 0x3ff);
 411             dst[3] = 0x80 | (ch & 0x3f);
 412             ch >>= 6;
 413             dst[2] = 0x80 | (ch & 0x3f);
 414             ch >>= 6;
 415             dst[1] = 0x80 | (ch & 0x3f);
 416             ch >>= 6;
 417             dst[0] = 0xf0 | ch;
 418             dst += 4;
 419             src++;
 420             srclen--;
 421             continue;
 422         }
 423         if (ch >= 0xd800 && ch <= 0xdfff) ch = 0xfffd;  /* invalid surrogate pair */
 424
 425         /* 0x800-0xffff: 3 bytes */
 426         dst[2] = 0x80 | (ch & 0x3f);
 427         ch >>= 6;
 428         dst[1] = 0x80 | (ch & 0x3f);
 429         ch >>= 6;
 430         dst[0] = 0xe0 | ch;
 431         dst += 3;
 432     }
 433     *dst = 0;
 434     *dstlen = dst - ret;
 435     return ret;
 436 }
 437
 438 string_t *convert_string_unicode( const string_t *str, int codepage )
 439 {
 440     string_t *ret = xmalloc(sizeof(*ret));
 441
 442     ret->type = str_unicode;
 443     ret->loc = str->loc;
 444
 445     if (str->type == str_char)
 446     {
 447         if (!codepage) parser_error( "Current language is Unicode only, cannot convert string" );
 448
 449         if (codepage == CP_UTF8)
 450             ret->str.wstr = utf8_to_unicode( str->str.cstr, str->size, &ret->size );
 451         else
 452             ret->str.wstr = codepage_to_unicode( codepage, str->str.cstr, str->size, &ret->size );
 453         if (!ret->str.wstr) parser_error( "Invalid character in string '%.*s' for codepage %u",
 454                                           str->size, str->str.cstr, codepage );
 455     }
 456     else
 457     {
 458         ret->size     = str->size;
 459         ret->str.wstr = xmalloc(sizeof(WCHAR)*(ret->size+1));
 460         memcpy( ret->str.wstr, str->str.wstr, ret->size * sizeof(WCHAR) );
 461         ret->str.wstr[ret->size] = 0;
 462     }
 463     return ret;
 464 }
 465
 466 char *convert_string_utf8( const string_t *str, int codepage )
 467 {
 468     int len;
 469     string_t *wstr = convert_string_unicode( str, codepage );
 470     char *ret = unicode_to_utf8( wstr->str.wstr, wstr->size, &len );
 471     free_string( wstr );
 472     return ret;
 473 }
 474
 475 void free_string(string_t *str)
 476 {
 477     if (str->type == str_unicode) free( str->str.wstr );
 478     else free( str->str.cstr );
 479     free( str );
 480 }
 481
 482 /* check if the string is valid utf8 despite a different codepage being in use */
 483 int check_valid_utf8( const string_t *str, int codepage )
 484 {
 485     int i, count;
 486     WCHAR *wstr;
 487
 488     if (!check_utf8) return 0;
 489     if (!codepage) return 0;
 490     if (codepage == CP_UTF8) return 0;
 491     if (!is_valid_codepage( codepage )) return 0;
 492
 493     for (i = count = 0; i < str->size; i++)
 494     {
 495         if ((unsigned char)str->str.cstr[i] >= 0xf5) goto done;
 496         if ((unsigned char)str->str.cstr[i] >= 0xc2) { count++; continue; }
 497         if ((unsigned char)str->str.cstr[i] >= 0x80) goto done;
 498     }
 499     if (!count) return 0;  /* no 8-bit chars at all */
 500
 501     wstr = utf8_to_unicode( str->str.cstr, str->size, &count );
 502     for (i = 0; i < count; i++) if (wstr[i] == 0xfffd) break;
 503     free( wstr );
 504     return (i == count);
 505
 506 done:
 507     check_utf8 = 0;  /* at least one 8-bit non-utf8 string found, stop checking */
 508     return 0;
 509 }
 510
 511
/* One entry of the language to codepage conversion table. */
struct lang2cp
{
    unsigned short lang;     /* primary language identifier (LANG_*) */
    unsigned short sublang;  /* sublanguage identifier (SUBLANG_*) */
    unsigned int   cp;       /* codepage number; NOTE(review): 0 presumably marks Unicode-only languages - confirm */
};
 518
/* language to codepage conversion table */
/* specific sublanguages need only be specified if their codepage */
/* differs from the default (SUBLANG_NEUTRAL) */
/* entries wrapped in #ifdef are only compiled in when that LANG_* macro is defined */
static const struct lang2cp lang2cps[] =
{
    { LANG_AFRIKAANS,      SUBLANG_NEUTRAL,              1252 },
    { LANG_ALBANIAN,       SUBLANG_NEUTRAL,              1250 },
    { LANG_ALSATIAN,       SUBLANG_NEUTRAL,              1252 },
    { LANG_AMHARIC,        SUBLANG_NEUTRAL,              0    },
    { LANG_ARABIC,         SUBLANG_NEUTRAL,              1256 },
    { LANG_ARMENIAN,       SUBLANG_NEUTRAL,              0    },
    { LANG_ASSAMESE,       SUBLANG_NEUTRAL,              0    },
    { LANG_ASTURIAN,       SUBLANG_NEUTRAL,              1252 },
    { LANG_AZERI,          SUBLANG_NEUTRAL,              1254 },
    { LANG_AZERI,          SUBLANG_AZERI_CYRILLIC,       1251 },
    { LANG_BASHKIR,        SUBLANG_NEUTRAL,              1251 },
    { LANG_BASQUE,         SUBLANG_NEUTRAL,              1252 },
    { LANG_BELARUSIAN,     SUBLANG_NEUTRAL,              1251 },
    { LANG_BENGALI,        SUBLANG_NEUTRAL,              0    },
    { LANG_BOSNIAN,        SUBLANG_NEUTRAL,              1250 },
    { LANG_BOSNIAN,        SUBLANG_BOSNIAN_BOSNIA_HERZEGOVINA_CYRILLIC, 1251 },
    { LANG_BRETON,         SUBLANG_NEUTRAL,              1252 },
    { LANG_BULGARIAN,      SUBLANG_NEUTRAL,              1251 },
    { LANG_CATALAN,        SUBLANG_NEUTRAL,              1252 },
    { LANG_CHINESE,        SUBLANG_NEUTRAL,              950  },
    { LANG_CHINESE,        SUBLANG_CHINESE_SIMPLIFIED,   936  },
    { LANG_CHINESE,        SUBLANG_CHINESE_SINGAPORE,    936  },
#ifdef LANG_CORNISH
    { LANG_CORNISH,        SUBLANG_NEUTRAL,              1252 },
#endif /* LANG_CORNISH */
    { LANG_CORSICAN,       SUBLANG_NEUTRAL,              1252 },
    { LANG_CROATIAN,       SUBLANG_NEUTRAL,              1250 },
    { LANG_CZECH,          SUBLANG_NEUTRAL,              1250 },
    { LANG_DANISH,         SUBLANG_NEUTRAL,              1252 },
    { LANG_DARI,           SUBLANG_NEUTRAL,              1256 },
    { LANG_DIVEHI,         SUBLANG_NEUTRAL,              0    },
    { LANG_DUTCH,          SUBLANG_NEUTRAL,              1252 },
    { LANG_ENGLISH,        SUBLANG_NEUTRAL,              1252 },
#ifdef LANG_ESPERANTO
    { LANG_ESPERANTO,      SUBLANG_NEUTRAL,              1252 },
#endif /* LANG_ESPERANTO */
    { LANG_ESTONIAN,       SUBLANG_NEUTRAL,              1257 },
    { LANG_FAEROESE,       SUBLANG_NEUTRAL,              1252 },
    { LANG_FILIPINO,       SUBLANG_NEUTRAL,              1252 },
    { LANG_FINNISH,        SUBLANG_NEUTRAL,              1252 },
    { LANG_FRENCH,         SUBLANG_NEUTRAL,              1252 },
    { LANG_FRISIAN,        SUBLANG_NEUTRAL,              1252 },
#ifdef LANG_MANX_GAELIC
    { LANG_MANX_GAELIC,    SUBLANG_NEUTRAL,              1252 },
#endif /* LANG_MANX_GAELIC */
    { LANG_GALICIAN,       SUBLANG_NEUTRAL,              1252 },
    { LANG_GEORGIAN,       SUBLANG_NEUTRAL,              0    },
    { LANG_GERMAN,         SUBLANG_NEUTRAL,              1252 },
    { LANG_GREEK,          SUBLANG_NEUTRAL,              1253 },
    { LANG_GREENLANDIC,    SUBLANG_NEUTRAL,              1252 },
    { LANG_GUJARATI,       SUBLANG_NEUTRAL,              0    },
    { LANG_HAUSA,          SUBLANG_NEUTRAL,              1252 },
    { LANG_HEBREW,         SUBLANG_NEUTRAL,              1255 },
    { LANG_HINDI,          SUBLANG_NEUTRAL,              0    },
    { LANG_HUNGARIAN,      SUBLANG_NEUTRAL,              1250 },
    { LANG_ICELANDIC,      SUBLANG_NEUTRAL,              1252 },
    { LANG_IGBO,           SUBLANG_NEUTRAL,              1252 },
    { LANG_INDONESIAN,     SUBLANG_NEUTRAL,              1252 },
    { LANG_INUKTITUT,      SUBLANG_NEUTRAL,              0    },
    { LANG_INUKTITUT,      SUBLANG_INUKTITUT_CANADA_LATIN, 0  },
    { LANG_INVARIANT,      SUBLANG_NEUTRAL,              0    },
    { LANG_IRISH,          SUBLANG_NEUTRAL,              1252 },
    { LANG_ITALIAN,        SUBLANG_NEUTRAL,              1252 },
    { LANG_JAPANESE,       SUBLANG_NEUTRAL,              932  },
    { LANG_KANNADA,        SUBLANG_NEUTRAL,              0    },
    { LANG_KAZAK,          SUBLANG_NEUTRAL,              1251 },
    { LANG_KHMER,          SUBLANG_NEUTRAL,              0    },
    { LANG_KICHE,          SUBLANG_NEUTRAL,              1252 },
    { LANG_KINYARWANDA,    SUBLANG_NEUTRAL,              1252 },
    { LANG_KONKANI,        SUBLANG_NEUTRAL,              0    },
    { LANG_KOREAN,         SUBLANG_NEUTRAL,              949  },
    { LANG_KYRGYZ,         SUBLANG_NEUTRAL,              1251 },
    { LANG_LAO,            SUBLANG_NEUTRAL,              0    },
    { LANG_LATVIAN,        SUBLANG_NEUTRAL,              1257 },
    { LANG_LITHUANIAN,     SUBLANG_NEUTRAL,              1257 },
    { LANG_LOWER_SORBIAN,  SUBLANG_NEUTRAL,              1252 },
    { LANG_LUXEMBOURGISH,  SUBLANG_NEUTRAL,              1252 },
    { LANG_MACEDONIAN,     SUBLANG_NEUTRAL,              1251 },
    { LANG_MALAY,          SUBLANG_NEUTRAL,              1252 },
    { LANG_MALAYALAM,      SUBLANG_NEUTRAL,              0    },
    { LANG_MALTESE,        SUBLANG_NEUTRAL,              0    },
    { LANG_MAORI,          SUBLANG_NEUTRAL,              0    },
    { LANG_MAPUDUNGUN,     SUBLANG_NEUTRAL,              1252 },
    { LANG_MARATHI,        SUBLANG_NEUTRAL,              0    },
    { LANG_MOHAWK,         SUBLANG_NEUTRAL,              1252 },
    { LANG_MONGOLIAN,      SUBLANG_NEUTRAL,              1251 },
    { LANG_NEPALI,         SUBLANG_NEUTRAL,              0    },
    { LANG_NEUTRAL,        SUBLANG_NEUTRAL,              1252 },
    { LANG_NORWEGIAN,      SUBLANG_NEUTRAL,              1252 },
    { LANG_OCCITAN,        SUBLANG_NEUTRAL,              1252 },
    { LANG_ORIYA,          SUBLANG_NEUTRAL,              0    },
    { LANG_PASHTO,         SUBLANG_NEUTRAL,              0    },
    { LANG_PERSIAN,        SUBLANG_NEUTRAL,              1256 },
    { LANG_POLISH,         SUBLANG_NEUTRAL,              1250 },
    { LANG_PORTUGUESE,     SUBLANG_NEUTRAL,              1252 },
    { LANG_PUNJABI,        SUBLANG_NEUTRAL,              0    },
    { LANG_QUECHUA,        SUBLANG_NEUTRAL,              1252 },
    { LANG_ROMANIAN,       SUBLANG_NEUTRAL,              1250 },
    { LANG_ROMANSH,        SUBLANG_NEUTRAL,              1252 },
    { LANG_RUSSIAN,        SUBLANG_NEUTRAL,              1251 },
    { LANG_SAMI,           SUBLANG_NEUTRAL,              1252 },
    { LANG_SANSKRIT,       SUBLANG_NEUTRAL,              0    },
    { LANG_SCOTTISH_GAELIC,SUBLANG_NEUTRAL,              1252 },
    { LANG_SERBIAN,        SUBLANG_NEUTRAL,              1250 },
    { LANG_SERBIAN,        SUBLANG_SERBIAN_CYRILLIC,     1251 },
    { LANG_SINHALESE,      SUBLANG_NEUTRAL,              0    },
    { LANG_SLOVAK,         SUBLANG_NEUTRAL,              1250 },
    { LANG_SLOVENIAN,      SUBLANG_NEUTRAL,              1250 },
    { LANG_SOTHO,          SUBLANG_NEUTRAL,              1252 },
    { LANG_SPANISH,        SUBLANG_NEUTRAL,              1252 },
    { LANG_SWAHILI,        SUBLANG_NEUTRAL,              1252 },
    { LANG_SWEDISH,        SUBLANG_NEUTRAL,              1252 },
    { LANG_SYRIAC,         SUBLANG_NEUTRAL,              0    },
    { LANG_TAJIK,          SUBLANG_NEUTRAL,              1251 },
    { LANG_TAMAZIGHT,      SUBLANG_NEUTRAL,              1252 },
    { LANG_TAMIL,          SUBLANG_NEUTRAL,              0    },
    { LANG_TATAR,          SUBLANG_NEUTRAL,              1251 },
    { LANG_TELUGU,         SUBLANG_NEUTRAL,              0    },
    { LANG_THAI,           SUBLANG_NEUTRAL,              874  },
    { LANG_TIBETAN,        SUBLANG_NEUTRAL,              0    },
    { LANG_TSWANA,         SUBLANG_NEUTRAL,              1252 },
    { LANG_TURKISH,        SUBLANG_NEUTRAL,              1254 },
    { LANG_TURKMEN,        SUBLANG_NEUTRAL,              1250 },
    { LANG_UIGHUR,         SUBLANG_NEUTRAL,              1256 },
    { LANG_UKRAINIAN,      SUBLANG_NEUTRAL,              1251 },
    { LANG_UPPER_SORBIAN,  SUBLANG_NEUTRAL,              1252 },
    { LANG_URDU,           SUBLANG_NEUTRAL,              1256 },
    { LANG_UZBEK,          SUBLANG_NEUTRAL,              1254 },
    { LANG_UZBEK,          SUBLANG_UZBEK_CYRILLIC,       1251 },
    { LANG_VIETNAMESE,     SUBLANG_NEUTRAL,              1258 },
#ifdef LANG_WALON
    { LANG_WALON,          SUBLANG_NEUTRAL,              1252 },
#endif /* LANG_WALON */
    { LANG_WELSH,          SUBLANG_NEUTRAL,              1252 },
    { LANG_WOLOF,          SUBLANG_NEUTRAL,              1252 },
    { LANG_XHOSA,          SUBLANG_NEUTRAL,              1252 },
    { LANG_YAKUT,          SUBLANG_NEUTRAL,              1251 },
    { LANG_YI,             SUBLANG_NEUTRAL,              0    },
    { LANG_YORUBA,         SUBLANG_NEUTRAL,              1252 },
    { LANG_ZULU,           SUBLANG_NEUTRAL,              1252 }
};
 665
 666 int get_language_codepage( unsigned short lang, unsigned short sublang )
 667 {
 668     unsigned int i;
 669     int cp = -1, defcp = -1;
 670
 671     for (i = 0; i < ARRAY_SIZE(lang2cps); i++)
 672     {
 673         if (lang2cps[i].lang != lang) continue;
 674         if (lang2cps[i].sublang == sublang)
 675         {
 676             cp = lang2cps[i].cp;
 677             break;
 678         }
 679         if (lang2cps[i].sublang == SUBLANG_NEUTRAL) defcp = lang2cps[i].cp;
 680     }
 681
 682     if (cp == -1) cp = defcp;
 683     return cp;
 684 }