tools/wrc/utils.c

   1 /*
   2  * Utility routines
   3  *
   4  * Copyright 1998 Bertho A. Stultiens
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  19  */
  20
  21 #include "config.h"
  22
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  29
  30 #include "../tools.h"
  31 #include "wrc.h"
  32 #include "winternl.h"
  33 #include "utils.h"
  34 #include "parser.h"
  35
  36 /* #define WANT_NEAR_INDICATION */
  37
  38 #ifdef WANT_NEAR_INDICATION
  39 void make_print(char *str)
  40 {
  41         while(*str)
  42         {
  43                 if(!isprint(*str))
  44                         *str = ' ';
  45                 str++;
  46         }
  47 }
  48 #endif
  49
  50 static void generic_msg(const char *s, const char *t, const char *n, va_list ap)
  51 {
  52         fprintf(stderr, "%s:%d:%d: %s: ", input_name ? input_name : "stdin", line_number, char_number, t);
  53         vfprintf(stderr, s, ap);
  54 #ifdef WANT_NEAR_INDICATION
  55         {
  56                 char *cpy;
  57                 if(n)
  58                 {
  59                         cpy = xstrdup(n);
  60                         make_print(cpy);
  61                         fprintf(stderr, " near '%s'", cpy);
  62                         free(cpy);
  63                 }
  64         }
  65 #endif
  66 }
  67
  68
  69 int parser_error(const char *s, ...)
  70 {
  71         va_list ap;
  72         va_start(ap, s);
  73         generic_msg(s, "Error", parser_text, ap);
  74         fputc( '\n', stderr );
  75         va_end(ap);
  76         exit(1);
  77         return 1;
  78 }
  79
  80 int parser_warning(const char *s, ...)
  81 {
  82         va_list ap;
  83         va_start(ap, s);
  84         generic_msg(s, "Warning", parser_text, ap);
  85         va_end(ap);
  86         return 0;
  87 }
  88
  89 void fatal_perror( const char *msg, ... )
  90 {
  91         va_list valist;
  92         va_start( valist, msg );
  93         fprintf(stderr, "Error: ");
  94         vfprintf( stderr, msg, valist );
  95         perror( " " );
  96         va_end( valist );
  97         exit(2);
  98 }
  99
 100 void error(const char *s, ...)
 101 {
 102         va_list ap;
 103         va_start(ap, s);
 104         fprintf(stderr, "Error: ");
 105         vfprintf(stderr, s, ap);
 106         va_end(ap);
 107         exit(2);
 108 }
 109
 110 void warning(const char *s, ...)
 111 {
 112         va_list ap;
 113         va_start(ap, s);
 114         fprintf(stderr, "Warning: ");
 115         vfprintf(stderr, s, ap);
 116         va_end(ap);
 117 }
 118
 119 void chat(const char *s, ...)
 120 {
 121         if(debuglevel & DEBUGLEVEL_CHAT)
 122         {
 123                 va_list ap;
 124                 va_start(ap, s);
 125                 fprintf(stderr, "FYI: ");
 126                 vfprintf(stderr, s, ap);
 127                 va_end(ap);
 128         }
 129 }
 130
 131 int compare_striA( const char *str1, const char *str2 )
 132 {
 133     for (;;)
 134     {
 135         /* only the A-Z range is case-insensitive */
 136         char ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : *str1;
 137         char ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 138         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 139         str1++;
 140         str2++;
 141     }
 142 }
 143
 144 int compare_striW( const WCHAR *str1, const WCHAR *str2 )
 145 {
 146     for (;;)
 147     {
 148         /* only the A-Z range is case-insensitive */
 149         WCHAR ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : *str1;
 150         WCHAR ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 151         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 152         str1++;
 153         str2++;
 154     }
 155 }
 156
 157 int compare_striAW( const char *str1, const WCHAR *str2 )
 158 {
 159     for (;;)
 160     {
 161         /* only the A-Z range is case-insensitive */
 162         WCHAR ch1 = (*str1 >= 'a' && *str1 <= 'z') ? *str1 + 'A' - 'a' : (unsigned char)*str1;
 163         WCHAR ch2 = (*str2 >= 'a' && *str2 <= 'z') ? *str2 + 'A' - 'a' : *str2;
 164         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 165         str1++;
 166         str2++;
 167     }
 168 }
 169
 170 /*
 171  *****************************************************************************
 172  * Function     : compare_name_id
 173  * Syntax       : int compare_name_id(const name_id_t *n1, const name_id_t *n2)
 174  * Input        :
 175  * Output       :
 176  * Description  :
 177  * Remarks      :
 178  *****************************************************************************
 179 */
 180 int compare_name_id(const name_id_t *n1, const name_id_t *n2)
 181 {
 182     if (n1->type != n2->type) return n1->type == name_ord ? 1 : -1;
 183     if (n1->type == name_ord) return n1->name.i_name - n2->name.i_name;
 184
 185     if (n1->name.s_name->type == str_char)
 186     {
 187         if (n2->name.s_name->type == str_char)
 188             return compare_striA(n1->name.s_name->str.cstr, n2->name.s_name->str.cstr);
 189         return compare_striAW(n1->name.s_name->str.cstr, n2->name.s_name->str.wstr);
 190     }
 191     else
 192     {
 193         if (n2->name.s_name->type == str_char)
 194             return -compare_striAW(n2->name.s_name->str.cstr, n1->name.s_name->str.wstr);
 195         return compare_striW(n1->name.s_name->str.wstr, n2->name.s_name->str.wstr);
 196     }
 197 }
 198
 199 #ifdef _WIN32
 200
 201 int is_valid_codepage(int id)
 202 {
 203     return IsValidCodePage( id );
 204 }
 205
 206 static WCHAR *codepage_to_unicode( int codepage, const char *src, int srclen, int *dstlen )
 207 {
 208     WCHAR *dst = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 209     DWORD ret = MultiByteToWideChar( codepage, MB_ERR_INVALID_CHARS, src, srclen, dst, srclen );
 210     if (!ret) return NULL;
 211     dst[ret] = 0;
 212     *dstlen = ret;
 213     return dst;
 214 }
 215
 216 int get_language_codepage( language_t lang )
 217 {
 218     DWORD codepage;
 219
 220     if (!lang) return 1252;
 221     if (!GetLocaleInfoW( lang, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
 222                          (WCHAR *)&codepage, sizeof(codepage)/sizeof(WCHAR) )) return -1;
 223     return codepage;
 224 }
 225
 226 language_t get_language_from_name( const char *name )
 227 {
 228     WCHAR nameW[LOCALE_NAME_MAX_LENGTH];
 229
 230     MultiByteToWideChar( 1252, 0, name, -1, nameW, ARRAY_SIZE(nameW) );
 231     return LocaleNameToLCID( nameW, LOCALE_ALLOW_NEUTRAL_NAMES );
 232 }
 233
 234 #else  /* _WIN32 */
 235
 236 struct nls_info
 237 {
 238     unsigned short  codepage;
 239     unsigned short  unidef;
 240     unsigned short  trans_unidef;
 241     unsigned short *cp2uni;
 242     unsigned short *dbcs_offsets;
 243 };
 244
 245 static struct nls_info nlsinfo[128];
 246
 247 static void init_nls_info( struct nls_info *info, unsigned short *ptr )
 248 {
 249     unsigned short hdr_size = ptr[0];
 250
 251     info->codepage      = ptr[1];
 252     info->unidef        = ptr[4];
 253     info->trans_unidef  = ptr[6];
 254     ptr += hdr_size;
 255     info->cp2uni = ++ptr;
 256     ptr += 256;
 257     if (*ptr++) ptr += 256;  /* glyph table */
 258     info->dbcs_offsets  = *ptr ? ptr + 1 : NULL;
 259 }
 260
 261 static const struct nls_info *get_nls_info( unsigned int codepage )
 262 {
 263     unsigned short *data;
 264     char *path;
 265     unsigned int i;
 266     size_t size;
 267
 268     for (i = 0; i < ARRAY_SIZE(nlsinfo) && nlsinfo[i].codepage; i++)
 269         if (nlsinfo[i].codepage == codepage) return &nlsinfo[i];
 270
 271     assert( i < ARRAY_SIZE(nlsinfo) );
 272
 273     for (i = 0; nlsdirs[i]; i++)
 274     {
 275         path = strmake( "%s/c_%03u.nls", nlsdirs[i], codepage );
 276         if ((data = read_file( path, &size )))
 277         {
 278             free( path );
 279             init_nls_info( &nlsinfo[i], data );
 280             return &nlsinfo[i];
 281         }
 282         free( path );
 283     }
 284     return NULL;
 285 }
 286
 287 int is_valid_codepage(int cp)
 288 {
 289     return cp == CP_UTF8 || get_nls_info( cp );
 290 }
 291
 292 static WCHAR *codepage_to_unicode( int codepage, const char *src, int srclen, int *dstlen )
 293 {
 294     const struct nls_info *info = get_nls_info( codepage );
 295     unsigned int i;
 296     WCHAR dbch, *dst = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 297
 298     if (!info) error( "codepage %u not supported\n", codepage );
 299
 300     if (info->dbcs_offsets)
 301     {
 302         for (i = 0; srclen; i++, srclen--, src++)
 303         {
 304             unsigned short off = info->dbcs_offsets[(unsigned char)*src];
 305             if (off)
 306             {
 307                 if (srclen == 1) return NULL;
 308                 dbch = (src[0] << 8) | (unsigned char)src[1];
 309                 src++;
 310                 srclen--;
 311                 dst[i] = info->dbcs_offsets[off + (unsigned char)*src];
 312                 if (dst[i] == info->unidef && dbch != info->trans_unidef) return NULL;
 313             }
 314             else
 315             {
 316                 dst[i] = info->cp2uni[(unsigned char)*src];
 317                 if (dst[i] == info->unidef && *src != info->trans_unidef) return NULL;
 318             }
 319         }
 320     }
 321     else
 322     {
 323         for (i = 0; i < srclen; i++)
 324         {
 325             dst[i] = info->cp2uni[(unsigned char)src[i]];
 326             if (dst[i] == info->unidef && src[i] != info->trans_unidef) return NULL;
 327         }
 328     }
 329     dst[i] = 0;
 330     *dstlen = i;
 331     return dst;
 332 }
 333
 334 static const NLS_LOCALE_LCID_INDEX *lcids_index;
 335 static const NLS_LOCALE_HEADER *locale_table;
 336 static const NLS_LOCALE_LCNAME_INDEX *lcnames_index;
 337 static const WCHAR *locale_strings;
 338
 339 static void load_locale_nls(void)
 340 {
 341     struct
 342     {
 343         unsigned int ctypes;
 344         unsigned int unknown1;
 345         unsigned int unknown2;
 346         unsigned int unknown3;
 347         unsigned int locales;
 348         unsigned int charmaps;
 349         unsigned int geoids;
 350         unsigned int scripts;
 351     } *header;
 352     char *path;
 353     unsigned int i;
 354     size_t size;
 355
 356     for (i = 0; nlsdirs[i]; i++)
 357     {
 358         path = strmake( "%s/locale.nls", nlsdirs[i] );
 359         header = read_file( path, &size );
 360         free( path );
 361         if (!header) continue;
 362         locale_table = (const NLS_LOCALE_HEADER *)((char *)header + header->locales);
 363         lcids_index = (const NLS_LOCALE_LCID_INDEX *)((char *)locale_table + locale_table->lcids_offset);
 364         lcnames_index = (const NLS_LOCALE_LCNAME_INDEX *)((char *)locale_table + locale_table->lcnames_offset);
 365         locale_strings = (const WCHAR *)((char *)locale_table + locale_table->strings_offset);
 366         return;
 367     }
 368     error( "unable to load locale.nls\n" );
 369 }
 370
 371 static int compare_locale_names( const char *n1, const WCHAR *n2 )
 372 {
 373     for (;;)
 374     {
 375         WCHAR ch1 = (unsigned char)*n1++;
 376         WCHAR ch2 = *n2++;
 377         if (ch1 >= 'a' && ch1 <= 'z') ch1 -= 'a' - 'A';
 378         if (ch2 >= 'a' && ch2 <= 'z') ch2 -= 'a' - 'A';
 379         if (!ch1 || ch1 != ch2) return ch1 - ch2;
 380     }
 381 }
 382
 383 static const NLS_LOCALE_LCNAME_INDEX *find_lcname_entry( const char *name )
 384 {
 385     int min = 0, max = locale_table->nb_lcnames - 1;
 386
 387     if (!name) return NULL;
 388     while (min <= max)
 389     {
 390         int res, pos = (min + max) / 2;
 391         const WCHAR *str = locale_strings + lcnames_index[pos].name;
 392         res = compare_locale_names( name, str + 1 );
 393         if (res < 0) max = pos - 1;
 394         else if (res > 0) min = pos + 1;
 395         else return &lcnames_index[pos];
 396     }
 397     return NULL;
 398 }
 399
 400 static const NLS_LOCALE_LCID_INDEX *find_lcid_entry( LCID lcid )
 401 {
 402     int min = 0, max = locale_table->nb_lcids - 1;
 403
 404     while (min <= max)
 405     {
 406         int pos = (min + max) / 2;
 407         if (lcid < lcids_index[pos].id) max = pos - 1;
 408         else if (lcid > lcids_index[pos].id) min = pos + 1;
 409         else return &lcids_index[pos];
 410     }
 411     return NULL;
 412 }
 413
 414 static const NLS_LOCALE_DATA *get_locale_data( UINT idx )
 415 {
 416     ULONG offset = locale_table->locales_offset + idx * locale_table->locale_size;
 417     return (const NLS_LOCALE_DATA *)((const char *)locale_table + offset);
 418 }
 419
 420 int get_language_codepage( language_t lang )
 421 {
 422     const NLS_LOCALE_LCID_INDEX *entry;
 423
 424     if (!lang) return 1252;
 425     if (lang == MAKELANGID( LANG_ENGLISH, SUBLANG_DEFAULT )) return 1252;
 426     if (!locale_table) load_locale_nls();
 427     if (!(entry = find_lcid_entry( lang ))) return -1;
 428     return get_locale_data( entry->idx )->idefaultansicodepage;
 429 }
 430
 431 language_t get_language_from_name( const char *name )
 432 {
 433     const NLS_LOCALE_LCNAME_INDEX *entry;
 434
 435     if (!locale_table) load_locale_nls();
 436     if (!(entry = find_lcname_entry( name ))) return 0;
 437     return get_locale_data( entry->idx )->unique_lcid;
 438 }
 439
 440 #endif  /* _WIN32 */
 441
 442 static WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen )
 443 {
 444     static const char utf8_length[128] =
 445     {
 446         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
 447         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
 448         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
 449         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
 450         0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
 451         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
 452         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
 453         3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
 454     };
 455     static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
 456
 457     const char *srcend = src + srclen;
 458     int len, res;
 459     WCHAR *ret, *dst;
 460
 461     dst = ret = xmalloc( (srclen + 1) * sizeof(WCHAR) );
 462     while (src < srcend)
 463     {
 464         unsigned char ch = *src++;
 465         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
 466         {
 467             *dst++ = ch;
 468             continue;
 469         }
 470         len = utf8_length[ch - 0x80];
 471         if (len && src + len <= srcend)
 472         {
 473             res = ch & utf8_mask[len];
 474             switch (len)
 475             {
 476             case 3:
 477                 if ((ch = *src ^ 0x80) >= 0x40) break;
 478                 res = (res << 6) | ch;
 479                 src++;
 480                 if (res < 0x10) break;
 481             case 2:
 482                 if ((ch = *src ^ 0x80) >= 0x40) break;
 483                 res = (res << 6) | ch;
 484                 if (res >= 0x110000 >> 6) break;
 485                 src++;
 486                 if (res < 0x20) break;
 487                 if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break;
 488             case 1:
 489                 if ((ch = *src ^ 0x80) >= 0x40) break;
 490                 res = (res << 6) | ch;
 491                 src++;
 492                 if (res < 0x80) break;
 493                 if (res <= 0xffff) *dst++ = res;
 494                 else
 495                 {
 496                     res -= 0x10000;
 497                     *dst++ = 0xd800 | (res >> 10);
 498                     *dst++ = 0xdc00 | (res & 0x3ff);
 499                 }
 500                 continue;
 501             }
 502         }
 503         *dst++ = 0xfffd;
 504     }
 505     *dst = 0;
 506     *dstlen = dst - ret;
 507     return ret;
 508 }
 509
 510 static char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen )
 511 {
 512     char *ret, *dst;
 513
 514     dst = ret = xmalloc( srclen * 3 + 1 );
 515     for ( ; srclen; srclen--, src++)
 516     {
 517         unsigned int ch = *src;
 518
 519         if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
 520         {
 521             *dst++ = ch;
 522             continue;
 523         }
 524         if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
 525         {
 526             dst[1] = 0x80 | (ch & 0x3f);
 527             ch >>= 6;
 528             dst[0] = 0xc0 | ch;
 529             dst += 2;
 530             continue;
 531         }
 532         if (ch >= 0xd800 && ch <= 0xdbff && srclen > 1 && src[1] >= 0xdc00 && src[1] <= 0xdfff)
 533         {
 534             /* 0x10000-0x10ffff: 4 bytes */
 535             ch = 0x10000 + ((ch & 0x3ff) << 10) + (src[1] & 0x3ff);
 536             dst[3] = 0x80 | (ch & 0x3f);
 537             ch >>= 6;
 538             dst[2] = 0x80 | (ch & 0x3f);
 539             ch >>= 6;
 540             dst[1] = 0x80 | (ch & 0x3f);
 541             ch >>= 6;
 542             dst[0] = 0xf0 | ch;
 543             dst += 4;
 544             src++;
 545             srclen--;
 546             continue;
 547         }
 548         if (ch >= 0xd800 && ch <= 0xdfff) ch = 0xfffd;  /* invalid surrogate pair */
 549
 550         /* 0x800-0xffff: 3 bytes */
 551         dst[2] = 0x80 | (ch & 0x3f);
 552         ch >>= 6;
 553         dst[1] = 0x80 | (ch & 0x3f);
 554         ch >>= 6;
 555         dst[0] = 0xe0 | ch;
 556         dst += 3;
 557     }
 558     *dst = 0;
 559     *dstlen = dst - ret;
 560     return ret;
 561 }
 562
 563 string_t *convert_string_unicode( const string_t *str, int codepage )
 564 {
 565     string_t *ret = xmalloc(sizeof(*ret));
 566
 567     ret->type = str_unicode;
 568     ret->loc = str->loc;
 569
 570     if (str->type == str_char)
 571     {
 572         if (!codepage) parser_error( "Current language is Unicode only, cannot convert string" );
 573
 574         if (codepage == CP_UTF8)
 575             ret->str.wstr = utf8_to_unicode( str->str.cstr, str->size, &ret->size );
 576         else
 577             ret->str.wstr = codepage_to_unicode( codepage, str->str.cstr, str->size, &ret->size );
 578         if (!ret->str.wstr) parser_error( "Invalid character in string '%.*s' for codepage %u",
 579                                           str->size, str->str.cstr, codepage );
 580     }
 581     else
 582     {
 583         ret->size     = str->size;
 584         ret->str.wstr = xmalloc(sizeof(WCHAR)*(ret->size+1));
 585         memcpy( ret->str.wstr, str->str.wstr, ret->size * sizeof(WCHAR) );
 586         ret->str.wstr[ret->size] = 0;
 587     }
 588     return ret;
 589 }
 590
 591 char *convert_string_utf8( const string_t *str, int codepage )
 592 {
 593     int len;
 594     string_t *wstr = convert_string_unicode( str, codepage );
 595     char *ret = unicode_to_utf8( wstr->str.wstr, wstr->size, &len );
 596     free_string( wstr );
 597     return ret;
 598 }
 599
 600 void free_string(string_t *str)
 601 {
 602     if (str->type == str_unicode) free( str->str.wstr );
 603     else free( str->str.cstr );
 604     free( str );
 605 }
 606
 607 /* check if the string is valid utf8 despite a different codepage being in use */
 608 int check_valid_utf8( const string_t *str, int codepage )
 609 {
 610     int i, count;
 611     WCHAR *wstr;
 612
 613     if (!check_utf8) return 0;
 614     if (!codepage) return 0;
 615     if (codepage == CP_UTF8) return 0;
 616     if (!is_valid_codepage( codepage )) return 0;
 617
 618     for (i = count = 0; i < str->size; i++)
 619     {
 620         if ((unsigned char)str->str.cstr[i] >= 0xf5) goto done;
 621         if ((unsigned char)str->str.cstr[i] >= 0xc2) { count++; continue; }
 622         if ((unsigned char)str->str.cstr[i] >= 0x80) goto done;
 623     }
 624     if (!count) return 0;  /* no 8-bit chars at all */
 625
 626     wstr = utf8_to_unicode( str->str.cstr, str->size, &count );
 627     for (i = 0; i < count; i++) if (wstr[i] == 0xfffd) break;
 628     free( wstr );
 629     return (i == count);
 630
 631 done:
 632     check_utf8 = 0;  /* at least one 8-bit non-utf8 string found, stop checking */
 633     return 0;
 634 }
 635
 636 const char *get_nameid_str(const name_id_t *n)
 637 {
 638     int len;
 639
 640     if (!n) return "<none>";
 641     if (n->type == name_ord) return strmake( "%u", n->name.i_name );
 642     if (n->name.s_name->type == str_char) return n->name.s_name->str.cstr;
 643     return unicode_to_utf8( n->name.s_name->str.wstr, n->name.s_name->size, &len );
 644 }