4 * Copyright 1998 Bertho A. Stultiens
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
36 /* #define WANT_NEAR_INDICATION */
38 #ifdef WANT_NEAR_INDICATION
39 void make_print(char *str
)
50 static void generic_msg(const char *s
, const char *t
, const char *n
, va_list ap
)
52 fprintf(stderr
, "%s:%d:%d: %s: ", input_name
? input_name
: "stdin", line_number
, char_number
, t
);
53 vfprintf(stderr
, s
, ap
);
54 #ifdef WANT_NEAR_INDICATION
61 fprintf(stderr
, " near '%s'", cpy
);
69 int parser_error(const char *s
, ...)
73 generic_msg(s
, "Error", parser_text
, ap
);
74 fputc( '\n', stderr
);
80 int parser_warning(const char *s
, ...)
84 generic_msg(s
, "Warning", parser_text
, ap
);
89 void fatal_perror( const char *msg
, ... )
92 va_start( valist
, msg
);
93 fprintf(stderr
, "Error: ");
94 vfprintf( stderr
, msg
, valist
);
100 void error(const char *s
, ...)
104 fprintf(stderr
, "Error: ");
105 vfprintf(stderr
, s
, ap
);
110 void warning(const char *s
, ...)
114 fprintf(stderr
, "Warning: ");
115 vfprintf(stderr
, s
, ap
);
119 void chat(const char *s
, ...)
121 if(debuglevel
& DEBUGLEVEL_CHAT
)
125 fprintf(stderr
, "FYI: ");
126 vfprintf(stderr
, s
, ap
);
131 int compare_striA( const char *str1
, const char *str2
)
135 /* only the A-Z range is case-insensitive */
136 char ch1
= (*str1
>= 'a' && *str1
<= 'z') ? *str1
+ 'A' - 'a' : *str1
;
137 char ch2
= (*str2
>= 'a' && *str2
<= 'z') ? *str2
+ 'A' - 'a' : *str2
;
138 if (!ch1
|| ch1
!= ch2
) return ch1
- ch2
;
144 int compare_striW( const WCHAR
*str1
, const WCHAR
*str2
)
148 /* only the A-Z range is case-insensitive */
149 WCHAR ch1
= (*str1
>= 'a' && *str1
<= 'z') ? *str1
+ 'A' - 'a' : *str1
;
150 WCHAR ch2
= (*str2
>= 'a' && *str2
<= 'z') ? *str2
+ 'A' - 'a' : *str2
;
151 if (!ch1
|| ch1
!= ch2
) return ch1
- ch2
;
157 int compare_striAW( const char *str1
, const WCHAR
*str2
)
161 /* only the A-Z range is case-insensitive */
162 WCHAR ch1
= (*str1
>= 'a' && *str1
<= 'z') ? *str1
+ 'A' - 'a' : (unsigned char)*str1
;
163 WCHAR ch2
= (*str2
>= 'a' && *str2
<= 'z') ? *str2
+ 'A' - 'a' : *str2
;
164 if (!ch1
|| ch1
!= ch2
) return ch1
- ch2
;
171 *****************************************************************************
172 * Function : compare_name_id
173 * Syntax : int compare_name_id(const name_id_t *n1, const name_id_t *n2)
178 *****************************************************************************
180 int compare_name_id(const name_id_t
*n1
, const name_id_t
*n2
)
182 if (n1
->type
!= n2
->type
) return n1
->type
== name_ord
? 1 : -1;
183 if (n1
->type
== name_ord
) return n1
->name
.i_name
- n2
->name
.i_name
;
185 if (n1
->name
.s_name
->type
== str_char
)
187 if (n2
->name
.s_name
->type
== str_char
)
188 return compare_striA(n1
->name
.s_name
->str
.cstr
, n2
->name
.s_name
->str
.cstr
);
189 return compare_striAW(n1
->name
.s_name
->str
.cstr
, n2
->name
.s_name
->str
.wstr
);
193 if (n2
->name
.s_name
->type
== str_char
)
194 return -compare_striAW(n2
->name
.s_name
->str
.cstr
, n1
->name
.s_name
->str
.wstr
);
195 return compare_striW(n1
->name
.s_name
->str
.wstr
, n2
->name
.s_name
->str
.wstr
);
201 int is_valid_codepage(int id
)
203 return IsValidCodePage( id
);
206 static WCHAR
*codepage_to_unicode( int codepage
, const char *src
, int srclen
, int *dstlen
)
208 WCHAR
*dst
= xmalloc( (srclen
+ 1) * sizeof(WCHAR
) );
209 DWORD ret
= MultiByteToWideChar( codepage
, MB_ERR_INVALID_CHARS
, src
, srclen
, dst
, srclen
);
210 if (!ret
) return NULL
;
216 int get_language_codepage( language_t lang
)
220 if (!lang
) return 1252;
221 if (!GetLocaleInfoW( lang
, LOCALE_IDEFAULTANSICODEPAGE
| LOCALE_RETURN_NUMBER
,
222 (WCHAR
*)&codepage
, sizeof(codepage
)/sizeof(WCHAR
) )) return -1;
226 language_t
get_language_from_name( const char *name
)
228 WCHAR nameW
[LOCALE_NAME_MAX_LENGTH
];
230 MultiByteToWideChar( 1252, 0, name
, -1, nameW
, ARRAY_SIZE(nameW
) );
231 return LocaleNameToLCID( nameW
, LOCALE_ALLOW_NEUTRAL_NAMES
);
238 unsigned short codepage
;
239 unsigned short unidef
;
240 unsigned short trans_unidef
;
241 unsigned short *cp2uni
;
242 unsigned short *dbcs_offsets
;
245 static struct nls_info nlsinfo
[128];
247 static void init_nls_info( struct nls_info
*info
, unsigned short *ptr
)
249 unsigned short hdr_size
= ptr
[0];
251 info
->codepage
= ptr
[1];
252 info
->unidef
= ptr
[4];
253 info
->trans_unidef
= ptr
[6];
255 info
->cp2uni
= ++ptr
;
257 if (*ptr
++) ptr
+= 256; /* glyph table */
258 info
->dbcs_offsets
= *ptr
? ptr
+ 1 : NULL
;
261 static const struct nls_info
*get_nls_info( unsigned int codepage
)
263 unsigned short *data
;
268 for (i
= 0; i
< ARRAY_SIZE(nlsinfo
) && nlsinfo
[i
].codepage
; i
++)
269 if (nlsinfo
[i
].codepage
== codepage
) return &nlsinfo
[i
];
271 assert( i
< ARRAY_SIZE(nlsinfo
) );
273 for (i
= 0; nlsdirs
[i
]; i
++)
275 path
= strmake( "%s/c_%03u.nls", nlsdirs
[i
], codepage
);
276 if ((data
= read_file( path
, &size
)))
279 init_nls_info( &nlsinfo
[i
], data
);
287 int is_valid_codepage(int cp
)
289 return cp
== CP_UTF8
|| get_nls_info( cp
);
292 static WCHAR
*codepage_to_unicode( int codepage
, const char *src
, int srclen
, int *dstlen
)
294 const struct nls_info
*info
= get_nls_info( codepage
);
296 WCHAR dbch
, *dst
= xmalloc( (srclen
+ 1) * sizeof(WCHAR
) );
298 if (!info
) error( "codepage %u not supported\n", codepage
);
300 if (info
->dbcs_offsets
)
302 for (i
= 0; srclen
; i
++, srclen
--, src
++)
304 unsigned short off
= info
->dbcs_offsets
[(unsigned char)*src
];
307 if (srclen
== 1) return NULL
;
308 dbch
= (src
[0] << 8) | (unsigned char)src
[1];
311 dst
[i
] = info
->dbcs_offsets
[off
+ (unsigned char)*src
];
312 if (dst
[i
] == info
->unidef
&& dbch
!= info
->trans_unidef
) return NULL
;
316 dst
[i
] = info
->cp2uni
[(unsigned char)*src
];
317 if (dst
[i
] == info
->unidef
&& *src
!= info
->trans_unidef
) return NULL
;
323 for (i
= 0; i
< srclen
; i
++)
325 dst
[i
] = info
->cp2uni
[(unsigned char)src
[i
]];
326 if (dst
[i
] == info
->unidef
&& src
[i
] != info
->trans_unidef
) return NULL
;
334 static const NLS_LOCALE_LCID_INDEX
*lcids_index
;
335 static const NLS_LOCALE_HEADER
*locale_table
;
336 static const NLS_LOCALE_LCNAME_INDEX
*lcnames_index
;
337 static const WCHAR
*locale_strings
;
339 static void load_locale_nls(void)
344 unsigned int unknown1
;
345 unsigned int unknown2
;
346 unsigned int unknown3
;
347 unsigned int locales
;
348 unsigned int charmaps
;
350 unsigned int scripts
;
356 for (i
= 0; nlsdirs
[i
]; i
++)
358 path
= strmake( "%s/locale.nls", nlsdirs
[i
] );
359 header
= read_file( path
, &size
);
361 if (!header
) continue;
362 locale_table
= (const NLS_LOCALE_HEADER
*)((char *)header
+ header
->locales
);
363 lcids_index
= (const NLS_LOCALE_LCID_INDEX
*)((char *)locale_table
+ locale_table
->lcids_offset
);
364 lcnames_index
= (const NLS_LOCALE_LCNAME_INDEX
*)((char *)locale_table
+ locale_table
->lcnames_offset
);
365 locale_strings
= (const WCHAR
*)((char *)locale_table
+ locale_table
->strings_offset
);
368 error( "unable to load locale.nls\n" );
371 static int compare_locale_names( const char *n1
, const WCHAR
*n2
)
375 WCHAR ch1
= (unsigned char)*n1
++;
377 if (ch1
>= 'a' && ch1
<= 'z') ch1
-= 'a' - 'A';
378 if (ch2
>= 'a' && ch2
<= 'z') ch2
-= 'a' - 'A';
379 if (!ch1
|| ch1
!= ch2
) return ch1
- ch2
;
383 static const NLS_LOCALE_LCNAME_INDEX
*find_lcname_entry( const char *name
)
385 int min
= 0, max
= locale_table
->nb_lcnames
- 1;
387 if (!name
) return NULL
;
390 int res
, pos
= (min
+ max
) / 2;
391 const WCHAR
*str
= locale_strings
+ lcnames_index
[pos
].name
;
392 res
= compare_locale_names( name
, str
+ 1 );
393 if (res
< 0) max
= pos
- 1;
394 else if (res
> 0) min
= pos
+ 1;
395 else return &lcnames_index
[pos
];
400 static const NLS_LOCALE_LCID_INDEX
*find_lcid_entry( LCID lcid
)
402 int min
= 0, max
= locale_table
->nb_lcids
- 1;
406 int pos
= (min
+ max
) / 2;
407 if (lcid
< lcids_index
[pos
].id
) max
= pos
- 1;
408 else if (lcid
> lcids_index
[pos
].id
) min
= pos
+ 1;
409 else return &lcids_index
[pos
];
414 static const NLS_LOCALE_DATA
*get_locale_data( UINT idx
)
416 ULONG offset
= locale_table
->locales_offset
+ idx
* locale_table
->locale_size
;
417 return (const NLS_LOCALE_DATA
*)((const char *)locale_table
+ offset
);
420 int get_language_codepage( language_t lang
)
422 const NLS_LOCALE_LCID_INDEX
*entry
;
424 if (!lang
) return 1252;
425 if (lang
== MAKELANGID( LANG_ENGLISH
, SUBLANG_DEFAULT
)) return 1252;
426 if (!locale_table
) load_locale_nls();
427 if (!(entry
= find_lcid_entry( lang
))) return -1;
428 return get_locale_data( entry
->idx
)->idefaultansicodepage
;
431 language_t
get_language_from_name( const char *name
)
433 const NLS_LOCALE_LCNAME_INDEX
*entry
;
435 if (!locale_table
) load_locale_nls();
436 if (!(entry
= find_lcname_entry( name
))) return 0;
437 return get_locale_data( entry
->idx
)->unique_lcid
;
442 static WCHAR
*utf8_to_unicode( const char *src
, int srclen
, int *dstlen
)
444 static const char utf8_length
[128] =
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
447 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
448 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
449 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
450 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
451 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
452 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
453 3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0 /* 0xf0-0xff */
455 static const unsigned char utf8_mask
[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
457 const char *srcend
= src
+ srclen
;
461 dst
= ret
= xmalloc( (srclen
+ 1) * sizeof(WCHAR
) );
464 unsigned char ch
= *src
++;
465 if (ch
< 0x80) /* special fast case for 7-bit ASCII */
470 len
= utf8_length
[ch
- 0x80];
471 if (len
&& src
+ len
<= srcend
)
473 res
= ch
& utf8_mask
[len
];
477 if ((ch
= *src
^ 0x80) >= 0x40) break;
478 res
= (res
<< 6) | ch
;
480 if (res
< 0x10) break;
482 if ((ch
= *src
^ 0x80) >= 0x40) break;
483 res
= (res
<< 6) | ch
;
484 if (res
>= 0x110000 >> 6) break;
486 if (res
< 0x20) break;
487 if (res
>= 0xd800 >> 6 && res
<= 0xdfff >> 6) break;
489 if ((ch
= *src
^ 0x80) >= 0x40) break;
490 res
= (res
<< 6) | ch
;
492 if (res
< 0x80) break;
493 if (res
<= 0xffff) *dst
++ = res
;
497 *dst
++ = 0xd800 | (res
>> 10);
498 *dst
++ = 0xdc00 | (res
& 0x3ff);
510 static char *unicode_to_utf8( const WCHAR
*src
, int srclen
, int *dstlen
)
514 dst
= ret
= xmalloc( srclen
* 3 + 1 );
515 for ( ; srclen
; srclen
--, src
++)
517 unsigned int ch
= *src
;
519 if (ch
< 0x80) /* 0x00-0x7f: 1 byte */
524 if (ch
< 0x800) /* 0x80-0x7ff: 2 bytes */
526 dst
[1] = 0x80 | (ch
& 0x3f);
532 if (ch
>= 0xd800 && ch
<= 0xdbff && srclen
> 1 && src
[1] >= 0xdc00 && src
[1] <= 0xdfff)
534 /* 0x10000-0x10ffff: 4 bytes */
535 ch
= 0x10000 + ((ch
& 0x3ff) << 10) + (src
[1] & 0x3ff);
536 dst
[3] = 0x80 | (ch
& 0x3f);
538 dst
[2] = 0x80 | (ch
& 0x3f);
540 dst
[1] = 0x80 | (ch
& 0x3f);
548 if (ch
>= 0xd800 && ch
<= 0xdfff) ch
= 0xfffd; /* invalid surrogate pair */
550 /* 0x800-0xffff: 3 bytes */
551 dst
[2] = 0x80 | (ch
& 0x3f);
553 dst
[1] = 0x80 | (ch
& 0x3f);
563 string_t
*convert_string_unicode( const string_t
*str
, int codepage
)
565 string_t
*ret
= xmalloc(sizeof(*ret
));
567 ret
->type
= str_unicode
;
570 if (str
->type
== str_char
)
572 if (!codepage
) parser_error( "Current language is Unicode only, cannot convert string" );
574 if (codepage
== CP_UTF8
)
575 ret
->str
.wstr
= utf8_to_unicode( str
->str
.cstr
, str
->size
, &ret
->size
);
577 ret
->str
.wstr
= codepage_to_unicode( codepage
, str
->str
.cstr
, str
->size
, &ret
->size
);
578 if (!ret
->str
.wstr
) parser_error( "Invalid character in string '%.*s' for codepage %u",
579 str
->size
, str
->str
.cstr
, codepage
);
583 ret
->size
= str
->size
;
584 ret
->str
.wstr
= xmalloc(sizeof(WCHAR
)*(ret
->size
+1));
585 memcpy( ret
->str
.wstr
, str
->str
.wstr
, ret
->size
* sizeof(WCHAR
) );
586 ret
->str
.wstr
[ret
->size
] = 0;
591 char *convert_string_utf8( const string_t
*str
, int codepage
)
594 string_t
*wstr
= convert_string_unicode( str
, codepage
);
595 char *ret
= unicode_to_utf8( wstr
->str
.wstr
, wstr
->size
, &len
);
600 void free_string(string_t
*str
)
602 if (str
->type
== str_unicode
) free( str
->str
.wstr
);
603 else free( str
->str
.cstr
);
607 /* check if the string is valid utf8 despite a different codepage being in use */
608 int check_valid_utf8( const string_t
*str
, int codepage
)
613 if (!check_utf8
) return 0;
614 if (!codepage
) return 0;
615 if (codepage
== CP_UTF8
) return 0;
616 if (!is_valid_codepage( codepage
)) return 0;
618 for (i
= count
= 0; i
< str
->size
; i
++)
620 if ((unsigned char)str
->str
.cstr
[i
] >= 0xf5) goto done
;
621 if ((unsigned char)str
->str
.cstr
[i
] >= 0xc2) { count
++; continue; }
622 if ((unsigned char)str
->str
.cstr
[i
] >= 0x80) goto done
;
624 if (!count
) return 0; /* no 8-bit chars at all */
626 wstr
= utf8_to_unicode( str
->str
.cstr
, str
->size
, &count
);
627 for (i
= 0; i
< count
; i
++) if (wstr
[i
] == 0xfffd) break;
632 check_utf8
= 0; /* at least one 8-bit non-utf8 string found, stop checking */
636 const char *get_nameid_str(const name_id_t
*n
)
640 if (!n
) return "<none>";
641 if (n
->type
== name_ord
) return strmake( "%u", n
->name
.i_name
);
642 if (n
->name
.s_name
->type
== str_char
) return n
->name
.s_name
->str
.cstr
;
643 return unicode_to_utf8( n
->name
.s_name
->str
.wstr
, n
->name
.s_name
->size
, &len
);