2 * Wine Message Compiler lexical scanner
4 * Copyright 2000 Bertho A. Stultiens (BS)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
36 * Keywords are case insensitive. All normal input is treated as
37 * being in codepage iso-8859-1 for ascii input files (unicode
38 * page 0) and as equivalent unicode if unicode input is selected.
39 * All normal input, which is not part of a message text, is
40 * enforced to be unicode page 0. Otherwise an error will be
41 * generated. The normal file data should only be ASCII because
42 * that is the basic definition of the grammar.
44 * Byteorder or unicode input is determined automatically by
45 * reading the first 8 bytes and checking them against unicode
46 * page 0 byteorder (hibyte must be 0).
48 * Alternatively, the input is checked against a special byte
49 * sequence to identify the file.
64 * Default added identifiers for classes:
74 * The 'Codepages' keyword is a wmc extension.
77 static const WCHAR ustr_application
[] = { 'A', 'p', 'p', 'l', 'i', 'c', 'a', 't', 'i', 'o', 'n', 0 };
78 static const WCHAR ustr_codepages
[] = { 'C', 'o', 'd', 'e', 'p', 'a', 'g', 'e', 's', 0 };
79 static const WCHAR ustr_english
[] = { 'E', 'n', 'g', 'l', 'i', 's', 'h', 0 };
80 static const WCHAR ustr_error
[] = { 'E', 'r', 'r', 'o', 'r', 0 };
81 static const WCHAR ustr_facility
[] = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 0 };
82 static const WCHAR ustr_facilitynames
[] = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
83 static const WCHAR ustr_informational
[] = { 'I', 'n', 'f', 'o', 'r', 'm', 'a', 't', 'i', 'o', 'n', 'a', 'l', 0 };
84 static const WCHAR ustr_language
[] = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 0};
85 static const WCHAR ustr_languagenames
[] = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 'N', 'a', 'm', 'e', 's', 0};
86 static const WCHAR ustr_messageid
[] = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 0 };
87 static const WCHAR ustr_messageidtypedef
[] = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 'T', 'y', 'p', 'e', 'd', 'e', 'f', 0 };
88 static const WCHAR ustr_dxgi
[] = { 'D', 'x', 'g', 'i', 0 };
89 static const WCHAR ustr_null
[] = { 'N', 'u', 'l', 'l', 0 };
90 static const WCHAR ustr_outputbase
[] = { 'O', 'u', 't', 'p', 'u', 't', 'B', 'a', 's', 'e', 0 };
91 static const WCHAR ustr_severity
[] = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 0 };
92 static const WCHAR ustr_severitynames
[] = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
93 static const WCHAR ustr_success
[] = { 'S', 'u', 'c', 'c', 'e', 's', 's', 0 };
94 static const WCHAR ustr_symbolicname
[] = { 'S', 'y', 'm', 'b', 'o', 'l', 'i', 'c', 'N', 'a', 'm', 'e', 0 };
95 static const WCHAR ustr_system
[] = { 'S', 'y', 's', 't', 'e', 'm', 0 };
96 static const WCHAR ustr_warning
[] = { 'W', 'a', 'r', 'n', 'i', 'n', 'g', 0 };
97 static const WCHAR ustr_msg00001
[] = { 'm', 's', 'g', '0', '0', '0', '0', '1', 0 };
99 * This table is to beat any form of "expression building" to check for
100 * correct filename characters. It is also used for ident checks.
101 * FIXME: use it more consistently.
/*
 * Character class bits found in char_table[]. A table entry may carry
 * several of these bits at once.
 */
#define CH_SHORTNAME	(1 << 0)
#define CH_LONGNAME	(1 << 1)
#define CH_IDENT	(1 << 2)
#define CH_NUMBER	(1 << 3)
/* Bit 4 (0x10, CH_WILDCARD) and bit 5 (0x20, CH_DOT) are currently not defined */
#define CH_PUNCT	(1 << 6)
#define CH_INVALID	(1 << 7)
/*
 * Character class table for the code points 0x00 - 0xff, indexed by the
 * character value. Each entry is a combination of the CH_* bits.
 *
 * The element type is unsigned char because entries such as 0x80 and 0xc0
 * do not fit in a signed char; with plain char the stored value would be
 * implementation-defined.
 */
static const unsigned char char_table[256] = {
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x00 - 0x07 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x08 - 0x0F */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x10 - 0x17 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x18 - 0x1F */
	0x80, 0x03, 0x80, 0x03, 0x03, 0x03, 0x03, 0x03, /* 0x20 - 0x27 " !"#$%&'" */
	0x43, 0x43, 0x10, 0x80, 0x03, 0x03, 0x22, 0x80, /* 0x28 - 0x2F "()*+,-./" */
	0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, /* 0x30 - 0x37 "01234567" */
	0x0b, 0x0b, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x10, /* 0x38 - 0x3F "89:;<=>?" */
	0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x40 - 0x47 "@ABCDEFG" */
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x48 - 0x4F "HIJKLMNO" */
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x50 - 0x57 "PQRSTUVW" */
	0x07, 0x07, 0x07, 0x80, 0x80, 0x80, 0x80, 0x07, /* 0x58 - 0x5F "XYZ[\]^_" */
	0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x60 - 0x67 "`abcdefg" */
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x68 - 0x6F "hijklmno" */
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x70 - 0x77 "pqrstuvw" */
	0x07, 0x07, 0x07, 0x03, 0x80, 0x03, 0x03, 0x80, /* 0x78 - 0x7F "xyz{|}~ " */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x80 - 0x87 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x88 - 0x8F */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x90 - 0x97 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x98 - 0x9F */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xA0 - 0xA7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xA8 - 0xAF */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xB0 - 0xB7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xB8 - 0xBF */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xC0 - 0xC7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xC8 - 0xCF */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xD0 - 0xD7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xD8 - 0xDF */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xE0 - 0xE7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xE8 - 0xEF */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xF0 - 0xF7 */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x80, /* 0xF8 - 0xFF */
};
/*
 * Tell whether ch lies in the range 0x00 - 0xff, i.e. whether no bit
 * above the low byte is set. Returns 1 if so, 0 otherwise (negative
 * values such as EOF yield 0).
 */
static int isisochar(int ch)
{
	return (ch & ~0xff) == 0;
}
155 void set_codepage(int cp
)
163 #define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
165 static int nungetstack
= 0;
166 static int allocungetstack
= 0;
167 static char *ungetstack
= NULL
;
168 static int ninputbuffer
= 0;
169 static WCHAR inputbuffer
[INPUTBUFFER_SIZE
];
/*
 * Fill the input buffer with *one* line of input.
 * The line is '\n' terminated so that scanning
 * messages with translation works as expected
 * (otherwise we cannot pre-translate because the
 * language is first known one line before the
 * message text).
 */
179 static int fill_inputbuffer(void)
181 static enum input_mode
{ INPUT_UNKNOWN
, INPUT_ASCII
, INPUT_UTF8
, INPUT_UNICODE
} mode
;
183 static unsigned char utf8_bom
[3] = { 0xef, 0xbb, 0xbf };
185 int i
, pos
= 0, len
= 0;
186 char buffer
[INPUTBUFFER_SIZE
];
188 if (mode
== INPUT_UNKNOWN
)
190 len
= fread( buffer
, 1, 8, yyin
);
191 wbuf
= (WCHAR
*)buffer
;
192 if (len
>= 3 && !memcmp( buffer
, utf8_bom
, 3 ))
195 memmove( buffer
, buffer
+ 3, len
- 3 );
200 if (wbuf
[0] == 0xfeff || wbuf
[0] == 0xfffe)
202 mode
= INPUT_UNICODE
;
204 swapped
= (wbuf
[0] == 0xfffe);
206 else if (!((wbuf
[0] | wbuf
[1] | wbuf
[2] | wbuf
[3]) & 0xff00))
208 mode
= INPUT_UNICODE
;
210 else if (!((wbuf
[0] | wbuf
[1] | wbuf
[2] | wbuf
[3]) & 0x00ff))
212 mode
= INPUT_UNICODE
;
217 if (mode
== INPUT_UNICODE
)
220 memcpy( inputbuffer
, wbuf
+ pos
, len
* sizeof(WCHAR
) );
222 else if (mode
== INPUT_UNKNOWN
) mode
= unicodein
? INPUT_UTF8
: INPUT_ASCII
;
228 if (!fgets( buffer
+ len
, sizeof(buffer
) - len
, yyin
)) break;
229 wbuf
= codepage_to_unicode( codepage
, buffer
, strlen(buffer
), &ninputbuffer
);
230 memcpy( inputbuffer
, wbuf
, ninputbuffer
* sizeof(WCHAR
) );
234 if (!fgets( buffer
+ len
, sizeof(buffer
) - len
, yyin
)) break;
235 wbuf
= utf8_to_unicode( buffer
, strlen(buffer
), &ninputbuffer
);
236 memcpy( inputbuffer
, wbuf
, ninputbuffer
* sizeof(WCHAR
) );
240 len
+= fread( inputbuffer
+ len
, sizeof(WCHAR
), INPUTBUFFER_SIZE
- len
, yyin
);
242 if (swapped
) for (i
= 0; i
< len
; i
++) inputbuffer
[i
] = BYTESWAP_WORD( inputbuffer
[i
] );
248 if (ferror(yyin
)) xyyerror( "Fatal: reading input failed\n" );
252 static int get_unichar(void)
254 static WCHAR
*b
= NULL
;
258 return ungetstack
[--nungetstack
];
262 if(!fill_inputbuffer())
271 static void unget_unichar(int ch
)
278 if(nungetstack
== allocungetstack
)
280 allocungetstack
+= 32;
281 ungetstack
= xrealloc(ungetstack
, allocungetstack
* sizeof(*ungetstack
));
284 ungetstack
[nungetstack
++] = (WCHAR
)ch
;
289 * Normal character stack.
290 * Used for number scanning.
/* Storage of the character stack; push_char() grows it on demand */
static char *charstack = NULL;
static int alloccharstack = 0;	/* Number of entries allocated */
static int ncharstack = 0;	/* Number of entries in use */
296 static void empty_char_stack(void)
301 static void push_char(int ch
)
303 if(ncharstack
== alloccharstack
)
305 alloccharstack
+= 32;
306 charstack
= xrealloc(charstack
, alloccharstack
* sizeof(*charstack
));
308 charstack
[ncharstack
++] = (char)ch
;
311 static int tos_char_stack(void)
316 return (int)(charstack
[ncharstack
-1] & 0xff);
319 static char *get_char_stack(void)
325 * Unicode character stack.
326 * Used for general scanner.
328 static int nunicharstack
= 0;
329 static int allocunicharstack
= 0;
330 static WCHAR
*unicharstack
= NULL
;
332 static void empty_unichar_stack(void)
337 static void push_unichar(int ch
)
339 if(nunicharstack
== allocunicharstack
)
341 allocunicharstack
+= 128;
342 unicharstack
= xrealloc(unicharstack
, allocunicharstack
* sizeof(*unicharstack
));
344 unicharstack
[nunicharstack
++] = (WCHAR
)ch
;
348 static int tos_unichar_stack(void)
353 return (int)(unicharstack
[nunicharstack
-1] & 0xffff);
357 static WCHAR
*get_unichar_stack(void)
365 * state | ch | next state
366 * ------+-----------------+--------------------------
369 * 0 | . | error (should never occur)
372 * 1 | [89a-wyzA-WYZ_] | error invalid digit
374 * 2 | [0-9a-fA-F] | 2
375 * 2 | [g-zG-Z_] | error invalid hex digit
376 * 2 | . | return (hex-number) if TOS != [xX] else error
378 * 3 | [89a-zA-Z_] | error invalid octal digit
379 * 3 | . | return (octal-number)
381 * 4 | [a-zA-Z_] | error invalid decimal digit
382 * 4 | . | return (decimal-number)
384 * All non-identifier characters [^a-zA-Z_0-9] terminate the scan
385 * and return the value. This is not entirely correct, but close
386 * enough (should check punctuators as trailing context, but the
387 * char_table is not adapted to that and it is questionable whether
388 * it is worth the trouble).
389 * All non-iso-8859-1 characters are an error.
391 static int scan_number(int ch
)
400 xyyerror("Invalid digit\n");
414 internal_error(__FILE__
, __LINE__
, "Non-digit in first number-scanner state\n");
417 if(ch
== 'x' || ch
== 'X')
422 else if(ch
>= '0' && ch
<= '7')
427 else if(isalpha(ch
) || ch
== '_')
428 xyyerror("Invalid number digit\n");
439 else if(isalpha(ch
) || ch
== '_' || !isxdigit(tos_char_stack()))
440 xyyerror("Invalid hex digit\n");
448 if(ch
>= '0' && ch
<= '7')
450 else if(isalnum(ch
) || ch
== '_')
451 xyyerror("Invalid octal digit\n");
461 else if(isalnum(ch
) || ch
== '_')
462 xyyerror("Invalid decimal digit\n");
470 internal_error(__FILE__
, __LINE__
, "Invalid state in number-scanner\n");
477 mcy_lval
.num
= strtoul(get_char_stack(), NULL
, base
);
481 static void newline(void)
487 static int unisort(const void *p1
, const void *p2
)
489 return unistricmp(((const token_t
*)p1
)->name
, ((const token_t
*)p2
)->name
);
492 static token_t
*tokentable
= NULL
;
493 static int ntokentable
= 0;
495 token_t
*lookup_token(const WCHAR
*s
)
500 return (token_t
*)bsearch(&tok
, tokentable
, ntokentable
, sizeof(*tokentable
), unisort
);
503 void add_token(tok_e type
, const WCHAR
*name
, int tok
, int cp
, const WCHAR
*alias
, int fix
)
506 tokentable
= xrealloc(tokentable
, ntokentable
* sizeof(*tokentable
));
507 tokentable
[ntokentable
-1].type
= type
;
508 tokentable
[ntokentable
-1].name
= name
;
509 tokentable
[ntokentable
-1].token
= tok
;
510 tokentable
[ntokentable
-1].codepage
= cp
;
511 tokentable
[ntokentable
-1].alias
= alias
;
512 tokentable
[ntokentable
-1].fixed
= fix
;
513 qsort(tokentable
, ntokentable
, sizeof(*tokentable
), unisort
);
516 void get_tokentable(token_t
**tab
, int *len
)
530 static const WCHAR ustr_dot1
[] = { '.', '\n', 0 };
531 static const WCHAR ustr_dot2
[] = { '.', '\r', '\n', 0 };
532 static int isinit
= 0;
538 set_codepage(WMC_DEFAULT_CODEPAGE
);
539 add_token(tok_keyword
, ustr_codepages
, tCODEPAGE
, 0, NULL
, 0);
540 add_token(tok_keyword
, ustr_facility
, tFACILITY
, 0, NULL
, 1);
541 add_token(tok_keyword
, ustr_facilitynames
, tFACNAMES
, 0, NULL
, 1);
542 add_token(tok_keyword
, ustr_language
, tLANGUAGE
, 0, NULL
, 1);
543 add_token(tok_keyword
, ustr_languagenames
, tLANNAMES
, 0, NULL
, 1);
544 add_token(tok_keyword
, ustr_messageid
, tMSGID
, 0, NULL
, 1);
545 add_token(tok_keyword
, ustr_messageidtypedef
, tTYPEDEF
, 0, NULL
, 1);
546 add_token(tok_keyword
, ustr_outputbase
, tBASE
, 0, NULL
, 1);
547 add_token(tok_keyword
, ustr_severity
, tSEVERITY
, 0, NULL
, 1);
548 add_token(tok_keyword
, ustr_severitynames
, tSEVNAMES
, 0, NULL
, 1);
549 add_token(tok_keyword
, ustr_symbolicname
, tSYMNAME
, 0, NULL
, 1);
550 add_token(tok_severity
, ustr_error
, 0x03, 0, NULL
, 0);
551 add_token(tok_severity
, ustr_warning
, 0x02, 0, NULL
, 0);
552 add_token(tok_severity
, ustr_informational
, 0x01, 0, NULL
, 0);
553 add_token(tok_severity
, ustr_success
, 0x00, 0, NULL
, 0);
554 add_token(tok_facility
, ustr_application
, 0xFFF, 0, NULL
, 0);
555 add_token(tok_facility
, ustr_system
, 0x0FF, 0, NULL
, 0);
556 add_token(tok_facility
, ustr_dxgi
, 0x87a, 0, NULL
, 0);
557 add_token(tok_facility
, ustr_null
, 0x000, 0, NULL
, 0);
558 add_token(tok_language
, ustr_english
, 0x409, 437, ustr_msg00001
, 0);
561 empty_unichar_stack();
567 while((ch
= get_unichar()) != '\n')
570 xyyerror("Unexpected EOF\n");
576 if(!unistrcmp(ustr_dot1
, get_unichar_stack()) || !unistrcmp(ustr_dot2
, get_unichar_stack()))
579 /* Reset the codepage to our default after each message */
580 set_codepage(WMC_DEFAULT_CODEPAGE
);
583 mcy_lval
.str
= xunistrdup(get_unichar_stack());
608 while(n
< 8 && isisochar(ch
))
610 int t
= char_table
[ch
];
611 if((t
& CH_PUNCT
) || !(t
& CH_SHORTNAME
))
621 mcy_lval
.str
= xunistrdup(get_unichar_stack());
625 if(char_table
[ch
] & CH_IDENT
)
628 while(isisochar(ch
) && (char_table
[ch
] & (CH_IDENT
|CH_NUMBER
)))
635 if(!(tok
= lookup_token(get_unichar_stack())))
637 mcy_lval
.str
= xunistrdup(get_unichar_stack());
646 codepage
= tok
->codepage
;
654 internal_error(__FILE__
, __LINE__
, "Invalid token type encountered\n");
658 if(isspace(ch
)) /* Ignore space */
662 return scan_number(ch
);
674 while(ch
!= '\n' && ch
!= EOF
)
680 push_unichar(ch
); /* Include the newline */
682 mcy_lval
.str
= xunistrdup(get_unichar_stack());
685 xyyerror("Invalid character '%c' (0x%04x)\n", isisochar(ch
) && isprint(ch
) ? ch
: '.', ch
);