2 * Wine Message Compiler lexical scanner
4 * Copyright 2000 Bertho A. Stultiens (BS)
21 * Keywords are case insensitive. All normal input is treated as
22 * being in codepage iso-8859-1 for ascii input files (unicode
23 * page 0) and as equivalent unicode if unicode input is selected.
24 * All normal input, which is not part of a message text, is
25 * enforced to be unicode page 0. Otherwise an error will be
26 * generated. The normal file data should only be ASCII because
27 * that is the basic definition of the grammar.
29 * Byteorder of unicode input is determined automatically by
30 * reading the first 8 bytes and checking them against unicode
31 * page 0 byteorder (hibyte must be 0).
33 * Alternatively, the input is checked against a special byte
34 * sequence to identify the file.
49 * Default added identifiers for classes:
59 * The 'Codepages' keyword is a wmc extension.
/*
 * NUL-terminated WCHAR spellings of the scanner's built-in identifiers.
 * These are the names passed to add_token() when the default keywords,
 * severities, facilities and the default language are registered;
 * ustr_msg00001 is the alias given to the English language entry.
 */
62 static WCHAR ustr_application
[] = { 'A', 'p', 'p', 'l', 'i', 'c', 'a', 't', 'i', 'o', 'n', 0 };
63 static WCHAR ustr_codepages
[] = { 'C', 'o', 'd', 'e', 'p', 'a', 'g', 'e', 's', 0 };
64 static WCHAR ustr_english
[] = { 'E', 'n', 'g', 'l', 'i', 's', 'h', 0 };
65 static WCHAR ustr_error
[] = { 'E', 'r', 'r', 'o', 'r', 0 };
66 static WCHAR ustr_facility
[] = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 0 };
67 static WCHAR ustr_facilitynames
[] = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
68 static WCHAR ustr_informational
[] = { 'I', 'n', 'f', 'o', 'r', 'm', 'a', 't', 'i', 'o', 'n', 'a', 'l', 0 };
69 static WCHAR ustr_language
[] = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 0};
70 static WCHAR ustr_languagenames
[] = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 'N', 'a', 'm', 'e', 's', 0};
71 static WCHAR ustr_messageid
[] = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 0 };
72 static WCHAR ustr_messageidtypedef
[] = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 'T', 'y', 'p', 'e', 'd', 'e', 'f', 0 };
73 static WCHAR ustr_outputbase
[] = { 'O', 'u', 't', 'p', 'u', 't', 'B', 'a', 's', 'e', 0 };
74 static WCHAR ustr_severity
[] = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 0 };
75 static WCHAR ustr_severitynames
[] = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
76 static WCHAR ustr_success
[] = { 'S', 'u', 'c', 'c', 'e', 's', 's', 0 };
77 static WCHAR ustr_symbolicname
[] = { 'S', 'y', 'm', 'b', 'o', 'l', 'i', 'c', 'N', 'a', 'm', 'e', 0 };
78 static WCHAR ustr_system
[] = { 'S', 'y', 's', 't', 'e', 'm', 0 };
79 static WCHAR ustr_warning
[] = { 'W', 'a', 'r', 'n', 'i', 'n', 'g', 0 };
80 static WCHAR ustr_msg00001
[] = { 'm', 's', 'g', '0', '0', '0', '0', '1', 0 };
82 * This table is to beat any form of "expression building" to check for
83 * correct filename characters. It is also used for ident checks.
84 * FIXME: use it more consistently.
/*
 * Character-class bit flags. The scanner reads a character's flags with
 * char_table[ch] and tests them with these masks.
 * NOTE(review): the scanner also tests CH_PUNCT and CH_IDENT; their
 * definitions are not visible in this excerpt. Judging by the table
 * rows they are presumably 0x40 and 0x04 -- confirm.
 */
87 #define CH_SHORTNAME 0x01
88 #define CH_LONGNAME 0x02
90 #define CH_NUMBER 0x08
91 /*#define CH_WILDCARD 0x10*/
92 /*#define CH_DOT 0x20*/
94 #define CH_INVALID 0x80
/*
 * One flag byte per 8-bit character value, eight entries per row; the
 * trailing comment on each row gives the covered range.
 * NOTE(review): the element type is plain char, so the 0x80 and 0xc0
 * entries do not fit where char is signed; masking with the CH_* bits
 * still works on two's complement targets, but an unsigned element type
 * would express the intent more directly.
 */
96 static const char char_table
[256] = {
97 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x00 - 0x07 */
98 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x08 - 0x0F */
99 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x10 - 0x17 */
100 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 0x18 - 0x1F */
101 0x80, 0x03, 0x80, 0x03, 0x03, 0x03, 0x03, 0x03, /* 0x20 - 0x27 " !"#$%&'" */
102 0x43, 0x43, 0x10, 0x80, 0x03, 0x03, 0x22, 0x80, /* 0x28 - 0x2F "()*+,-./" */
103 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, /* 0x30 - 0x37 "01234567" */
104 0x0b, 0x0b, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x10, /* 0x38 - 0x3F "89:;<=>?" */
105 0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x40 - 0x47 "@ABCDEFG" */
106 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x48 - 0x4F "HIJKLMNO" */
107 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x50 - 0x57 "PQRSTUVW" */
108 0x07, 0x07, 0x07, 0x80, 0x80, 0x80, 0x80, 0x07, /* 0x58 - 0x5F "XYZ[\]^_" */
109 0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x60 - 0x67 "`abcdefg" */
110 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x68 - 0x6F "hijklmno" */
111 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, /* 0x70 - 0x77 "pqrstuvw" */
112 0x07, 0x07, 0x07, 0x03, 0x80, 0x03, 0x03, 0x80, /* 0x78 - 0x7F "xyz{|}~ " */
113 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x80 - 0x87 */
114 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x88 - 0x8F */
115 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x90 - 0x97 */
116 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0x98 - 0x9F */
117 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xA0 - 0xA7 */
118 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xA8 - 0xAF */
119 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xB0 - 0xB7 */
120 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xB8 - 0xBF */
121 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xC0 - 0xC7 */
122 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xC8 - 0xCF */
123 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xD0 - 0xD7 */
124 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xD8 - 0xDF */
125 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xE0 - 0xE7 */
126 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xE8 - 0xEF */
127 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 0xF0 - 0xF7 */
128 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x80, /* 0xF8 - 0xFF */
/*
 * Return non-zero when ch has no bits set outside the low 8 bits, i.e.
 * when it lies in the range 0x00-0xff. Any other value, including a
 * negative one, yields 0.
 */
131 static int isisochar(int ch
)
133 return !(ch
& (~0xff));
/*
 * Codepage definition handed to cp_mbstowcs() when a line of
 * non-unicode input is converted.
 */
137 static const union cptable
*codepage_def
;
/*
 * Look up the definition of the current codepage and store it in
 * codepage_def. xyyerror() reports "Codepage %d not found" for a
 * codepage that cannot be resolved.
 * NOTE(review): the lines visible here read the file-level `codepage`
 * rather than the `cp` argument. The statement copying cp into codepage
 * and the test guarding the error call are presumably on lines not
 * shown in this excerpt -- confirm.
 */
139 void set_codepage(int cp
)
142 codepage_def
= find_codepage(codepage
);
144 xyyerror("Codepage %d not found; cannot process", codepage
);
/*
 * Scanner input state.
 *
 * nungetstack / allocungetstack / ungetstack form the push-back stack
 * used by unget_unichar() and get_unichar(). The stack stores WCHAR
 * elements: unget_unichar() writes (WCHAR)ch into it and get_unichar()
 * returns the entries as scanner characters. A plain char element would
 * truncate any pushed-back character above 0xff and could hand back a
 * negative value for 0x80-0xff where char is signed. The growth code
 * sizes the array with sizeof(*ungetstack), so it follows the element
 * type automatically.
 *
 * ninputbuffer / inputbuffer describe the current line as WCHARs;
 * xlatebuffer holds the raw bytes of a line before codepage conversion.
 */
150 static int nungetstack
= 0;
151 static int allocungetstack
= 0;
152 static WCHAR
*ungetstack
= NULL
;
153 static int ninputbuffer
= 0;
154 static WCHAR
*inputbuffer
= NULL
;
155 static char *xlatebuffer
= NULL
;
157 #define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
160 * Fill the input buffer with *one* line of input.
161 * The line is '\n' terminated so that scanning
162 * messages with translation works as expected
163 * (otherwise we cannot pre-translate because the
164 * language is first known one line before the
/*
 * Read the next line of input into inputbuffer as WCHARs.
 *
 * Two paths are visible: a byte-oriented one that reads a line with
 * fgets() into xlatebuffer and converts it with cp_mbstowcs() using
 * codepage_def, and a unicode one that reads 16-bit units with fread(),
 * byte-swapping them when the detected file byte order differs from the
 * host's. Read errors are reported through xyyerror().
 * get_unichar() tests the return value for zero.
 * NOTE(review): the condition selecting between the two paths, the
 * early returns and the final return statement are not visible in this
 * excerpt.
 */
167 static int fill_inputbuffer(void)
170 static char err_fatalread
[] = "Fatal: reading input failed";
171 static int endian
= -1;
/*
 * inputbuffer holds WCHARs, and both cp_mbstowcs() and the unicode read
 * loop below treat INPUTBUFFER_SIZE as a number of elements, so the
 * allocation must be sized in elements rather than bytes. Allocating
 * only INPUTBUFFER_SIZE bytes lets a long line write past the end of
 * the buffer.
 */
175 inputbuffer
= xmalloc(INPUTBUFFER_SIZE
* sizeof(*inputbuffer
));
176 xlatebuffer
= xmalloc(INPUTBUFFER_SIZE
);
/* Byte-oriented input: read one line, then convert it to WCHARs. */
183 cptr
= fgets(xlatebuffer
, INPUTBUFFER_SIZE
, yyin
);
184 if(!cptr
&& ferror(yyin
))
185 xyyerror(err_fatalread
);
188 assert(codepage_def
!= NULL
);
189 n
= cp_mbstowcs(codepage_def
, 0, xlatebuffer
, strlen(xlatebuffer
)+1, inputbuffer
, INPUTBUFFER_SIZE
);
191 internal_error(__FILE__
, __LINE__
, "Could not translate to unicode (%d)", n
);
193 goto try_again
; /* Should not happen */
194 n
--; /* Strip added conversion '\0' from input length */
197 * Detect UTF-8 in the first time we read some bytes by
198 * checking the special sequence "FE..." or something like
199 * that. I need to check www.unicode.org for details.
/*
 * Unicode input, first read: fetch 8 bytes (four 16-bit units) and
 * guess the byte order from whether each unit, taken as-is or
 * byte-swapped, fits in 8 bits.
 */
206 n
= fread(inputbuffer
, 1, 8, yyin
);
209 if(!n
&& ferror(yyin
))
210 xyyerror(err_fatalread
);
212 xyyerror("Fatal: file to short to determine byteorder (should never happen)");
214 if(isisochar(inputbuffer
[0]) &&
215 isisochar(inputbuffer
[1]) &&
216 isisochar(inputbuffer
[2]) &&
217 isisochar(inputbuffer
[3]))
219 #ifdef WORDS_BIGENDIAN
222 endian
= WMC_BO_LITTLE
;
225 else if(isisochar(BYTESWAP_WORD(inputbuffer
[0])) &&
226 isisochar(BYTESWAP_WORD(inputbuffer
[1])) &&
227 isisochar(BYTESWAP_WORD(inputbuffer
[2])) &&
228 isisochar(BYTESWAP_WORD(inputbuffer
[3])))
230 #ifdef WORDS_BIGENDIAN
231 endian
= WMC_BO_LITTLE
;
237 xyyerror("Fatal: cannot determine file's byteorder");
239 * Determine the file-endian with the leader-bytes
240 * "FF FE..."; can't remember the exact sequence.
243 #ifdef WORDS_BIGENDIAN
244 if(endian
== WMC_BO_LITTLE
)
246 if(endian
== WMC_BO_BIG
)
249 inputbuffer
[0] = BYTESWAP_WORD(inputbuffer
[0]);
250 inputbuffer
[1] = BYTESWAP_WORD(inputbuffer
[1]);
251 inputbuffer
[2] = BYTESWAP_WORD(inputbuffer
[2]);
252 inputbuffer
[3] = BYTESWAP_WORD(inputbuffer
[3]);
/*
 * Unicode input, later reads: fetch one 16-bit unit at a time, up to
 * INPUTBUFFER_SIZE units, swapping when needed and checking each unit
 * for '\n'.
 */
260 for(i
= 0; i
< INPUTBUFFER_SIZE
; i
++)
263 t
= fread(&inputbuffer
[i
], 2, 1, yyin
);
264 if(!t
&& ferror(yyin
))
265 xyyerror(err_fatalread
);
269 #ifdef WORDS_BIGENDIAN
270 if(endian
== WMC_BO_LITTLE
)
272 if(endian
== WMC_BO_BIG
)
275 if((inputbuffer
[i
] = BYTESWAP_WORD(inputbuffer
[i
])) == '\n')
280 if(inputbuffer
[i
] == '\n')
290 yywarning("Re-read line (input was or converted to zilch)");
291 goto try_again
; /* Should not happen, but could be due to stdin reading and a signal */
/*
 * Return the next scanner character.
 * Visible paths: an entry popped from ungetstack, or the next WCHAR of
 * the line buffer (read through the static cursor b) masked to 16 bits.
 * fill_inputbuffer() is called to refill the buffer and its result is
 * tested for zero.
 * NOTE(review): the conditions selecting each path and the value
 * returned when the refill yields nothing are not visible in this
 * excerpt (presumably EOF, which the scanner compares against) --
 * confirm.
 */
298 static int get_unichar(void)
300 static WCHAR
*b
= NULL
;
304 return ungetstack
[--nungetstack
];
308 if(!fill_inputbuffer())
314 return (int)(*b
++ & 0xffff);
/*
 * Push ch back so that get_unichar() returns it again.
 * The stack grows by 32 entries whenever it is full; ch is converted to
 * WCHAR before being stored.
 */
317 static void unget_unichar(int ch
)
324 if(nungetstack
== allocungetstack
)
326 allocungetstack
+= 32;
327 ungetstack
= xrealloc(ungetstack
, allocungetstack
* sizeof(*ungetstack
));
330 ungetstack
[nungetstack
++] = (WCHAR
)ch
;
335 * Normal character stack.
336 * Used for number scanning.
/*
 * Growable stack of narrow characters: ncharstack is the number of
 * entries in use, alloccharstack the allocated capacity.
 */
338 static int ncharstack
= 0;
339 static int alloccharstack
= 0;
340 static char *charstack
= NULL
;
342 static void empty_char_stack(void)
/*
 * Append ch (truncated to char) to the stack, growing the allocation by
 * 32 entries when it is full.
 */
347 static void push_char(int ch
)
349 if(ncharstack
== alloccharstack
)
351 alloccharstack
+= 32;
352 charstack
= xrealloc(charstack
, alloccharstack
* sizeof(*charstack
));
354 charstack
[ncharstack
++] = (char)ch
;
/*
 * Return the most recently pushed character as a value in 0..0xff.
 * NOTE(review): any guard for an empty stack is not visible in this
 * excerpt.
 */
357 static int tos_char_stack(void)
362 return (int)(charstack
[ncharstack
-1] & 0xff);
/*
 * NOTE(review): body not visible in this excerpt; scan_number() passes
 * the result to strtoul(), so it is presumably a NUL-terminated string.
 */
365 static char *get_char_stack(void)
371 * Unicode character stack.
372 * Used for general scanner.
/*
 * Growable stack of WCHARs: nunicharstack is the number of entries in
 * use, allocunicharstack the allocated capacity.
 */
374 static int nunicharstack
= 0;
375 static int allocunicharstack
= 0;
376 static WCHAR
*unicharstack
= NULL
;
378 static void empty_unichar_stack(void)
/*
 * Append ch (converted to WCHAR) to the stack, growing the allocation
 * by 128 entries when it is full.
 */
383 static void push_unichar(int ch
)
385 if(nunicharstack
== allocunicharstack
)
387 allocunicharstack
+= 128;
388 unicharstack
= xrealloc(unicharstack
, allocunicharstack
* sizeof(*unicharstack
));
390 unicharstack
[nunicharstack
++] = (WCHAR
)ch
;
/*
 * Return the most recently pushed character as a value in 0..0xffff.
 * NOTE(review): any guard for an empty stack is not visible in this
 * excerpt.
 */
394 static int tos_unichar_stack(void)
399 return (int)(unicharstack
[nunicharstack
-1] & 0xffff);
/*
 * NOTE(review): body not visible in this excerpt; the scanner passes
 * the result to unistrcmp(), xunistrdup() and lookup_token().
 */
403 static WCHAR
*get_unichar_stack(void)
411 * state | ch | next state
412 * ------+-----------------+--------------------------
415 * 0 | . | error (should never occur)
418 * 1 | [89a-wyzA-WYZ_] | error invalid digit
420 * 2 | [0-9a-fA-F] | 2
421 * 2 | [g-zG-Z_] | error invalid hex digit
422 * 2 | . | return (hex-number) if TOS != [xX] else error
424 * 3 | [89a-zA-Z_] | error invalid octal digit
425 * 3 | . | return (octal-number)
427 * 4 | [a-zA-Z_] | error invalid decimal digit
428 * 4 | . | return (decimal-number)
430 * All non-identifier characters [^a-zA-Z_0-9] terminate the scan
431 * and return the value. This is not entirely correct, but close
432 * enough (should check punctuators as trailing context, but the
433 * char_table is not adapted to that and it is questionable whether
434 * it is worth the trouble).
435 * All non-iso-8859-1 characters are an error.
/*
 * Scan a numeric literal whose first character is ch.
 * The visible branches classify each character: 'x'/'X' after the
 * leading digit, octal digits '0'-'7', and letters or '_' which are
 * reported as invalid digits through xyyerror(). The collected text is
 * converted with strtoul() in the selected base and stored in
 * yylval.num.
 * NOTE(review): isalpha()/isalnum()/isxdigit() are only defined for
 * values representable as unsigned char or EOF; the check that rejects
 * wider characters is presumably the "Invalid digit" error at the top,
 * whose condition is not visible in this excerpt -- confirm.
 * NOTE(review): the state variable, the loop and the return statement
 * are not visible in this excerpt.
 */
437 static int scan_number(int ch
)
446 xyyerror("Invalid digit");
460 internal_error(__FILE__
, __LINE__
, "Non-digit in first number-scanner state");
463 if(ch
== 'x' || ch
== 'X')
468 else if(ch
>= '0' && ch
<= '7')
473 else if(isalpha(ch
) || ch
== '_')
474 xyyerror("Invalid number digit");
/*
 * Hex state: besides a letter or '_', it is also an error when the top
 * of the character stack is not a hex digit.
 */
485 else if(isalpha(ch
) || ch
== '_' || !isxdigit(tos_char_stack()))
486 xyyerror("Invalid hex digit");
494 if(ch
>= '0' && ch
<= '7')
496 else if(isalnum(ch
) || ch
== '_')
497 xyyerror("Invalid octal digit");
507 else if(isalnum(ch
) || ch
== '_')
508 xyyerror("Invalid decimal digit");
516 internal_error(__FILE__
, __LINE__
, "Invalid state in number-scanner");
523 yylval
.num
= strtoul(get_char_stack(), NULL
, base
);
527 static void newline(void)
/*
 * Comparison callback used with qsort() and bsearch() on the token
 * table: orders two token_t entries by their name fields using
 * unistricmp() (presumably a case-insensitive compare, matching the
 * file's statement that keywords are case insensitive).
 */
533 static int unisort(const void *p1
, const void *p2
)
535 return unistricmp(((token_t
*)p1
)->name
, ((token_t
*)p2
)->name
);
/*
 * Table of known tokens and its entry count. add_token() keeps it
 * sorted with unisort() so that it can be searched with bsearch().
 */
538 static token_t
*tokentable
= NULL
;
539 static int ntokentable
= 0;
/*
 * Find the token table entry for the name s.
 * Returns whatever bsearch() yields: a pointer to the matching entry,
 * or NULL when there is none.
 * NOTE(review): the local key `tok` is set up on lines not visible in
 * this excerpt; presumably its name field is set to s.
 */
541 token_t
*lookup_token(const WCHAR
*s
)
546 return (token_t
*)bsearch(&tok
, tokentable
, ntokentable
, sizeof(*tokentable
), unisort
);
/*
 * Add an entry to the token table.
 * The table is resized to ntokentable entries, the last slot is filled
 * from the arguments, and the whole table is re-sorted with unisort().
 * The name and alias pointers are stored as given, not copied.
 * NOTE(review): the statement that increments ntokentable before the
 * resize is not visible in this excerpt.
 */
549 void add_token(tok_e type
, const WCHAR
*name
, int tok
, int cp
, const WCHAR
*alias
, int fix
)
552 tokentable
= xrealloc(tokentable
, ntokentable
* sizeof(*tokentable
));
553 tokentable
[ntokentable
-1].type
= type
;
554 tokentable
[ntokentable
-1].name
= name
;
555 tokentable
[ntokentable
-1].token
= tok
;
556 tokentable
[ntokentable
-1].codepage
= cp
;
557 tokentable
[ntokentable
-1].alias
= alias
;
558 tokentable
[ntokentable
-1].fixed
= fix
;
/* Keep the table ordered for the bsearch() in lookup_token(). */
559 qsort(tokentable
, ntokentable
, sizeof(*tokentable
), unisort
);
562 void get_tokentable(token_t
**tab
, int *len
)
576 static WCHAR ustr_dot1
[] = { '.', '\n', 0 };
577 static WCHAR ustr_dot2
[] = { '.', '\r', '\n', 0 };
578 static int isinit
= 0;
584 set_codepage(WMC_DEFAULT_CODEPAGE
);
585 add_token(tok_keyword
, ustr_codepages
, tCODEPAGE
, 0, NULL
, 0);
586 add_token(tok_keyword
, ustr_facility
, tFACILITY
, 0, NULL
, 1);
587 add_token(tok_keyword
, ustr_facilitynames
, tFACNAMES
, 0, NULL
, 1);
588 add_token(tok_keyword
, ustr_language
, tLANGUAGE
, 0, NULL
, 1);
589 add_token(tok_keyword
, ustr_languagenames
, tLANNAMES
, 0, NULL
, 1);
590 add_token(tok_keyword
, ustr_messageid
, tMSGID
, 0, NULL
, 1);
591 add_token(tok_keyword
, ustr_messageidtypedef
, tTYPEDEF
, 0, NULL
, 1);
592 add_token(tok_keyword
, ustr_outputbase
, tBASE
, 0, NULL
, 1);
593 add_token(tok_keyword
, ustr_severity
, tSEVERITY
, 0, NULL
, 1);
594 add_token(tok_keyword
, ustr_severitynames
, tSEVNAMES
, 0, NULL
, 1);
595 add_token(tok_keyword
, ustr_symbolicname
, tSYMNAME
, 0, NULL
, 1);
596 add_token(tok_severity
, ustr_error
, 0x03, 0, NULL
, 0);
597 add_token(tok_severity
, ustr_warning
, 0x02, 0, NULL
, 0);
598 add_token(tok_severity
, ustr_informational
, 0x01, 0, NULL
, 0);
599 add_token(tok_severity
, ustr_success
, 0x00, 0, NULL
, 0);
600 add_token(tok_facility
, ustr_application
, 0xFFF, 0, NULL
, 0);
601 add_token(tok_facility
, ustr_system
, 0x0FF, 0, NULL
, 0);
602 add_token(tok_language
, ustr_english
, 0x409, 437, ustr_msg00001
, 0);
605 empty_unichar_stack();
611 while((ch
= get_unichar()) != '\n')
614 xyyerror("Unexpected EOF");
620 if(!unistrcmp(ustr_dot1
, get_unichar_stack()) || !unistrcmp(ustr_dot2
, get_unichar_stack()))
623 /* Reset the codepage to our default after each message */
624 set_codepage(WMC_DEFAULT_CODEPAGE
);
627 yylval
.str
= xunistrdup(get_unichar_stack());
652 while(n
< 8 && isisochar(ch
))
654 int t
= char_table
[ch
];
655 if((t
& CH_PUNCT
) || !(t
& CH_SHORTNAME
))
665 yylval
.str
= xunistrdup(get_unichar_stack());
669 if(char_table
[ch
] & CH_IDENT
)
672 while(isisochar(ch
) && (char_table
[ch
] & (CH_IDENT
|CH_NUMBER
)))
679 if(!(tok
= lookup_token(get_unichar_stack())))
681 yylval
.str
= xunistrdup(get_unichar_stack());
690 codepage
= tok
->codepage
;
698 internal_error(__FILE__
, __LINE__
, "Invalid token type encountered");
702 if(isspace(ch
)) /* Ignore space */
706 return scan_number(ch
);
718 while(ch
!= '\n' && ch
!= EOF
)
724 push_unichar(ch
); /* Include the newline */
726 yylval
.str
= xunistrdup(get_unichar_stack());
729 xyyerror("Invalid character '%c' (0x%04x)", isisochar(ch
) && isprint(ch
) ? ch
: '.', ch
);