more fixes to scanner
[k8jam.git] / src / scan.c
blob35f831062779d4c8d37566a22c054a8597a73a43
1 /*
2 * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
4 * This file is part of Jam - see jam.c for Copyright information.
5 */
6 /*
7 * scan.c - the jam yacc scanner
9 * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
10 * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
11 * Also handle tokens abutting EOF by remembering
12 * to return EOF no matter how many times yylex()
13 * reinvokes yyline().
14 * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
15 * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
16 * defined before Linux's yacc tries to redefine it.
17 * 01/10/01 (seiwald) - \ can now escape any whitespace char
18 * 11/04/02 (seiwald) - const-ing for string literals
20 #include "jam.h"
21 #include "lists.h"
22 #include "parse.h"
23 #include "scan.h"
24 #include "jamgram.h"
25 #include "jambase.h"
26 #include "newstr.h"
27 #include "kstrings.h"
30 const struct keyword {
31 const char *word;
32 int type;
33 } keywords[] = {
34 #include "jamgramtab.h"
35 {0,0}
/*
 * One entry of the chain of sources being scanned.  The scanner reads
 * either from an in-memory list of strings or from a file, one line at
 * a time.
 */
struct include {
  struct include *next;  /* next serial include file */
  const char *string;    /* pointer into current line */
  char **strings;        /* for yyfparse() -- text to parse */
  FILE *file;            /* for yyfparse() -- file being read */
  const char *fname;     /* for yyfparse() -- file name */
  int line;              /* line counter for error messages */
  char buf[512];         /* for yyfparse() -- line buffer */
};
49 static struct include *incp = 0; /* current file; head of chain */
51 static int scan_mode = SCAN_NORMAL;
52 static int any_errors = 0;
53 static char *symdump (YYSTYPE *s);
55 /* no single token can be larger */
56 #define BIGGEST_TOKEN (10240)
/*
 * Printable names of the scanner modes, indexed by scan mode value.
 * NOTE(review): presumably listed in the same order as the SCAN_*
 * constants are numbered in scan.h -- confirm when adding a mode.
 */
static const char *mnames[] = {
  "SCAN_NORMAL",
  "SCAN_BEFORE_STRING",
  "SCAN_STRING",
  "SCAN_PUNCT",
  "SCAN_PUNCT_BS",
};
70 * Set parser mode: normal, string, or keyword
72 void yymode (int n) {
73 if (n == SCAN_PUNCT && scan_mode == SCAN_BEFORE_STRING) n = SCAN_PUNCT_BS;
74 else if (n == SCAN_NORMAL && scan_mode == SCAN_PUNCT_BS) n = SCAN_BEFORE_STRING;
75 #ifndef NDEBUG
76 if (DEBUG_SCAN) {
77 if (scan_mode != n) printf("**MODE TRANSITION: %s --> %s\n", mnames[scan_mode], mnames[n]);
79 #endif
80 scan_mode = n;
84 void yyerror (const char *s) {
85 if (incp) printf("%s: line %d: ", incp->fname, incp->line);
86 printf("%s at %s\n", s, symdump(&yylval));
87 ++any_errors;
91 int yyanyerrors (void) {
92 return (any_errors != 0);
96 void yyfparse (const char *s) {
97 struct include *i = (struct include *)malloc(sizeof(*i));
98 /* push this onto the incp chain */
99 i->string = "";
100 i->strings = 0;
101 i->file = 0;
102 i->fname = copystr(s);
103 i->line = 0;
104 i->next = incp;
105 incp = i;
106 /* if the filename is "::Jambase", it means use the internal jambase */
107 if (strcmp(s, "::Jambase") == 0) {
108 jambaseUnpack();
109 i->strings = jambase;
115 * yyline() - read new line and return first character
117 * fabricates a continuous stream of characters across include files, returning EOF at the bitter end
119 int yyline (void) {
120 struct include *i = incp;
121 if (!incp) return EOF;
122 /* once we start reading from the input stream, we reset the
123 * include insertion point so that the next include file becomes
124 * the head of the list */
125 /* if there is more data in this line, return it */
126 if (*i->string) return *i->string++;
127 /* if we're reading from an internal string list, go to the next string */
128 if (i->strings) {
129 if (!*i->strings) goto next;
130 ++i->line;
131 i->string = *(i->strings++);
132 return *i->string++;
134 /* if necessary, open the file */
135 if (!i->file) {
136 FILE *f = stdin;
137 if (strcmp(i->fname, "-") && !(f = fopen(i->fname, "r"))) perror(i->fname);
138 i->file = f;
140 /* if there's another line in this file, start it */
141 if (i->file && fgets(i->buf, sizeof(i->buf), i->file)) {
142 ++i->line;
143 i->string = i->buf;
144 return *i->string++;
146 next:
147 /* this include is done */
148 /* free it up and return EOF so yyparse() returns to parse_file() */
149 incp = i->next;
150 /* close file, free name */
151 if (i->file && i->file != stdin) fclose(i->file);
152 freestr(i->fname);
153 free(i);
154 return EOF;
/*
 * yylex() - set yylval to current token; return its type
 *
 * Macros to move things along:
 *
 *  yychar() - return and advance character; invalid after EOF
 *  yyprev() - back up one character; invalid before yychar()
 *
 * yychar() returns a continuous stream of characters, until it hits
 * the EOF of the current include file.
 *
 * The character is read as unsigned char so that bytes >= 0x80 are never
 * negative (and byte 0xFF is never confused with EOF).
 */
#define yychar() (*incp->string ? (unsigned char)*incp->string++ : yyline())
#define yyprev() (incp->string--)
/*
 * skip_spaces() - eat white space and '#' comments
 *
 * c is the character already read.  Returns the first character that is
 * neither white space nor part of a comment, or EOF.  A comment runs from
 * '#' up to the end of the line.
 */
static int skip_spaces (int c) {
  for (;;) {
    /* skip past white space; the cast keeps the isspace() argument in range */
    while (c != EOF && isspace((unsigned char)c)) c = yychar();
    /* not a comment? then this is the start of a token (or EOF) */
    if (c != '#') break;
    /* swallow up the comment line */
    while ((c = yychar()) != EOF && c != '\n') ;
  }
  return c;
}
/*
 * digit() - numeric value of character c in the given base
 *
 * Accepts '0'-'9' and letters of either case ('a'/'A' is 10, and so on).
 * Returns the digit value, or -1 if c is EOF, is not a digit character,
 * or its value is not below base.
 */
static int digit (int c, int base) {
  int value;
  if (c == EOF) return -1;
  if (c >= '0' && c <= '9') {
    value = c - '0';
  } else if (c >= 'a' && c <= 'z') {
    value = c - 'a' + 10;
  } else if (c >= 'A' && c <= 'Z') {
    value = c - 'A' + 10;
  } else {
    return -1;
  }
  return value < base ? value : -1;
}
196 int yylex (void) {
197 static char buf[BIGGEST_TOKEN];
198 const struct keyword *k;
199 char *b = buf;
200 int c;
201 if (!incp) goto eof;
202 /* get first character (whitespace or of token) */
203 c = yychar();
204 if (scan_mode == SCAN_STRING) {
205 tKString s;
206 /* if scanning for a string (action's {}'s), look for the closing brace */
207 /* we handle matching braces, if they match! */
208 int nest = 1;
209 /* skip spaces and newline */
211 while (c != EOF && c != '\n' && isspace(c)) c = yychar();
212 if (c == '\n') c = yychar();
214 /*while (c != EOF && isspace(c)) c = yychar();*/
215 kStringNew(&s);
216 /* collect string */
217 while (c != EOF && b < buf+sizeof(buf)) {
218 if (c == '{') ++nest;
219 if (c == '}' && !--nest) break;
220 /* *b++ = c; */
221 kStringPushBack(&s, c);
222 c = yychar();
224 /* we ate the ending brace -- regurgitate it */
225 if (c != EOF) yyprev();
226 /* check obvious errors */
227 /* if (b == buf+sizeof(buf)) { yyerror("action block too big"); goto eof; } */
228 if (nest) { kStringFree(&s); yyerror("unmatched {} in action block"); goto eof; }
229 /* *b = 0; */
230 /* remove trailing newlines and spaces, add one newline */
231 /*strcpy(buf+nest, "\n");*/
233 nest = kStringLen(&s);
234 while (nest > 0 && isspace(kStringCStr(&s)[nest-1])) kStringPopBack(&s);
235 kStringAppendCStr(&s, "\n");
237 yylval.type = T_STRING;
238 yylval.string = newstr(kStringCStr(&s));
239 kStringFree(&s);
240 /*fprintf(stderr, "::: [%s]\n", yylval.string);*/
241 yymode(SCAN_NORMAL);
242 } else {
243 int in_quote = 0, not_keyword = 0, was_not_alnum = 0;
244 int n, d;
245 c = skip_spaces(c);
246 /* c now points to the first character of a token */
247 if (c == EOF) goto eof;
248 /* while scanning the word, disqualify it for (expensive)
249 * keyword lookup when we can: $anything, "anything", \anything */
250 not_keyword = (c == '$');
251 if (strchr("{}[];", c)) {
252 *b++ = c;
253 goto lexdone;
255 if (c == ':') {
256 /* only ':abc' is good, ':*' is not */
257 *b++ = ':';
258 c = yychar();
259 if (c == EOF || isspace(c) || isalnum(c) || c == '$' || c == '_') goto lexdoneback;
261 /* look for white space to delimit word */
262 /* "'s get stripped but preserve white space */
263 /* \ protects next character */
264 for (; c != EOF && b < buf+sizeof(buf) && (in_quote || !isspace(c)); c = yychar()) {
265 if (c == '"') {
266 /* begin or end " */
267 in_quote = !in_quote;
268 not_keyword = 1;
269 continue;
271 if (!in_quote) {
272 /* k8: allow specials to work as delimiters */
273 if (strchr("{};", c)) break;
274 if (!not_keyword) {
275 if (strchr("[]", c)) break; /* only in keywords; to allow things like $(a[2]) */
276 if (!was_not_alnum && c == ':') break; /* '*:' is not good */
279 if (c != '\\') {
280 /* normal char */
281 if (!isalnum(c)) was_not_alnum = 1;
282 *b++ = c;
283 continue;
285 /* screened char */
286 if ((c = yychar()) == EOF) break;
287 was_not_alnum = 1;
288 if (in_quote) {
289 switch (c) {
290 case 't': *b++ = '\t'; break;
291 case 'n': *b++ = '\n'; break;
292 case 'r': *b++ = '\r'; break;
293 case 'v': *b++ = '\v'; break;
294 case 'b': *b++ = '\b'; break;
295 case 'a': *b++ = '\a'; break;
296 case 'f': *b++ = '\f'; break;
297 case 'e': *b++ = '\x1b'; break;
298 case 'x':
299 c = yychar(); // first digit
300 n = digit(c, 16);
301 if (n < 0) { yyerror("invalid hex escape in quoted string"); goto eof; }
302 c = yychar(); // second digit
303 d = digit(c, 16);
304 if (d < 0) { if (c != EOF) yyprev(); } else n = (n*16)+d;
305 if (n == 0) { yyerror("invalid hex escape in quoted string"); goto eof; }
306 *b++ = n;
307 break;
308 //TODO: add '\uXXXX'?
309 default:
310 if (isalnum(c)) { yyerror("invalid escape in quoted string"); goto eof; }
311 *b++ = c;
312 break;
314 } else {
315 *b++ = c;
317 not_keyword = 1;
319 /* we looked ahead a character -- back up */
320 lexdoneback:
321 if (c != EOF) yyprev();
322 lexdone:
323 /* check obvious errors */
324 if (b == buf+sizeof(buf)) { yyerror("string too big"); goto eof; }
325 if (in_quote) { yyerror("unmatched \" in string"); goto eof; }
326 *b = 0;
327 /* scan token table */
328 /* don't scan if it's obviously not a keyword or if its */
329 /* an alphabetic when we were looking for punctuation */
330 yylval.type = T_ARG;
331 if (!not_keyword && !((scan_mode == SCAN_PUNCT || scan_mode == SCAN_PUNCT_BS) && isalnum(*buf))) {
332 /* find token */
333 for (k = keywords; k->word; ++k) {
334 if (strcmp(k->word, buf) == 0) {
335 yylval.type = k->type;
336 yylval.string = k->word; /* used by symdump */
337 break;
341 if (yylval.type == T_ARG) {
342 yylval.string = newstr(buf);
343 if (strcmp(buf, "on") == 0) {
344 printf("\"ON\": not_keyword=%d; scan_mode=%s\n", not_keyword, mnames[scan_mode]);
348 if (DEBUG_SCAN) printf("scan %s\n", symdump(&yylval));
349 return yylval.type;
350 eof:
351 yylval.type = EOF;
352 return yylval.type;
356 static char *symdump (YYSTYPE *s) {
357 static char buf[BIGGEST_TOKEN+20];
358 switch (s->type) {
359 case EOF: snprintf(buf, sizeof(buf), "EOF"); break;
360 case 0: snprintf(buf, sizeof(buf), "unknown symbol %s", s->string); break;
361 case T_ARG: snprintf(buf, sizeof(buf), "argument %s", s->string); break;
362 case T_STRING: snprintf(buf, sizeof(buf), "string \"%s\"", s->string); break;
363 default: snprintf(buf, sizeof(buf), "keyword %s", s->string); break;
365 return buf;
369 void yystatetrans (int tk) {
370 if (scan_mode == SCAN_BEFORE_STRING) {
371 switch (tk) {
372 case T_LBRACE_t: yymode(SCAN_STRING); break;
374 } else {
375 switch (tk) {
376 case T_ACTIONS_t: yymode(SCAN_BEFORE_STRING); break;
377 case T_LBRACKET_t: yymode(SCAN_NORMAL); break;