cosmetix
[k8jam.git] / src / scan.c
blobe2c3f65c144e45ebeeb80ee022f50953071fc999
1 /*
2 * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
4 * This file is part of Jam - see jam.c for Copyright information.
5 */
6 /*
7 * scan.c - the jam yacc scanner
9 * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
10 * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
11 * Also handle tokens abutting EOF by remembering
12 * to return EOF no matter how many times yylex()
13 * reinvokes yyline().
14 * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
15 * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
16 * defined before Linux's yacc tries to redefine it.
17 * 01/10/01 (seiwald) - \ can now escape any whitespace char
18 * 11/04/02 (seiwald) - const-ing for string literals
20 #include "jam.h"
21 #include "lists.h"
22 #include "parse.h"
23 #include "scan.h"
24 #include "jamgram.h"
25 #include "jambase.h"
26 #include "newstr.h"
27 #include "dstrings.h"
/* token most recently scanned: yylex() fills in its type, text and source position */
30 token_t yylval;
/* one entry of the keyword table: a spelling and the token type returned for it */
33 typedef struct {
34 const char *word; /* keyword spelling; NULL marks the end of the table */
35 int type; /* token type stored in yylval.type when the keyword matches */
36 } keyword_t;
/* keyword table: entries are pulled in from jamgramtab.h, followed by an
 * all-zero sentinel entry that find_keyword() uses to stop its search */
39 static const keyword_t keywords[] = {
40 #include "jamgramtab.h"
41 {0,0}
/* one input source (a file or the built-in string list) on the include chain */
45 typedef struct include_s {
46 struct include_s *next; /* next serial include file; becomes current when this one is exhausted */
47 const char *string; /* next unread character of the text being scanned */
48 char **strings; /* for yyfparse() -- NULL-terminated list of text chunks to parse, or NULL */
49 FILE *file; /* for yyfparse() -- set once the file was read; yychar() closes it right after loading */
50 const char *fname; /* for yyfparse() -- file name (from newstr()), used in diagnostics */
51 int line; /* line counter for error messages */
52 int pos; /* position for error messages */
53 int back_count; /* # of yyunget()ed chars */
54 char back_chars[2]; /* buffer for yyunget()ed chars */
55 char *fcontents; /* for yyfparse() -- whole file contents, NUL-terminated; freed when the include ends */
56 int prevwasn; /* !0: previous char was '\n' -- the next read bumps line, zeroes pos and clears this flag */
57 } include_t;
59 static include_t *incp = NULL; /* current file; head of chain */
/* current scanner mode (one of the SCAN_* values); changed only through yymode() */
62 static int scan_mode = SCAN_NORMAL;
63 /*static int any_errors = 0;*/
/* formats a token for the DEBUG_SCAN trace printed by yylex() */
65 static const char *symdump (const token_t *s);
68 #ifndef NDEBUG
/* printable mode names for the trace in yymode(); the array is indexed by the
 * mode value, so the order presumably mirrors the SCAN_* constants -- TODO confirm in scan.h */
69 static const char *mnames[] = {
70 "SCAN_NORMAL",
71 "SCAN_STRING",
72 "SCAN_PUNCT",
74 #endif
78 * Set parser mode: normal, string, or keyword
80 void yymode (int n) {
81 #ifndef NDEBUG
82 if (DEBUG_SCAN && scan_mode != n) printf("**MODE TRANSITION: %s --> %s\n", mnames[scan_mode], mnames[n]);
83 #endif
84 scan_mode = n;
88 void yyerror (const token_t *tk, const char *s) {
89 printf("ERROR(%d:%d) '%s': %s\n", tk->line, tk->pos, tk->file, s);
90 exit(EXITBAD); /* exit now */
94 static void yywarning_ex (const char *s) {
95 printf("WARNING(%d:%d) '%s': %s\n", incp->line, incp->pos, incp->fname, s);
99 void yyfparse (const char *s) {
100 include_t *i = (include_t *)malloc(sizeof(*i));
101 /* push this onto the incp chain */
102 i->string = "";
103 i->strings = NULL;
104 i->file = NULL;
105 //i->fname = strdup(s);
106 i->fname = newstr(s);
107 i->line = 0;
108 i->pos = 0;
109 i->next = incp;
110 i->back_count = 0;
111 i->fcontents = NULL;
112 i->prevwasn = 1;
113 incp = i;
114 /* if the filename is "::Jambase", it means use the internal jambase */
115 if (strcmp(s, "::Jambase") == 0) {
116 jambase_unpack();
117 i->strings = jambase;
123 * yychar() - read new line and return first character
125 * fabricates a continuous stream of characters across include files, returning EOF at the bitter end
127 static int yychar (void) {
128 include_t *i = incp;
129 if (!incp) return EOF;
130 /* once we start reading from the input stream, we reset the
131 * include insertion point so that the next include file becomes
132 * the head of the list */
133 if (i->back_count) return i->back_chars[--i->back_count];
134 /* if there is more data in this line, return it */
135 if (i->prevwasn) { i->prevwasn = 0; ++i->line; i->pos = 0; }
136 again:
137 ++i->pos;
138 if (*i->string) {
139 if (*i->string == '\n') i->prevwasn = 1;
140 return *i->string++;
142 /* if we're reading from an internal string list, go to the next string */
143 if (i->strings) {
144 if (!*i->strings) goto next;
145 i->string = *(i->strings++);
146 return *i->string++;
148 /* if necessary, open the file and get file contents */
149 if (!i->file) {
150 FILE *f;
151 long fsize;
152 if ((f = fopen(i->fname, "rb")) == NULL) perror(i->fname);
153 i->file = f;
154 if (fseek(f, 0, SEEK_END) < 0) perror(i->fname);
155 if ((fsize = ftell(f)) < 0) perror(i->fname);
156 if (fseek(f, 0, SEEK_SET) < 0) perror(i->fname);
157 i->fcontents = calloc(fsize+1, 1);
158 if (fsize > 0 && fread(i->fcontents, fsize, 1, f) != 1) perror(i->fname);
159 fclose(f); /* don't need to hold it opened */
160 i->string = i->fcontents;
161 goto again;
163 next:
164 /* this include is done */
165 /* free it up and return EOF so yyparse() returns to parse_file() */
166 incp = i->next;
167 /* close file, free name */
168 if (i->fcontents != NULL) free(i->fcontents);
169 //if (i->fname != NULL) free(i->fname);
170 free(i);
171 return EOF;
176 * yychar() - back up one character
178 static inline void yyunget (int c) {
179 if (c != EOF) {
180 if (incp->back_count >= 2) { fprintf(stderr, "yyunget: too much!\n"); abort(); }
181 incp->back_chars[incp->back_count++] = c;
186 /* eat white space */
187 static int skip_spaces (int c) {
188 for (;;) {
189 /* skip past white space */
190 while (c != EOF && isspace(c)) {
191 yylval.line = incp->line;
192 yylval.pos = incp->pos;
193 c = yychar();
195 /* not a comment? swallow up comment line */
196 if (c != '#') break;
197 while ((c = yychar()) != EOF && c != '\n') ;
199 return c;
/*
 * Convert character 'c' to its numeric value in the given base.
 * '0'..'9' map to 0..9, letters (either case) map to 10..35.
 * Returns -1 for EOF, for non-alphanumeric chars, and for values >= base.
 */
static int digit (int c, int base) {
  int value;
  if (c >= '0' && c <= '9') {
    value = c-'0';
  } else if (c >= 'A' && c <= 'Z') {
    value = c-'A'+10;
  } else if (c >= 'a' && c <= 'z') {
    value = c-'a'+10;
  } else {
    /* EOF and every other char end up here */
    return -1;
  }
  return (value < base ? value : -1);
}
/*
 * Strip the first 'indent' characters from every line of 'text', in place.
 * Lines shorter than 'indent' are left untouched.
 * 'textlen' is the length of the text including the trailing zero.
 */
static void remove_indent (char *text, int textlen, int indent) {
  if (indent <= 0) return;
  while (*text != '\0') {
    /* the last line may lack a newline: then it ends at the terminator */
    char *line_end = strchr(text, '\n');
    if (line_end == NULL) line_end = text+textlen-1;
    if (line_end-text >= indent) {
      /* shift the rest of the text (terminator included) left over the indent */
      memmove(text, text+indent, textlen-indent);
      textlen -= indent;
      line_end -= indent;
    }
    if (*line_end == '\0') break;
    /* continue right after the newline */
    ++line_end;
    textlen -= line_end-text;
    text = line_end;
  }
}
232 static inline const keyword_t *find_keyword (const char *nbuf, size_t nblen) {
233 if (nblen > 0) {
234 for (const keyword_t *k = keywords; k->word; ++k) if (strncmp(k->word, nbuf, nblen) == 0 && k->word[nblen] == 0) return k;
236 return NULL;
241 * yylex() - set yylval to current token; return its type
static char *sbuf = NULL; /* text of the token being collected */
static int sbsize = 0;    /* bytes allocated for sbuf */
static int sbused;        /* bytes of sbuf in use */

/* make sure sbuf can take one more char; grows in 2K steps, aborts when out of memory */
static inline void sbuf_make_room (void) {
  int wanted;
  char *grown;
  if (sbused+1 < sbsize) return;
  wanted = ((sbused+1)|0x7ff)+1;
  grown = realloc(sbuf, wanted);
  if (grown == NULL) { fprintf(stderr, "FATAL: out of memory!\n"); abort(); }
  sbuf = grown;
  sbsize = wanted;
}

/* append one char to the token buffer */
#define PUSH_CHAR(_c) do { \
  sbuf_make_room(); \
  sbuf[sbused++] = (_c); \
} while (0)
/*
 * scan_varaccess() - copy the rest of a "$(...)" variable reference into sbuf
 *
 * Reads characters with yychar() and appends them to the token buffer,
 * recursing for nested "$(" references.  Three parts are scanned in turn:
 * the name (up to '[' or ':'), an optional "[...]" index, and the selectors
 * that follow ':' (where quotes and backslash escapes are honoured).
 * Returns the lookahead character read after the closing ')', or EOF when
 * the input ends first; that character is not pushed to sbuf.
 */
260 /* "$(" already scanned and pushed */
261 /* return char after ")" */
262 int scan_varaccess (void) {
263 int c = yychar(), qch = 0, oc;
264 if (c == EOF) return c;
265 /* scan variable name */
266 while (c != EOF && c != '[' && c != ':') {
267 PUSH_CHAR(c);
268 oc = c;
269 c = yychar();
/* the ')' was just pushed: 'c' is already the char after it */
270 if (oc == ')') return c;
/* nested reference: the recursive call returns the char after its ')' */
271 if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
273 if (c == EOF) return c;
274 /* scan indexing; 'c' is not pushed */
275 if (c == '[') {
276 while (c != EOF && c != ']') {
277 PUSH_CHAR(c);
278 oc = c;
279 c = yychar();
280 if (oc == ')') return c;
281 if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
283 /* find either selector or ')' */
284 while (c != EOF && c != ':') {
285 PUSH_CHAR(c);
286 oc = c;
287 c = yychar();
288 if (oc == ')') return c;
289 if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
291 if (c == EOF) return c;
293 /* scan selectors; 'c' is not pushed */
/* qch holds the currently open quote char (0 when not inside quotes) */
294 while (c != EOF) {
295 if (qch != '\'' && c == '\\') {
296 /* screening */
/* outside single quotes a backslash and the char after it are both copied as is */
297 PUSH_CHAR(c);
298 if ((c = yychar()) == EOF) break;
299 PUSH_CHAR(c);
300 c = yychar();
301 continue;
303 PUSH_CHAR(c);
304 oc = c;
305 c = yychar();
/* opening quote */
306 if (!qch && (oc == '"' || oc == '\'')) { qch = oc; continue; }
/* an unquoted ')' ends the reference */
307 if (!qch && oc == ')') return c;
/* nested references are not recognized inside single quotes */
308 if (qch != '\'' && oc == '$' && c == '(') {
309 PUSH_CHAR(c);
310 c = scan_varaccess();
311 continue;
/* matching quote char: leave the quoted state, unless it is a ' directly followed by another ' */
313 if (qch && oc == qch) {
314 if (!(qch == '\'' && c == '\'')) qch = 0;
315 continue;
318 return c;
/*
 * yylex() - scan the next token of the current include into yylval
 *
 * Returns the token type, which is also stored in yylval.type:
 *   T_STRING - the body of an action block (only in SCAN_STRING mode);
 *   T_ARG    - a word or quoted string; yylval.strlit is set for '...' strings;
 *   a type from the keywords table when the word is a keyword;
 *   0        - end of input.
 * The token text is collected in sbuf and stored in yylval.string.
 */
322 int yylex (void) {
323 const keyword_t *kw;
324 int c;
/* start with an empty token buffer */
325 sbused = 0;
326 yylval.strlit = 0;
327 if (!incp) goto eof;
328 yylval.strlit = 0; /* expand this string */
329 yylval.line = incp->line;
330 yylval.pos = incp->pos;
331 yylval.file = incp->fname;
332 /* get first character (whitespace or of token) */
333 c = yychar();
334 if (scan_mode == SCAN_STRING) {
335 /* if scanning for a string (action's {}'s), look for the closing brace */
336 /* we handle matching braces, if they match! */
/* nest starts at 1 -- presumably the opening '{' was consumed before the mode switch; TODO confirm */
/* indent: smallest leading-whitespace count over the non-blank lines, -1 until one is seen */
/* cind: whitespace count of the current line so far; bol: still inside leading whitespace */
337 int nest = 1, indent = -1, cind, bol;
338 /* skip spaces and newline */
339 while (c != EOF && c != '\n' && isspace(c)) c = yychar();
340 if (c == '\n') c = yychar();
341 /* collect string, calculate indent */
342 cind = 0;
343 bol = 1;
344 while (c != EOF) {
345 if (c == '{') ++nest;
346 else if (c == '}' && !--nest) break;
347 /* indent calculation */
348 if (c == '\n') {
349 cind = 0;
350 bol = 1;
351 } else if (bol) {
352 if (isspace(c)) {
353 ++cind;
354 } else {
355 bol = 0;
356 if (indent < 0 || cind < indent) indent = cind;
359 PUSH_CHAR(c);
360 c = yychar();
362 /* we ate the ending brace -- regurgitate it */
363 if (c != EOF) yyunget(c);
364 /* check obvious errors */
365 if (nest) { yyerror(&yylval, "unmatched {} in action block"); goto eof; }
366 /* remove trailing newlines and spaces, add one newline */
367 while (sbused > 0 && isspace(sbuf[sbused-1])) --sbused;
368 PUSH_CHAR('\n');
369 PUSH_CHAR(0);
/* strip the common indentation; sbused counts the terminating zero here */
370 if (indent > 0) {
371 //fprintf(stderr, "=== %d ===\n%s===\n", indent, sbuf);
372 remove_indent(sbuf, sbused, indent);
373 //fprintf(stderr, "--- %d ---\n%s---\n", indent, sbuf);
375 yylval.type = T_STRING;
376 yylval.string = newstr(sbuf);
/* string mode lasts for a single token */
377 yymode(SCAN_NORMAL);
378 } else {
/* keyword: the word may still turn out to be a keyword; qch: open quote char or 0 */
379 int keyword = 0, qch = 0;
380 int n;
381 c = skip_spaces(c);
382 /* c now contains the first character of a token */
383 if (c == EOF) goto eof;
384 /* special thingy: single-quoted string */
385 if (c == '\'') {
386 for (c = yychar(); c != EOF; c = yychar()) {
387 if (c == '\'') {
388 /* check for special case: "''" */
/* a lone quote ends the string; a whitespace char right after it is dropped, anything else is pushed back */
389 if ((c = yychar()) != '\'') {
390 if (c != EOF && !isspace(c)) yyunget(c);
391 break;
/* for "''" the second quote falls through and is stored as one literal quote */
394 PUSH_CHAR(c);
396 PUSH_CHAR(0);
397 yylval.type = T_ARG;
398 yylval.strlit = 1; /* don't expand this string */
399 yylval.string = newstr(sbuf);
400 goto lexret;
402 /* 'normal' mode */
/* keyword candidates: words starting with a letter in SCAN_NORMAL, with a non-alphanumeric char in SCAN_PUNCT */
403 keyword = (scan_mode == SCAN_NORMAL && isalpha(c)) || (scan_mode == SCAN_PUNCT && !isalnum(c)); /* maybe */
404 //if (DEBUG_SCAN) printf("mode: %d; char: '%c'; keyword: %d\n", scan_mode, c, keyword);
405 /* look for white space to delimit word */
406 /* \ protects next character */
407 for (; c != EOF; c = yychar()) {
408 /* check if this is var access */
409 if (c == '$') {
410 keyword = 0;
411 PUSH_CHAR(c);
412 if ((c = yychar()) == EOF) break;
413 if (c == '(') {
414 PUSH_CHAR(c);
/* copy the whole "$(...)" reference; the returned lookahead char is pushed back for the next iteration */
415 c = scan_varaccess();
416 yyunget(c);
417 continue;
419 if (isalnum(c) || c == '_' || c == '-' || c == '<' || c == '>') yywarning_ex("\"$x\" -- maybe you want \"$(x\" instead?");
421 /* check for some common bugs */
422 if (!qch && c == '(') {
/* peek at the next char without consuming it */
423 int nc = yychar();
424 yyunget(nc);
425 if (nc == '$') yywarning_ex("\"($\" -- maybe you want \"$(\" instead?");
426 if (((sbused > 0 && !isalnum(sbuf[sbused-1])) || (sbused == 0)) &&
427 (isalnum(nc) || nc == '_' || nc == '-' || nc == '<' || nc == '>')) yywarning_ex("\"(x\" -- maybe you want \"$(x\" instead?");
429 /* 'c' is not pushed yet */
430 if (!qch && scan_mode == SCAN_PUNCT) {
431 /* we are in list, the only possible keywords follows */
432 if (strchr("{}[];", c) != NULL) {
/* at the start of a word the char is the token itself; otherwise it ends the word and is pushed back after the loop */
433 if (sbused == 0) {
434 keyword = 1;
435 PUSH_CHAR(c);
436 c = ' ';
438 break;
/* an unquoted word ends at whitespace or at the start of a single-quoted string */
441 if (!qch && (isspace(c) || c == '\'')) break;
442 if (!qch && scan_mode == SCAN_NORMAL && c != '"' && c != '\'' && !isalnum(c)) {
443 /* check if this char (and possibly next) forms non-alnum token */
444 PUSH_CHAR(c);
445 if ((c = yychar()) != EOF) {
446 /* try 2-char tokens */
447 PUSH_CHAR(c);
448 if ((kw = find_keyword(sbuf+sbused-2, 2)) != NULL) {
449 if (sbused == 2) {
450 /* wow! token! */
451 yylval.type = kw->type;
452 yylval.string = kw->word; /* used by symdump */
453 goto lexret;
/* the token is glued to a preceding word: return the word now, rescan the token next time */
455 yywarning_ex("non-alpha token without whitespace");
456 /* return this 2 chars */
457 yyunget(sbuf[--sbused]);
458 yyunget(sbuf[--sbused]);
459 c = ' ';
460 break;
462 /* return one char back */
463 --sbused;
464 yyunget(c);
466 /* try 1-char token */
/* an '=' directly after an alphanumeric char stays inside the word */
467 if (sbused > 1 && sbuf[sbused-1] == '=' && isalnum(sbuf[sbused-2])) goto skipkwone;
/* a leading '!' directly followed by an alphanumeric char, '-' or '_' stays inside the word */
468 if (sbused == 1 && sbuf[sbused-1] == '!') {
469 int nc = yychar();
470 yyunget(nc);
471 if (isalnum(nc) || nc == '-' || nc == '_') goto skipkwone;
473 if ((kw = find_keyword(sbuf+sbused-1, 1)) != NULL) {
474 if (sbused == 1) {
475 /* wow! token! */
476 yylval.type = kw->type;
477 yylval.string = kw->word; /* used by symdump */
478 goto lexret;
480 if (strchr("{}[];", sbuf[sbused-1]) == NULL) yywarning_ex("non-alpha token without whitespace");
481 /* return this char */
482 yyunget(sbuf[--sbused]);
483 c = ' ';
484 break;
486 skipkwone:
487 /* pop this char and process it as usual */
488 c = sbuf[--sbused];
490 /* check for quoting */
/* the quote chars themselves are not stored in the token */
491 if (qch && c == qch) {
492 qch = 0;
493 continue;
495 if (!qch && c == '"') {
496 keyword = 0;
497 qch = c;
498 continue;
500 /* screened char? */
501 if (c == '\\') {
502 keyword = 0;
503 if ((c = yychar()) == EOF) break;
504 if (qch) {
505 /* in string */
506 switch (c) {
507 case 'a': PUSH_CHAR('\a'); break;
508 case 'b': PUSH_CHAR('\b'); break;
509 case 'e': PUSH_CHAR('\x1b'); break;
510 case 'f': PUSH_CHAR('\f'); break;
511 case 'n': PUSH_CHAR('\n'); break;
512 case 'r': PUSH_CHAR('\r'); break;
513 case 't': PUSH_CHAR('\t'); break;
514 case 'v': PUSH_CHAR('\v'); break;
/* \x takes one or two hex digits; a zero value is rejected */
515 case 'x':
516 // first digit
517 if ((c = yychar()) == EOF) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
518 if ((n = digit(c, 16)) < 0) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
519 // second digit
520 if ((c = yychar()) != EOF) {
521 int d = digit(c, 16);
522 if (d < 0) yyunget(c); else n = (n*16)+d;
524 if (n == 0) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
525 PUSH_CHAR(n);
526 break;
527 //TODO: add '\uXXXX'?
/* other letters and digits are errors; any other escaped char stands for itself */
528 default:
529 if (isalnum(c)) { yyerror(&yylval, "invalid escape in quoted string"); goto eof; }
530 PUSH_CHAR(c);
531 break;
533 } else {
534 /* not in string */
535 PUSH_CHAR(c);
537 continue;
539 /* normal char */
/* a char of the wrong class means the word cannot be a keyword any more */
540 if (scan_mode == SCAN_NORMAL) {
541 if (keyword && !isalpha(c)) keyword = 0;
542 } else if (scan_mode == SCAN_PUNCT) {
543 if (keyword && isalnum(c)) keyword = 0;
545 PUSH_CHAR(c);
547 /* we looked ahead a character -- back up */
548 /* don't return spaces, they will be skipped on next call anyway */
549 if (c != EOF && !isspace(c)) yyunget(c);
550 /* check obvious errors */
551 if (qch) { yyerror(&yylval, "unmatched \" in string"); goto eof; }
552 PUSH_CHAR(0);
553 /*if (DEBUG_SCAN) printf("keyword: %d; str='%s' (%d)\n", keyword, sbuf, sbused);*/
554 /* scan token table */
555 yylval.type = T_ARG;
556 if (keyword && sbused > 0) {
557 /* find token */
/* sbused-1: the terminating zero is not part of the word */
558 if ((kw = find_keyword(sbuf, sbused-1)) != NULL) {
559 yylval.type = kw->type;
560 yylval.string = kw->word; /* used by symdump */
563 if (yylval.type == T_ARG) yylval.string = newstr(sbuf);
565 lexret:
566 if (DEBUG_SCAN) printf("scan %s\n", symdump(&yylval));
567 return yylval.type;
568 eof:
569 yylval.type = 0; /* 0 is EOF for lemon */
570 return yylval.type;
573 #undef PUSH_CHAR
576 static const char *symdump (const token_t *s) {
577 static char *buf = NULL;
578 static int bufsz = 0;
579 int nsz;
580 if (s->type == EOF) return "EOF";
581 nsz = strlen(s->string)+128;
582 if (nsz > bufsz) {
583 char *nb = realloc(buf, nsz);
584 if (nb == NULL) { fprintf(stderr, "FATAL: out of memory!\n"); abort(); }
585 buf = nb;
586 bufsz = nsz;
588 switch (s->type) {
589 case 0: sprintf(buf, "unknown symbol <%s>", s->string); break;
590 case T_ARG: sprintf(buf, "argument <%s>", s->string); break;
591 case T_STRING: sprintf(buf, "string \"%s\"", s->string); break;
592 default: sprintf(buf, "keyword `%s`", s->string); break;
594 return buf;