fixed some indexing bugs in Jambase
[k8jam.git] / src / scan.c
blobaf0d3df8b3377e1ebe9592a07c72cc526196b79f
1 /*
2 * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
4 * This file is part of Jam - see jam.c for Copyright information.
5 */
6 /*
7 * scan.c - the jam yacc scanner
9 * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
10 * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
11 * Also handle tokens abutting EOF by remembering
12 * to return EOF no matter how many times yylex()
13 * reinvokes yyline().
14 * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
15 * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
16 * defined before Linux's yacc tries to redefine it.
17 * 01/10/01 (seiwald) - \ can now escape any whitespace char
18 * 11/04/02 (seiwald) - const-ing for string literals
20 #include "jam.h"
21 #include "lists.h"
22 #include "parse.h"
23 #include "scan.h"
24 #include "jamgram.h"
25 #include "jambase.h"
26 #include "newstr.h"
27 #include "dstrings.h"
30 token_t yylval;
33 typedef struct {
34 const char *word;
35 int type;
36 } keyword_t;
39 static const keyword_t keywords[] = {
40 #include "jamgramtab.h"
41 {0,0}
/* state of one input source on the include chain (see yyfparse() and yychar()) */
typedef struct include_s {
  struct include_s *next; /* next serial include file */
  const char *string; /* pointer into current line */
  char **strings; /* for yyfparse() -- text to parse */
  FILE *file; /* for yyfparse() -- file being read */
  const char *fname; /* for yyfparse() -- file name */
  int line; /* line counter for error messages */
  int pos; /* position for error messages */
  int back_count; /* # of yyunget()ed chars */
  char back_chars[2]; /* buffer for yyunget()ed chars */
  char *fcontents; /* for yyfparse() -- file contents */
  int prevwasn; /* !0: increment line and reset to 0 */
} include_t;

static include_t *incp = NULL; /* current file; head of chain */
62 static int scan_mode = SCAN_NORMAL;
63 /*static int any_errors = 0;*/
65 static const char *symdump (const token_t *s);
68 #ifndef NDEBUG
69 static const char *mnames[] = {
70 "SCAN_NORMAL",
71 "SCAN_STRING",
72 "SCAN_PUNCT",
74 #endif
78 * Set parser mode: normal, string, or keyword
80 void yymode (int n) {
81 #ifndef NDEBUG
82 if (DEBUG_SCAN && scan_mode != n) printf("**MODE TRANSITION: %s --> %s\n", mnames[scan_mode], mnames[n]);
83 #endif
84 scan_mode = n;
88 void yyerror (const token_t *tk, const char *s) {
89 printf("ERROR(%d:%d) '%s': %s\n", tk->line, tk->pos, tk->file, s);
90 exit(EXITBAD); /* exit now */
94 static void yywarning_ex (const char *s) {
95 printf("WARNING(%d:%d) '%s': %s\n", incp->line, incp->pos, incp->fname, s);
99 void yyfparse (const char *s) {
100 include_t *i = (include_t *)malloc(sizeof(*i));
101 /* push this onto the incp chain */
102 i->string = "";
103 i->strings = NULL;
104 i->file = NULL;
105 //i->fname = strdup(s);
106 i->fname = newstr(s);
107 i->line = 0;
108 i->pos = 0;
109 i->next = incp;
110 i->back_count = 0;
111 i->fcontents = NULL;
112 i->prevwasn = 1;
113 incp = i;
114 /* if the filename is "::Jambase", it means use the internal jambase */
115 if (strcmp(s, "::Jambase") == 0) {
116 jambase_unpack();
117 i->strings = jambase;
123 * yychar() - read new line and return first character
125 * fabricates a continuous stream of characters across include files, returning EOF at the bitter end
127 static int yychar (void) {
128 include_t *i = incp;
129 if (!incp) return EOF;
130 /* once we start reading from the input stream, we reset the
131 * include insertion point so that the next include file becomes
132 * the head of the list */
133 if (i->back_count) return i->back_chars[--i->back_count];
134 /* if there is more data in this line, return it */
135 if (i->prevwasn) { i->prevwasn = 0; ++i->line; i->pos = 0; }
136 again:
137 ++i->pos;
138 if (*i->string) {
139 if (*i->string == '\n') i->prevwasn = 1;
140 return *i->string++;
142 /* if we're reading from an internal string list, go to the next string */
143 if (i->strings) {
144 if (!*i->strings) goto next;
145 i->string = *(i->strings++);
146 return *i->string++;
148 /* if necessary, open the file and get file contents */
149 if (!i->file) {
150 FILE *f;
151 long fsize;
152 if ((f = fopen(i->fname, "rb")) == NULL) perror(i->fname);
153 i->file = f;
154 if (fseek(f, 0, SEEK_END) < 0) perror(i->fname);
155 if ((fsize = ftell(f)) < 0) perror(i->fname);
156 if (fseek(f, 0, SEEK_SET) < 0) perror(i->fname);
157 i->fcontents = calloc(fsize+1, 1);
158 if (fsize > 0 && fread(i->fcontents, fsize, 1, f) != 1) perror(i->fname);
159 fclose(f); /* don't need to hold it opened */
160 i->string = i->fcontents;
161 goto again;
163 next:
164 /* this include is done */
165 /* free it up and return EOF so yyparse() returns to parse_file() */
166 incp = i->next;
167 /* close file, free name */
168 if (i->fcontents != NULL) free(i->fcontents);
169 //if (i->fname != NULL) free(i->fname);
170 free(i);
171 return EOF;
176 * yychar() - back up one character
178 static inline void yyunget (int c) {
179 if (c != EOF) {
180 if (incp->back_count >= 2) { fprintf(stderr, "yyunget: too much!\n"); abort(); }
181 incp->back_chars[incp->back_count++] = c;
186 /* eat white space */
187 static int skip_spaces (int c) {
188 for (;;) {
189 /* skip past white space */
190 while (c != EOF && isspace(c)) {
191 yylval.line = incp->line;
192 yylval.pos = incp->pos;
193 c = yychar();
195 /* not a comment? swallow up comment line */
196 if (c != '#') break;
197 while ((c = yychar()) != EOF && c != '\n') ;
199 return c;
/*
 * digit() - numeric value of character c in the given base
 *
 * '0'..'9' map to 0..9; letters map to 10..35 regardless of case.
 * Returns -1 for EOF, for any other character, and for a digit whose value
 * is not below base.
 */
static int digit (int c, int base) {
  int value;
  if (c >= '0' && c <= '9') {
    value = c-'0';
  } else if (c >= 'A' && c <= 'Z') {
    value = c-'A'+10;
  } else if (c >= 'a' && c <= 'z') {
    value = c-'a'+10;
  } else {
    /* EOF and every non-alphanumeric character end up here */
    return -1;
  }
  return (value < base ? value : -1);
}
/*
 * remove_indent() - strip the first `indent` characters from every line
 * of text, in place
 *
 * textlen includes trailing zero.
 * Lines shorter than indent are left untouched. Nothing is done when
 * indent is not positive.
 */
static void remove_indent (char *text, int textlen, int indent) {
  if (indent <= 0) return;
  while (*text) {
    char *eol = strchr(text, '\n');
    /* last line without a newline: treat the terminator as the line end */
    if (eol == NULL) eol = text+textlen-1;
    if (eol-text >= indent) {
      /* shift the rest of the buffer (terminator included) left */
      textlen -= indent;
      eol -= indent;
      memmove(text, text+indent, textlen);
    }
    if (!eol[0]) break;
    /* step over the newline to the next line */
    textlen -= eol+1-text;
    text = eol+1;
  }
}
232 static inline const keyword_t *find_keyword (const char *nbuf, size_t nblen) {
233 if (nblen > 0) {
234 for (const keyword_t *k = keywords; k->word; ++k) if (strncmp(k->word, nbuf, nblen) == 0 && k->word[nblen] == 0) return k;
236 return NULL;
/*
 * yylex() - set yylval to current token; return its type
 */

/* append one character to the token buffer, growing it when fewer than two
 * free bytes remain; the new size is rounded up to a multiple of 2048.
 * aborts when out of memory */
#define PUSH_CHAR(_c) do { \
  if (sbused+1 >= sbsize) { \
    int newsz = ((sbused+1)|0x7ff)+1; \
    char *nb = realloc(sbuf, newsz); \
    if (nb == NULL) { fprintf(stderr, "FATAL: out of memory!\n"); abort(); } \
    sbuf = nb; \
    sbsize = newsz; \
  } \
  sbuf[sbused++] = (_c); \
} while (0)

static char *sbuf = NULL; /* token buffer, reused between yylex() calls */
static int sbsize = 0; /* allocated size of sbuf */
static int sbused; /* bytes used in sbuf; yylex() resets it for every token */
/*
 * scan_varaccess() - copy the rest of a variable access into the token buffer
 *
 * "$(" already scanned and pushed
 * return char after ")"
 *
 * The access is scanned in three stages: the variable name, an optional
 * "[...]" index part, and the selector part after ':'. Nested "$(" are
 * handled by recursion. In the selector part, '"' and '\'' open quoted runs
 * in which ')' does not end the access, and backslash protects the next
 * character except inside single quotes. Returns EOF if input ends first.
 */
int scan_varaccess (void) {
  int c = yychar(), qch = 0, oc;
  if (c == EOF) return c;
  /* scan variable name */
  while (c != EOF && c != '[' && c != ':') {
    PUSH_CHAR(c);
    oc = c;
    c = yychar();
    /* the character just pushed closed the access */
    if (oc == ')') return c;
    if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
  }
  if (c == EOF) return c;
  /* scan indexing; 'c' is not pushed */
  if (c == '[') {
    while (c != EOF && c != ']') {
      PUSH_CHAR(c);
      oc = c;
      c = yychar();
      if (oc == ')') return c;
      if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
    }
    /* find either selector or ')' */
    while (c != EOF && c != ':') {
      PUSH_CHAR(c);
      oc = c;
      c = yychar();
      if (oc == ')') return c;
      if (oc == '$' && c == '(') { PUSH_CHAR(c); c = scan_varaccess(); }
    }
    if (c == EOF) return c;
  }
  /* scan selectors; 'c' is not pushed */
  while (c != EOF) {
    if (qch != '\'' && c == '\\') {
      /* screening: keep the backslash and the character it protects */
      PUSH_CHAR(c);
      if ((c = yychar()) == EOF) break;
      PUSH_CHAR(c);
      c = yychar();
      continue;
    }
    PUSH_CHAR(c);
    oc = c;
    c = yychar();
    /* opening quote */
    if (!qch && (oc == '"' || oc == '\'')) { qch = oc; continue; }
    /* unquoted ')' closes the access */
    if (!qch && oc == ')') return c;
    /* nested access; not expanded inside single quotes */
    if (qch != '\'' && oc == '$' && c == '(') {
      PUSH_CHAR(c);
      c = scan_varaccess();
      continue;
    }
    /* closing quote; "''" inside single quotes does not close the run */
    if (qch && oc == qch) {
      if (!(qch == '\'' && c == '\'')) qch = 0;
      continue;
    }
  }
  return c;
}
322 int yylex (void) {
323 const keyword_t *kw;
324 int c;
325 sbused = 0;
326 yylval.strlit = 0;
327 if (!incp) goto eof;
328 yylval.strlit = 0; /* expand this string */
329 yylval.line = incp->line;
330 yylval.pos = incp->pos;
331 yylval.file = incp->fname;
332 /* get first character (whitespace or of token) */
333 c = yychar();
334 if (scan_mode == SCAN_STRING) {
335 /* if scanning for a string (action's {}'s), look for the closing brace */
336 /* we handle matching braces, if they match! */
337 int nest = 1, indent = -1, cind, bol;
338 /* skip spaces and newline */
339 while (c != EOF && c != '\n' && isspace(c)) c = yychar();
340 if (c == '\n') c = yychar();
341 /* collect string, caclucate indent */
342 cind = 0;
343 bol = 1;
344 while (c != EOF) {
345 if (c == '{') ++nest;
346 else if (c == '}' && !--nest) break;
347 /* indent calculation */
348 if (c == '\n') {
349 cind = 0;
350 bol = 1;
351 } else if (bol) {
352 if (isspace(c)) {
353 ++cind;
354 } else {
355 bol = 0;
356 if (indent < 0 || cind < indent) indent = cind;
359 PUSH_CHAR(c);
360 c = yychar();
362 /* we ate the ending brace -- regurgitate it */
363 if (c != EOF) yyunget(c);
364 /* check obvious errors */
365 if (nest) { yyerror(&yylval, "unmatched {} in action block"); goto eof; }
366 /* remove trailing newlines and spaces, add one newline */
367 while (sbused > 0 && isspace(sbuf[sbused-1])) --sbused;
368 PUSH_CHAR('\n');
369 PUSH_CHAR(0);
370 if (indent > 0) {
371 //fprintf(stderr, "=== %d ===\n%s===\n", indent, sbuf);
372 remove_indent(sbuf, sbused, indent);
373 //fprintf(stderr, "--- %d ---\n%s---\n", indent, sbuf);
375 yylval.type = T_STRING;
376 yylval.string = newstr(sbuf);
377 yymode(SCAN_NORMAL);
378 } else {
379 int keyword = 0, qch = 0;
380 int n;
381 c = skip_spaces(c);
382 /* c now contains the first character of a token */
383 if (c == EOF) goto eof;
384 /* special thingy: single-quoted string */
385 if (c == '\'') {
386 for (c = yychar(); c != EOF; c = yychar()) {
387 if (c == '\'') {
388 /* check for special case: "''" */
389 if ((c = yychar()) != '\'') {
390 if (c != EOF && !isspace(c)) yyunget(c);
391 break;
394 PUSH_CHAR(c);
396 PUSH_CHAR(0);
397 yylval.type = T_ARG;
398 yylval.strlit = 1; /* don't expand this string */
399 yylval.string = newstr(sbuf);
400 goto lexret;
402 /* 'normal' mode */
403 keyword = (scan_mode == SCAN_NORMAL && isalpha(c)) || (scan_mode == SCAN_PUNCT && !isalnum(c)); /* maybe */
404 //if (DEBUG_SCAN) printf("mode: %d; char: '%c'; keyword: %d\n", scan_mode, c, keyword);
405 /* look for white space to delimit word */
406 /* \ protects next character */
407 for (; c != EOF; c = yychar()) {
408 /* check if this is var access */
409 if (c == '$') {
410 keyword = 0;
411 PUSH_CHAR(c);
412 if ((c = yychar()) == EOF) break;
413 if (c == '(') {
414 PUSH_CHAR(c);
415 c = scan_varaccess();
416 yyunget(c);
417 continue;
420 /* check for some common bugs */
421 if (!qch && c == '(') {
422 int nc = yychar();
423 yyunget(nc);
424 if (nc == '$') yywarning_ex("\"($\" -- maybe you want \"$(\" instead?");
425 if (sbused > 0 && !isalnum(sbuf[sbused-1]) && (isalnum(nc) || nc == '_' || nc == '-')) yywarning_ex("\"(\" -- maybe you want \"$(\" instead?");
427 /* 'c' is not pushed yet */
428 if (!qch && scan_mode == SCAN_PUNCT) {
429 /* we are in list, the only possible keywords follows */
430 if (strchr("{}[];", c) != NULL) {
431 if (sbused == 0) {
432 keyword = 1;
433 PUSH_CHAR(c);
434 c = ' ';
436 break;
439 if (!qch && (isspace(c) || c == '\'')) break;
440 if (!qch && scan_mode == SCAN_NORMAL && c != '"' && c != '\'' && !isalnum(c)) {
441 /* check if this char (and possibly next) forms non-alnum token */
442 PUSH_CHAR(c);
443 if ((c = yychar()) != EOF) {
444 /* try 2-char tokens */
445 PUSH_CHAR(c);
446 if ((kw = find_keyword(sbuf+sbused-2, 2)) != NULL) {
447 if (sbused == 2) {
448 /* wow! token! */
449 yylval.type = kw->type;
450 yylval.string = kw->word; /* used by symdump */
451 goto lexret;
453 yywarning_ex("non-alpha token without whitespace");
454 /* return this 2 chars */
455 yyunget(sbuf[--sbused]);
456 yyunget(sbuf[--sbused]);
457 c = ' ';
458 break;
460 /* return one char back */
461 --sbused;
462 yyunget(c);
464 /* try 1-char token */
465 if (sbused > 1 && sbuf[sbused-1] == '=' && isalnum(sbuf[sbused-2])) goto skipkwone;
466 if (sbused == 1 && sbuf[sbused-1] == '!') {
467 int nc = yychar();
468 yyunget(nc);
469 if (isalnum(nc)) goto skipkwone;
471 if ((kw = find_keyword(sbuf+sbused-1, 1)) != NULL) {
472 if (sbused == 1) {
473 /* wow! token! */
474 yylval.type = kw->type;
475 yylval.string = kw->word; /* used by symdump */
476 goto lexret;
478 if (strchr("{}[];", sbuf[sbused-1]) == NULL) yywarning_ex("non-alpha token without whitespace");
479 /* return this char */
480 yyunget(sbuf[--sbused]);
481 c = ' ';
482 break;
484 skipkwone:
485 /* pop this char and process it as usual */
486 c = sbuf[--sbused];
488 /* check for quoting */
489 if (qch && c == qch) {
490 qch = 0;
491 continue;
493 if (!qch && c == '"') {
494 keyword = 0;
495 qch = c;
496 continue;
498 /* screened char? */
499 if (c == '\\') {
500 keyword = 0;
501 if ((c = yychar()) == EOF) break;
502 if (qch) {
503 /* in string */
504 switch (c) {
505 case 'a': PUSH_CHAR('\a'); break;
506 case 'b': PUSH_CHAR('\b'); break;
507 case 'e': PUSH_CHAR('\x1b'); break;
508 case 'f': PUSH_CHAR('\f'); break;
509 case 'n': PUSH_CHAR('\n'); break;
510 case 'r': PUSH_CHAR('\r'); break;
511 case 't': PUSH_CHAR('\t'); break;
512 case 'v': PUSH_CHAR('\v'); break;
513 case 'x':
514 // first digit
515 if ((c = yychar()) == EOF) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
516 if ((n = digit(c, 16)) < 0) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
517 // second digit
518 if ((c = yychar()) != EOF) {
519 int d = digit(c, 16);
520 if (d < 0) yyunget(c); else n = (n*16)+d;
522 if (n == 0) { yyerror(&yylval, "invalid hex escape in quoted string"); goto eof; }
523 PUSH_CHAR(n);
524 break;
525 //TODO: add '\uXXXX'?
526 default:
527 if (isalnum(c)) { yyerror(&yylval, "invalid escape in quoted string"); goto eof; }
528 PUSH_CHAR(c);
529 break;
531 } else {
532 /* not in string */
533 PUSH_CHAR(c);
535 continue;
537 /* normal char */
538 if (scan_mode == SCAN_NORMAL) {
539 if (keyword && !isalpha(c)) keyword = 0;
540 } else if (scan_mode == SCAN_PUNCT) {
541 if (keyword && isalnum(c)) keyword = 0;
543 PUSH_CHAR(c);
545 /* we looked ahead a character -- back up */
546 /* don't return spaces, they will be skipped on next call anyway */
547 if (c != EOF && !isspace(c)) yyunget(c);
548 /* check obvious errors */
549 if (qch) { yyerror(&yylval, "unmatched \" in string"); goto eof; }
550 PUSH_CHAR(0);
551 /*if (DEBUG_SCAN) printf("keyword: %d; str='%s' (%d)\n", keyword, sbuf, sbused);*/
552 /* scan token table */
553 yylval.type = T_ARG;
554 if (keyword && sbused > 0) {
555 /* find token */
556 if ((kw = find_keyword(sbuf, sbused-1)) != NULL) {
557 yylval.type = kw->type;
558 yylval.string = kw->word; /* used by symdump */
561 if (yylval.type == T_ARG) yylval.string = newstr(sbuf);
563 lexret:
564 if (DEBUG_SCAN) printf("scan %s\n", symdump(&yylval));
565 return yylval.type;
566 eof:
567 yylval.type = 0; /* 0 is EOF for lemon */
568 return yylval.type;
571 #undef PUSH_CHAR
574 static const char *symdump (const token_t *s) {
575 static char *buf = NULL;
576 static int bufsz = 0;
577 int nsz;
578 if (s->type == EOF) return "EOF";
579 nsz = strlen(s->string)+128;
580 if (nsz > bufsz) {
581 char *nb = realloc(buf, nsz);
582 if (nb == NULL) { fprintf(stderr, "FATAL: out of memory!\n"); abort(); }
583 buf = nb;
584 bufsz = nsz;
586 switch (s->type) {
587 case 0: sprintf(buf, "unknown symbol <%s>", s->string); break;
588 case T_ARG: sprintf(buf, "argument <%s>", s->string); break;
589 case T_STRING: sprintf(buf, "string \"%s\"", s->string); break;
590 default: sprintf(buf, "keyword `%s`", s->string); break;
592 return buf;