src/scan.c

   1 /*
   2  * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
   3  *
   4  * This file is part of Jam - see jam.c for Copyright information.
   5  */
   6 /*
   7  * scan.c - the jam yacc scanner
   8  *
   9  * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
  10  * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
  11  *                      Also handle tokens abutting EOF by remembering
  12  *                      to return EOF no matter how many times yylex()
  13  *                      reinvokes yyline().
  14  * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
  15  * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
  16  *                      defined before Linux's yacc tries to redefine it.
  17  * 01/10/01 (seiwald) - \ can now escape any whitespace char
  18  * 11/04/02 (seiwald) - const-ing for string literals
  19  */
  20 #include "jam.h"
  21 #include "lists.h"
  22 #include "parse.h"
  23 #include "scan.h"
  24 #include "jamgram.h"
  25 #include "jambase.h"
  26 #include "newstr.h"
  27
  28
  29 const struct keyword {
  30   const char *word;
  31   int type;
  32 } keywords[] = {
  33 #include "jamgramtab.h"
  34   {0,0}
  35 };
  36
  37
  38 struct include {
  39   struct include *next; /* next serial include file */
  40   const char *string; /* pointer into current line */
  41   char **strings; /* for yyfparse() -- text to parse */
  42   FILE *file; /* for yyfparse() -- file being read */
  43   const char *fname; /* for yyfparse() -- file name */
  44   int line; /* line counter for error messages */
  45   char buf[512]; /* for yyfparse() -- line buffer */
  46 };
  47
  48 static struct include *incp = 0; /* current file; head of chain */
  49
  50 static int scan_mode = SCAN_NORMAL;
  51 static int any_errors = 0;
  52 static char *symdump (YYSTYPE *s);
  53
  54 /* no single token can be larger */
  55 #define BIGGEST_TOKEN  (10240)
  56
  57
  58 /*
  59  * Set parser mode: normal, string, or keyword
  60  */
  61 void yymode (int n) {
  62   scan_mode = n;
  63 }
  64
  65
  66 void yyerror (const char *s) {
  67   if (incp) printf("%s: line %d: ", incp->fname, incp->line);
  68   printf("%s at %s\n", s, symdump(&yylval));
  69   ++any_errors;
  70 }
  71
  72
  73 int yyanyerrors (void) {
  74   return (any_errors != 0);
  75 }
  76
  77
  78 void yyfparse (const char *s) {
  79   struct include *i = (struct include *)malloc(sizeof(*i));
  80   /* push this onto the incp chain */
  81   i->string = "";
  82   i->strings = 0;
  83   i->file = 0;
  84   i->fname = copystr(s);
  85   i->line = 0;
  86   i->next = incp;
  87   incp = i;
  88   /* if the filename is "::Jambase", it means use the internal jambase */
  89   if (strcmp(s, "::Jambase") == 0) {
  90     jambaseUnpack();
  91     i->strings = jambase;
  92   }
  93 }
  94
  95
  96 /*
  97  * yyline() - read new line and return first character
  98  *
  99  * fabricates a continuous stream of characters across include files, returning EOF at the bitter end
 100  */
 101 int yyline (void) {
 102   struct include *i = incp;
 103   if (!incp) return EOF;
 104   /* once we start reading from the input stream, we reset the
 105    * include insertion point so that the next include file becomes
 106    * the head of the list */
 107   /* if there is more data in this line, return it */
 108   if (*i->string) return *i->string++;
 109   /* if we're reading from an internal string list, go to the next string */
 110   if (i->strings) {
 111     if (!*i->strings) goto next;
 112     ++i->line;
 113     i->string = *(i->strings++);
 114     return *i->string++;
 115   }
 116   /* if necessary, open the file */
 117   if (!i->file) {
 118     FILE *f = stdin;
 119     if (strcmp(i->fname, "-") && !(f = fopen(i->fname, "r"))) perror(i->fname);
 120     i->file = f;
 121   }
 122   /* if there's another line in this file, start it */
 123   if (i->file && fgets(i->buf, sizeof(i->buf), i->file)) {
 124     ++i->line;
 125     i->string = i->buf;
 126     return *i->string++;
 127   }
 128 next:
 129   /* this include is done */
 130   /* free it up and return EOF so yyparse() returns to parse_file() */
 131   incp = i->next;
 132   /* close file, free name */
 133   if (i->file && i->file != stdin) fclose(i->file);
 134   freestr(i->fname);
 135   free(i);
 136   return EOF;
 137 }
 138
 139
 140 /*
 141  * yylex() - set yylval to current token; return its type
 142  *
 143  * Macros to move things along:
 144  *
 145  *  yychar() - return and advance character; invalid after EOF
 146  *  yyprev() - back up one character; invalid before yychar()
 147  *
 148  * yychar() returns a continuous stream of characters, until it hits
 149  * the EOF of the current include file.
 150  */
 151 #define yychar()  (*incp->string ? *incp->string++ : yyline())
 152 #define yyprev()  (incp->string--)
 153
 154
 155 /* eat white space */
 156 static int skip_spaces (int c) {
 157   for (;;) {
 158     /* skip past white space */
 159     while (c != EOF && isspace(c)) c = yychar();
 160     /* not a comment? swallow up comment line */
 161     if (c != '#') break;
 162     while ((c = yychar()) != EOF && c != '\n') ;
 163   }
 164   return c;
 165 }
 166
 167
 168 static int digit (int c, int base) {
 169   if (c == EOF) return -1;
 170   if (c >= 'a' && c <= 'z') c -= 32;
 171   if (c < '0' || (c > '9' && c < 'A') || c > 'Z') return -1;
 172   if ((c -= '0') > 9) c -= 7;
 173   if (c >= base) return -1;
 174   return c;
 175 }
 176
 177
 178 int yylex (void) {
 179   static char buf[BIGGEST_TOKEN];
 180   int c;
 181   if (!incp) goto eof;
 182   /* get first character (whitespace or of token) */
 183   c = yychar();
 184   if (scan_mode == SCAN_STRING) {
 185     /* if scanning for a string (action's {}'s), look for the closing brace */
 186     /* we handle matching braces, if they match! */
 187     char *b = buf;
 188     int nest = 1;
 189     while (c != EOF && b < buf+sizeof(buf)) {
 190       if (c == '{') ++nest;
 191       if (c == '}' && !--nest) break;
 192       *b++ = c;
 193       c = yychar();
 194     }
 195     /* we ate the ending brace -- regurgitate it */
 196     if (c != EOF) yyprev();
 197     /* check obvious errors */
 198     if (b == buf+sizeof(buf)) { yyerror("action block too big"); goto eof; }
 199     if (nest) { yyerror("unmatched {} in action block"); goto eof; }
 200     *b = 0;
 201     yylval.type = STRING;
 202     yylval.string = newstr(buf);
 203   } else {
 204     char *b = buf;
 205     const struct keyword *k;
 206     int inquote = 0, notkeyword = 0, n, d, wasNotAlNum = 0;
 207     c = skip_spaces(c);
 208     /* c now points to the first character of a token */
 209     if (c == EOF) goto eof;
 210     //printf(":'%c'\n", c);
 211 #if 0
 212     if (!isalpha(c) && c != '$' && c != '_' && c != '"' && c != '\'') {
 213       const struct keyword *kgood = NULL;
 214       /* special chars are delimiters */
 215       while (c != EOF) {
 216         *b++ = c;
 217         *b = 0;
 218         for (k = keywords; k->word != NULL; ++k) if (!isalpha(k->word[0]) && strcmp(buf, k->word) == 0) break;
 219         if (k->word != NULL) {
 220           /* good keyword */
 221           kgood = k;
 222           c = yychar();
 223           continue;
 224         }
 225         /* bad keyword */
 226         break;
 227       }
 228       if (c != EOF) --b; /* remove last char from token buffer */
 229       if (kgood != NULL) {
 230         /* ok, we got it */
 231         printf("![%s]\n", buf);
 232         goto lexdoneback;
 233       }
 234     }
 235     /* bad luck, try it another way */
 236 #else
 237     /* while scanning the word, disqualify it for (expensive)
 238      * keyword lookup when we can: $anything, "anything", \anything */
 239     notkeyword = (c == '$');
 240     if (c == '{' || c == '}' || c == ';' || c == '[' || c == ']') {
 241       *b++ = c;
 242       goto lexdone;
 243     }
 244     if (c == ':') {
 245       /* only ':abc' is good, ':*' is not */
 246       c = yychar();
 247       if (c == EOF || isspace(c) || isalnum(c) || c == '$' || c == '_') {
 248         *b++ = ':';
 249         goto lexdoneback;
 250       }
 251     }
 252 #endif
 253     /* look for white space to delimit word */
 254     /* "'s get stripped but preserve white space */
 255     /* \ protects next character */
 256     while (c != EOF && b < buf+sizeof(buf) && (inquote || !isspace(c))) {
 257       if (c == '"') {
 258         /* begin or end " */
 259         inquote = !inquote;
 260         notkeyword = 1;
 261       } else if (!inquote && (c == '{' || c == '}' || c == ';')) {
 262         /* k8: allow specials to work as delimiters */
 263         break;
 264       } else if (!inquote && !notkeyword && (c == '[' || c == ']')) {
 265         /* k8: allow specials to work as delimiters */
 266         break;
 267       } else if (!inquote && !notkeyword && !wasNotAlNum && c == ':') {
 268         /* k8: allow specials to work as delimiters; '*:' is not good */
 269         /**b = 0; printf("***OUT [%s]! %d\n", buf, incp?incp->line:0);*/
 270         break;
 271       } else if (c != '\\') {
 272         /* normal char */
 273         if (!isalnum(c)) wasNotAlNum = 1;
 274         *b++ = c;
 275       } else if ((c = yychar()) != EOF) {
 276         /* \c */
 277         wasNotAlNum = 1;
 278         if (inquote) {
 279           switch (c) {
 280             case 't': *b++ = '\t'; break;
 281             case 'n': *b++ = '\n'; break;
 282             case 'r': *b++ = '\r'; break;
 283             case 'v': *b++ = '\v'; break;
 284             case 'b': *b++ = '\b'; break;
 285             case 'a': *b++ = '\a'; break;
 286             case 'f': *b++ = '\f'; break;
 287             case 'e': *b++ = '\x1b'; break;
 288             case 'x':
 289               c = yychar(); // first digit
 290               n = digit(c, 16);
 291               if (n < 0) { yyerror("invalid hex escape in quoted string"); goto eof; }
 292               c = yychar(); // second digit
 293               d = digit(c, 16);
 294               if (d < 0) { if (c != EOF) yyprev(); } else n = (n*16)+d;
 295               if (n == 0) { yyerror("invalid hex escape in quoted string"); goto eof; }
 296               *b++ = n;
 297               break;
 298             default: *b++ = c; break;
 299           }
 300         } else {
 301           *b++ = c;
 302         }
 303         notkeyword = 1;
 304       } else {
 305         /* \EOF */
 306         break;
 307       }
 308       c = yychar();
 309     }
 310     /* we looked ahead a character - back up */
 311 lexdoneback:
 312     if (c != EOF) yyprev();
 313 lexdone:
 314     /* check obvious errors */
 315     if (b == buf+sizeof(buf)) { yyerror("string too big"); goto eof; }
 316     if (inquote) { yyerror("unmatched \" in string"); goto eof; }
 317     /* scan token table */
 318     /* don't scan if it's obviously not a keyword or if its */
 319     /* an alphabetic when were looking for punctuation */
 320     *b = 0;
 321     yylval.type = ARG;
 322     if (!notkeyword && !(isalpha(*buf) && scan_mode == SCAN_PUNCT)) {
 323       for (k = keywords; k->word; ++k) {
 324         if (*buf == *k->word && strcmp(k->word, buf) == 0) {
 325           yylval.type = k->type;
 326           yylval.string = k->word; /* used by symdump */
 327           break;
 328         }
 329       }
 330     }
 331     if (yylval.type == ARG) yylval.string = newstr(buf);
 332   }
 333   if (DEBUG_SCAN) printf("scan %s\n", symdump(&yylval));
 334   return yylval.type;
 335 eof:
 336   yylval.type = EOF;
 337   return yylval.type;
 338 }
 339
 340
 341 static char *symdump (YYSTYPE *s) {
 342   static char buf[BIGGEST_TOKEN+20];
 343   switch (s->type) {
 344     case EOF: snprintf(buf, sizeof(buf), "EOF"); break;
 345     case 0: snprintf(buf, sizeof(buf), "unknown symbol %s", s->string); break;
 346     case ARG: snprintf(buf, sizeof(buf), "argument %s", s->string); break;
 347     case STRING: snprintf(buf, sizeof(buf), "string \"%s\"", s->string); break;
 348     default: snprintf(buf, sizeof(buf), "keyword %s", s->string); break;
 349   }
 350   return buf;
 351 }