contrib/awk/lex.c

   1 /****************************************************************
   2 Copyright (C) Lucent Technologies 1997
   3 All Rights Reserved
   4
   5 Permission to use, copy, modify, and distribute this software and
   6 its documentation for any purpose and without fee is hereby
   7 granted, provided that the above copyright notice appear in all
   8 copies and that both that the copyright notice and this
   9 permission notice and warranty disclaimer appear in supporting
  10 documentation, and that the name Lucent Technologies or any of
  11 its entities not be used in advertising or publicity pertaining
  12 to distribution of the software without specific, written prior
  13 permission.
  14
  15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  22 THIS SOFTWARE.
  23 ****************************************************************/
  24
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <ctype.h>
  29 #include "awk.h"
  30 #include "awkgram.tab.h"
  31
  32 extern YYSTYPE  yylval;         /* semantic value of the current token; filled in by the lexer routines below */
  33 extern bool     infunc;         /* defined elsewhere; tested by yylex() and word() before isarg() and for FUNC/RETURN checks */
  34
  35 int     lineno  = 1;            /* current input line: bumped on newlines in yylex()/string(), decremented by unput('\n') */
  36 int     bracecnt = 0;           /* +1 for each '{', -1 for each '}' seen by yylex(); negative => "extra }" */
  37 int     brackcnt  = 0;          /* +1 for each '[', -1 for each ']' seen by yylex(); negative => "extra ]" */
  38 int     parencnt = 0;           /* +1 for each '(', -1 for each ')' seen by yylex(); negative => "extra )" */
  39
  40 typedef struct Keyword {        /* one row of the reserved-word table searched by word() */
  41         const char *word;       /* spelling; compared with strcmp() in binsearch() */
  42         int     sub;            /* copied into yylval.i by word() when the entry matches */
  43         int     type;           /* token code word() returns for this entry */
  44 } Keyword;
  45
  46 const Keyword keywords[] = {    /* keep sorted in strcmp() order (upper case sorts first): binary searched by word() */
             /* spelling       sub (-> yylval.i)  type (token returned) */
  47         { "BEGIN",      XBEGIN,         XBEGIN },
  48         { "END",        XEND,           XEND },
  49         { "NF",         VARNF,          VARNF },        /* word() also creates the "NF" symbol for this one */
  50         { "atan2",      FATAN,          BLTIN },        /* BLTIN rows: sub tells the builtins apart */
  51         { "break",      BREAK,          BREAK },
  52         { "close",      CLOSE,          CLOSE },
  53         { "continue",   CONTINUE,       CONTINUE },
  54         { "cos",        FCOS,           BLTIN },
  55         { "delete",     DELETE,         DELETE },
  56         { "do",         DO,             DO },
  57         { "else",       ELSE,           ELSE },
  58         { "exit",       EXIT,           EXIT },
  59         { "exp",        FEXP,           BLTIN },
  60         { "fflush",     FFLUSH,         BLTIN },
  61         { "for",        FOR,            FOR },
  62         { "func",       FUNC,           FUNC },         /* "func" and "function" are synonyms */
  63         { "function",   FUNC,           FUNC },
  64         { "getline",    GETLINE,        GETLINE },
  65         { "gsub",       GSUB,           GSUB },
  66         { "if",         IF,             IF },
  67         { "in",         IN,             IN },
  68         { "index",      INDEX,          INDEX },
  69         { "int",        FINT,           BLTIN },
  70         { "length",     FLENGTH,        BLTIN },
  71         { "log",        FLOG,           BLTIN },
  72         { "match",      MATCHFCN,       MATCHFCN },
  73         { "next",       NEXT,           NEXT },
  74         { "nextfile",   NEXTFILE,       NEXTFILE },
  75         { "print",      PRINT,          PRINT },
  76         { "printf",     PRINTF,         PRINTF },
  77         { "rand",       FRAND,          BLTIN },
  78         { "return",     RETURN,         RETURN },
  79         { "sin",        FSIN,           BLTIN },
  80         { "split",      SPLIT,          SPLIT },
  81         { "sprintf",    SPRINTF,        SPRINTF },
  82         { "sqrt",       FSQRT,          BLTIN },
  83         { "srand",      FSRAND,         BLTIN },
  84         { "sub",        SUB,            SUB },
  85         { "substr",     SUBSTR,         SUBSTR },
  86         { "system",     FSYSTEM,        BLTIN },        /* rejected by word() when "safe" is set */
  87         { "tolower",    FTOLOWER,       BLTIN },
  88         { "toupper",    FTOUPPER,       BLTIN },
  89         { "while",      WHILE,          WHILE },
  90 };
  91
  92 #define RET(x)  { if(dbg)printf("lex %s\n", tokname(x)); return(x); }   /* return token x from the enclosing function, printing its name first when dbg is set; expands to a braced block, not an expression */
  93
  /*
   * peek: report the next input character without consuming it.
   * The character is fetched with input() and handed straight back
   * through unput(); the value returned is whatever input() produced
   * (0 at end of input).
   */
  static int peek(void)
  {
          const int ahead = input();

          unput(ahead);
          return ahead;
  }
 100
 101 static int gettok(char **pbuf, int *psz)        /* get next input token */
 102 {
 103         int c, retc;
 104         char *buf = *pbuf;
 105         int sz = *psz;
 106         char *bp = buf;
 107
 108         c = input();
 109         if (c == 0)
 110                 return 0;
 111         buf[0] = c;
 112         buf[1] = 0;
 113         if (!isalnum(c) && c != '.' && c != '_')
 114                 return c;
 115
 116         *bp++ = c;
 117         if (isalpha(c) || c == '_') {   /* it's a varname */
 118                 for ( ; (c = input()) != 0; ) {
 119                         if (bp-buf >= sz)
 120                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
 121                                         FATAL( "out of space for name %.10s...", buf );
 122                         if (isalnum(c) || c == '_')
 123                                 *bp++ = c;
 124                         else {
 125                                 *bp = 0;
 126                                 unput(c);
 127                                 break;
 128                         }
 129                 }
 130                 *bp = 0;
 131                 retc = 'a';     /* alphanumeric */
 132         } else {        /* maybe it's a number, but could be . */
 133                 char *rem;
 134                 /* read input until can't be a number */
 135                 for ( ; (c = input()) != 0; ) {
 136                         if (bp-buf >= sz)
 137                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
 138                                         FATAL( "out of space for number %.10s...", buf );
 139                         if (isdigit(c) || c == 'e' || c == 'E'
 140                           || c == '.' || c == '+' || c == '-')
 141                                 *bp++ = c;
 142                         else {
 143                                 unput(c);
 144                                 break;
 145                         }
 146                 }
 147                 *bp = 0;
 148                 strtod(buf, &rem);      /* parse the number */
 149                 if (rem == buf) {       /* it wasn't a valid number at all */
 150                         buf[1] = 0;     /* return one character as token */
 151                         retc = (uschar)buf[0];  /* character is its own type */
 152                         unputstr(rem+1); /* put rest back for later */
 153                 } else {        /* some prefix was a number */
 154                         unputstr(rem);  /* put rest back for later */
 155                         rem[0] = 0;     /* truncate buf after number part */
 156                         retc = '0';     /* type is number */
 157                 }
 158         }
 159         *pbuf = buf;
 160         *psz = sz;
 161         return retc;
 162 }
 163
 164 int     word(char *);           /* classify a name: table entry, ARG, CALL or VAR */
 165 int     string(void);           /* lex a string constant; called by yylex() after the opening quote */
 166 int     regexpr(void);          /* lex a regular expression up to its closing '/' */
 167 bool    sc      = false;        /* true => return a } right now; set by yylex() when it turns '}' into ';' then '}' */
 168 bool    reg     = false;        /* true => return a REGEXPR now; set by startreg() */
 169
 170 int yylex(void)         /* return the next token; 0 at end of input */
 171 {
 172         int c;
 173         static char *buf = NULL;        /* token text; allocated on first call, may be regrown inside gettok() */
 174         static int bufsize = 5; /* BUG: setting this small causes core dump! */
 175
 176         if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
 177                 FATAL( "out of space in yylex" );
 178         if (sc) {               /* the previous call returned the ';' that precedes a '}' */
 179                 sc = false;
 180                 RET('}');
 181         }
 182         if (reg) {              /* startreg() was called: lex a regular expression instead */
 183                 reg = false;
 184                 return regexpr();
 185         }
 186         for (;;) {              /* repeats only after white space, comments and line continuations */
 187                 c = gettok(&buf, &bufsize);
 188                 if (c == 0)
 189                         return 0;
 190                 if (isalpha(c) || c == '_')     /* gettok() reports a name as 'a' */
 191                         return word(buf);
 192                 if (isdigit(c)) {               /* gettok() reports a number as '0' */
 193                         char *cp = tostring(buf);
 194                         double result;
 195
 196                         if (is_number(cp, & result))
 197                                 yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
 198                         else
 199                                 yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
 200                         free(cp);
 201                         /* should this also have STR set? */
 202                         RET(NUMBER);
 203                 }
 204
 205                 yylval.i = c;   /* default semantic value: the character itself */
 206                 switch (c) {
 207                 case '\n':      /* {EOL} */
 208                         lineno++;
 209                         RET(NL);
 210                 case '\r':      /* assume \n is coming */
 211                 case ' ':       /* {WS}+ */
 212                 case '\t':
 213                         break;
 214                 case '#':       /* #.* strip comments */
 215                         while ((c = input()) != '\n' && c != 0)
 216                                 ;
 217                         unput(c);       /* the newline (or 0) is lexed again on the next pass */
 218                         /*
 219                          * Next line is a hack, it compensates for
 220                          * unput's treatment of \n.
 221                          * NOTE(review): when the comment ends at end of
 222                          * input (c == 0) unput() does not decrement
 223                          * lineno, yet it is still incremented here.
 224                          */
 222                         lineno++;
 223                         break;
 224                 case ';':
 225                         RET(';');
 226                 case '\\':      /* backslash-newline joins lines; any other backslash is returned as is */
 227                         if (peek() == '\n') {
 228                                 input();
 229                                 lineno++;
 230                         } else if (peek() == '\r') {
 231                                 input(); input();       /* \n */
 232                                 lineno++;
 233                         } else {
 234                                 RET(c);
 235                         }
 236                         break;
 237                 case '&':       /* & or && */
 238                         if (peek() == '&') {
 239                                 input(); RET(AND);
 240                         } else
 241                                 RET('&');
 242                 case '|':       /* | or || */
 243                         if (peek() == '|') {
 244                                 input(); RET(BOR);
 245                         } else
 246                                 RET('|');
 247                 case '!':       /* != or !~ or ! */
 248                         if (peek() == '=') {
 249                                 input(); yylval.i = NE; RET(NE);
 250                         } else if (peek() == '~') {
 251                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
 252                         } else
 253                                 RET(NOT);
 254                 case '~':
 255                         yylval.i = MATCH;
 256                         RET(MATCHOP);
 257                 case '<':       /* <= or < */
 258                         if (peek() == '=') {
 259                                 input(); yylval.i = LE; RET(LE);
 260                         } else {
 261                                 yylval.i = LT; RET(LT);
 262                         }
 263                 case '=':       /* == or = */
 264                         if (peek() == '=') {
 265                                 input(); yylval.i = EQ; RET(EQ);
 266                         } else {
 267                                 yylval.i = ASSIGN; RET(ASGNOP);
 268                         }
 269                 case '>':       /* >= or >> or > */
 270                         if (peek() == '=') {
 271                                 input(); yylval.i = GE; RET(GE);
 272                         } else if (peek() == '>') {
 273                                 input(); yylval.i = APPEND; RET(APPEND);
 274                         } else {
 275                                 yylval.i = GT; RET(GT);
 276                         }
 277                 case '+':       /* ++ or += or + */
 278                         if (peek() == '+') {
 279                                 input(); yylval.i = INCR; RET(INCR);
 280                         } else if (peek() == '=') {
 281                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
 282                         } else
 283                                 RET('+');
 284                 case '-':       /* -- or -= or - */
 285                         if (peek() == '-') {
 286                                 input(); yylval.i = DECR; RET(DECR);
 287                         } else if (peek() == '=') {
 288                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
 289                         } else
 290                                 RET('-');
 291                 case '*':
 292                         if (peek() == '=') {    /* *= */
 293                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
 294                         } else if (peek() == '*') {     /* ** or **= */
 295                                 input();        /* eat 2nd * */
 296                                 if (peek() == '=') {
 297                                         input(); yylval.i = POWEQ; RET(ASGNOP);
 298                                 } else {
 299                                         RET(POWER);
 300                                 }
 301                         } else
 302                                 RET('*');
 303                 case '/':       /* always a plain '/' here; regular expressions are lexed only after startreg() */
 304                         RET('/');
 305                 case '%':       /* %= or % */
 306                         if (peek() == '=') {
 307                                 input(); yylval.i = MODEQ; RET(ASGNOP);
 308                         } else
 309                                 RET('%');
 310                 case '^':       /* ^= or ^ ; same tokens as **= and ** */
 311                         if (peek() == '=') {
 312                                 input(); yylval.i = POWEQ; RET(ASGNOP);
 313                         } else
 314                                 RET(POWER);
 315
 316                 case '$':
 317                         /* BUG: awkward, if not wrong */
 318                         c = gettok(&buf, &bufsize);     /* look at the token that follows the '$' */
 319                         if (isalpha(c)) {               /* a name ('a' from gettok) */
 320                                 if (strcmp(buf, "NF") == 0) {   /* very special */
 321                                         unputstr("(NF)");       /* $NF is re-lexed as $(NF) */
 322                                         RET(INDIRECT);
 323                                 }
 324                                 c = peek();
 325                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
 326                                         unputstr(buf);  /* call, subscript or function argument: lex the name again after INDIRECT */
 327                                         RET(INDIRECT);
 328                                 }
 329                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
 330                                 RET(IVAR);
 331                         } else if (c == 0) {    /*  */
 332                                 SYNTAX( "unexpected end of input after $" );
 333                                 RET(';');
 334                         } else {
 335                                 unputstr(buf);  /* anything else: push its text back and lex it after INDIRECT */
 336                                 RET(INDIRECT);
 337                         }
 338
 339                 case '}':
 340                         if (--bracecnt < 0)
 341                                 SYNTAX( "extra }" );
 342                         sc = true;      /* return ';' now, '}' on the next call */
 343                         RET(';');
 344                 case ']':
 345                         if (--brackcnt < 0)
 346                                 SYNTAX( "extra ]" );
 347                         RET(']');
 348                 case ')':
 349                         if (--parencnt < 0)
 350                                 SYNTAX( "extra )" );
 351                         RET(')');
 352                 case '{':
 353                         bracecnt++;
 354                         RET('{');
 355                 case '[':
 356                         brackcnt++;
 357                         RET('[');
 358                 case '(':
 359                         parencnt++;
 360                         RET('(');
 361
 362                 case '"':
 363                         return string();        /* BUG: should be like tran.c ? */
 364
 365                 default:        /* any other character is its own token */
 366                         RET(c);
 367                 }
 368         }
 369 }
 370
 371 int string(void)
 372 {
 373         int c, n;
 374         char *s, *bp;
 375         static char *buf = NULL;
 376         static int bufsz = 500;
 377
 378         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
 379                 FATAL("out of space for strings");
 380         for (bp = buf; (c = input()) != '"'; ) {
 381                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
 382                         FATAL("out of space for string %.10s...", buf);
 383                 switch (c) {
 384                 case '\n':
 385                 case '\r':
 386                 case 0:
 387                         *bp = '\0';
 388                         SYNTAX( "non-terminated string %.10s...", buf );
 389                         if (c == 0)     /* hopeless */
 390                                 FATAL( "giving up" );
 391                         lineno++;
 392                         break;
 393                 case '\\':
 394                         c = input();
 395                         switch (c) {
 396                         case '\n': break;
 397                         case '"': *bp++ = '"'; break;
 398                         case 'n': *bp++ = '\n'; break;
 399                         case 't': *bp++ = '\t'; break;
 400                         case 'f': *bp++ = '\f'; break;
 401                         case 'r': *bp++ = '\r'; break;
 402                         case 'b': *bp++ = '\b'; break;
 403                         case 'v': *bp++ = '\v'; break;
 404                         case 'a': *bp++ = '\a'; break;
 405                         case '\\': *bp++ = '\\'; break;
 406
 407                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
 408                         case '3': case '4': case '5': case '6': case '7':
 409                                 n = c - '0';
 410                                 if ((c = peek()) >= '0' && c < '8') {
 411                                         n = 8 * n + input() - '0';
 412                                         if ((c = peek()) >= '0' && c < '8')
 413                                                 n = 8 * n + input() - '0';
 414                                 }
 415                                 *bp++ = n;
 416                                 break;
 417
 418                         case 'x':       /* hex  \x0-9a-fA-F + */
 419                             {   char xbuf[100], *px;
 420                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
 421                                         if (isdigit(c)
 422                                          || (c >= 'a' && c <= 'f')
 423                                          || (c >= 'A' && c <= 'F'))
 424                                                 *px++ = c;
 425                                         else
 426                                                 break;
 427                                 }
 428                                 *px = 0;
 429                                 unput(c);
 430                                 sscanf(xbuf, "%x", (unsigned int *) &n);
 431                                 *bp++ = n;
 432                                 break;
 433                             }
 434
 435                         default:
 436                                 *bp++ = c;
 437                                 break;
 438                         }
 439                         break;
 440                 default:
 441                         *bp++ = c;
 442                         break;
 443                 }
 444         }
 445         *bp = 0;
 446         s = tostring(buf);
 447         *bp++ = ' '; *bp++ = '\0';
 448         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
 449         free(s);
 450         RET(STRING);
 451 }
 452
 453
 454 static int binsearch(char *w, const Keyword *kp, int n)
 455 {
 456         int cond, low, mid, high;
 457
 458         low = 0;
 459         high = n - 1;
 460         while (low <= high) {
 461                 mid = (low + high) / 2;
 462                 if ((cond = strcmp(w, kp[mid].word)) < 0)
 463                         high = mid - 1;
 464                 else if (cond > 0)
 465                         low = mid + 1;
 466                 else
 467                         return mid;
 468         }
 469         return -1;
 470 }
 471
 472 int word(char *w)
 473 {
 474         const Keyword *kp;
 475         int c, n;
 476
 477         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
 478         if (n != -1) {  /* found in table */
 479                 kp = keywords + n;
 480                 yylval.i = kp->sub;
 481                 switch (kp->type) {     /* special handling */
 482                 case BLTIN:
 483                         if (kp->sub == FSYSTEM && safe)
 484                                 SYNTAX( "system is unsafe" );
 485                         RET(kp->type);
 486                 case FUNC:
 487                         if (infunc)
 488                                 SYNTAX( "illegal nested function" );
 489                         RET(kp->type);
 490                 case RETURN:
 491                         if (!infunc)
 492                                 SYNTAX( "return not in function" );
 493                         RET(kp->type);
 494                 case VARNF:
 495                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
 496                         RET(VARNF);
 497                 default:
 498                         RET(kp->type);
 499                 }
 500         }
 501         c = peek();     /* look for '(' */
 502         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
 503                 yylval.i = n;
 504                 RET(ARG);
 505         } else {
 506                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
 507                 if (c == '(') {
 508                         RET(CALL);
 509                 } else {
 510                         RET(VAR);
 511                 }
 512         }
 513 }
 514
 515 void startreg(void)     /* next call to yylex will return a regular expression */
 516 {
 517         reg = true;
 518 }
 519
 520 int regexpr(void)
 521 {
 522         int c;
 523         static char *buf = NULL;
 524         static int bufsz = 500;
 525         char *bp;
 526
 527         if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
 528                 FATAL("out of space for reg expr");
 529         bp = buf;
 530         for ( ; (c = input()) != '/' && c != 0; ) {
 531                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
 532                         FATAL("out of space for reg expr %.10s...", buf);
 533                 if (c == '\n') {
 534                         *bp = '\0';
 535                         SYNTAX( "newline in regular expression %.10s...", buf );
 536                         unput('\n');
 537                         break;
 538                 } else if (c == '\\') {
 539                         *bp++ = '\\';
 540                         *bp++ = input();
 541                 } else {
 542                         *bp++ = c;
 543                 }
 544         }
 545         *bp = 0;
 546         if (c == 0)
 547                 SYNTAX("non-terminated regular expression %.10s...", buf);
 548         yylval.s = buf;
 549         unput('/');
 550         RET(REGEXPR);
 551 }
 552
 553 /* low-level lexical stuff, sort of inherited from lex */
 554
 555 char    ebuf[300];      /* record of recently read characters; written by input(), reused from the start when full */
 556 char    *ep = ebuf;     /* current position in ebuf: advanced by input() for non-zero characters, stepped back by unput() */
 557 char    yysbuf[100];    /* pushback buffer */
 558 char    *yysptr = yysbuf;       /* one past the most recently pushed-back character; equals yysbuf when nothing is pending */
 559 FILE    *yyin = NULL;   /* not referenced by the routines in this part of the file */
 560
 561 int input(void) /* get next lexical input character */
 562 {
 563         int c;
 564         extern char *lexprog;
 565
 566         if (yysptr > yysbuf)
 567                 c = (uschar)*--yysptr;
 568         else if (lexprog != NULL) {     /* awk '...' */
 569                 if ((c = (uschar)*lexprog) != 0)
 570                         lexprog++;
 571         } else                          /* awk -f ... */
 572                 c = pgetc();
 573         if (c == EOF)
 574                 c = 0;
 575         if (ep >= ebuf + sizeof ebuf)
 576                 ep = ebuf;
 577         *ep = c;
 578         if (c != 0) {
 579                 ep++;
 580         }
 581         return (c);
 582 }
 583
 584 void unput(int c)       /* put lexical character back on input */
 585 {
 586         if (c == '\n')
 587                 lineno--;
 588         if (yysptr >= yysbuf + sizeof(yysbuf))
 589                 FATAL("pushed back too much: %.20s...", yysbuf);
 590         *yysptr++ = c;
 591         if (--ep < ebuf)
 592                 ep = ebuf + sizeof(ebuf) - 1;
 593 }
 594
 /*
  * unputstr: push the whole string s back onto the input.  Characters
  * are handed to unput() last to first, so input() returns them again
  * in their original order.  An empty string pushes nothing.
  */
 void unputstr(const char *s)    /* put a string back on input */
 {
         size_t left;

         /* count down in size_t: no narrowing of strlen() and no reliance on wrapping to -1 */
         for (left = strlen(s); left > 0; left--)
                 unput(s[left-1]);
 }
 600                 unput(s[i]);
 601 }