usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  * @(#)lexi.c   8.1 (Berkeley) 6/6/93
  36  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
  37  */
  38
  39 /*
  40  * Here we have the token scanner for indent.  It scans off one token and puts
  41  * it in the global variable "token".  It returns a code, indicating the type
  42  * of token scanned.
  43  */
  44
  45 #include <err.h>
  46 #include <stdio.h>
  47 #include <ctype.h>
  48 #include <stdlib.h>
  49 #include <string.h>
  50 #include "indent_globs.h"
  51 #include "indent_codes.h"
  52 #include "indent.h"
  53
  54 #define alphanum 1      /* chartype[] value: character may be part of an identifier or number */
  55 #define opchar 3        /* chartype[] value: operator character (in this file only referenced under #ifdef undef) */
  56
  57 struct templ {
  58     const char *rwd;
  59     int         rwcode;
  60 };
  61
  62 struct templ specials[1000] =
  63 {
  64     {"switch", 1},
  65     {"case", 2},
  66     {"break", 0},
  67     {"struct", 3},
  68     {"union", 3},
  69     {"enum", 3},
  70     {"default", 2},
  71     {"int", 4},
  72     {"char", 4},
  73     {"float", 4},
  74     {"double", 4},
  75     {"long", 4},
  76     {"short", 4},
  77     {"typdef", 4},
  78     {"unsigned", 4},
  79     {"register", 4},
  80     {"static", 4},
  81     {"global", 4},
  82     {"extern", 4},
  83     {"void", 4},
  84     {"const", 4},
  85     {"volatile", 4},
  86     {"goto", 0},
  87     {"return", 0},
  88     {"if", 5},
  89     {"while", 5},
  90     {"for", 5},
  91     {"else", 6},
  92     {"do", 6},
  93     {"sizeof", 7},
  94     {0, 0}
  95 };
  96
  97 char        chartype[128] =
  98 {                               /* this is used to facilitate the decision of
  99                                  * what type (alphanumeric, operator) each
 100                                  * character is */
 101     0, 0, 0, 0, 0, 0, 0, 0,
 102     0, 0, 0, 0, 0, 0, 0, 0,
 103     0, 0, 0, 0, 0, 0, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0,
 105     0, 3, 0, 0, 1, 3, 3, 0,
 106     0, 0, 3, 3, 0, 3, 0, 3,
 107     1, 1, 1, 1, 1, 1, 1, 1,
 108     1, 1, 0, 0, 3, 3, 3, 3,
 109     0, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1,
 111     1, 1, 1, 1, 1, 1, 1, 1,
 112     1, 1, 1, 0, 0, 0, 3, 1,
 113     0, 1, 1, 1, 1, 1, 1, 1,
 114     1, 1, 1, 1, 1, 1, 1, 1,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 0, 3, 0, 3, 0
 117 };
 118
 119 int
 120 lexi(void)
 121 {
         /*
          * Scan one token starting at buf_ptr, store its text starting at
          * s_token (e_token is advanced past it) and return the token's code.
          * Leading blanks and tabs are skipped first.  Side effects visible
          * below: ps.col_1, ps.last_nl and ps.last_u_d are updated on every
          * call; ps.its_a_keyword and ps.sizeof_keyword are reset for each
          * word token.  last_code and l_struct are static, so they carry state
          * from one call to the next.
          */
 122     int         unary_delim;    /* this is set to 1 if the current token
 123                                  * forces a following operator to be unary */
 124     static int  last_code;      /* the last token type returned */
 125     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 126     int         code;           /* internal code to be returned */
 127     char        qchar;          /* the delimiter character for a string */
 128
 129     e_token = s_token;          /* point to start of place to save token */
 130     unary_delim = false;
 131     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 132                                  * column 1 iff the last thing scanned was nl */
 133     ps.last_nl = false;
 134
 135     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 136         ps.col_1 = false;       /* leading blanks imply token is not in column
 137                                  * 1 */
 138         if (++buf_ptr >= buf_end)
 139             fill_buffer();
 140     }
 141
 142     /* Scan an alphanumeric token */
         /*
          * NOTE(review): chartype[] has 128 entries and is indexed with
          * (int)*buf_ptr.  If buf_ptr points to plain char and char is signed,
          * a byte >= 0x80 yields a negative index.  The isdigit(), isxdigit()
          * and isalpha() calls in this function are likewise handed the
          * character without a cast to unsigned char.  Confirm the input is
          * restricted to 7-bit characters.
          */
 143     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 144         /*
 145          * we have a character or number
 146          */
 147         const char *j;          /* used for searching thru list of
 148                                  *
 149                                  * reserved words */
 150         struct templ *p;
 151
 152         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
                 /*
                  * Numeric constant.  seendot/seenexp record that a '.' or an
                  * exponent has been consumed; seensfx is a bit set: bit 0 for
                  * a U/u suffix, bit 1 for an L/l suffix.
                  */
 153             int         seendot = 0,
 154                         seenexp = 0,
 155                         seensfx = 0;
 156             if (*buf_ptr == '0' &&
 157                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
                     /*
                      * Hexadecimal: copy the two-character prefix, then every
                      * following hex digit.  NOTE(review): the prefix bytes
                      * are stored without a preceding CHECK_SIZE_TOKEN.
                      */
 158                 *e_token++ = *buf_ptr++;
 159                 *e_token++ = *buf_ptr++;
 160                 while (isxdigit(*buf_ptr)) {
 161                     CHECK_SIZE_TOKEN;
 162                     *e_token++ = *buf_ptr++;
 163                 }
 164             }
 165             else
                     /*
                      * Decimal or floating constant: copy digits, accepting at
                      * most one '.' and at most one exponent marker (E/e)
                      * with an optional sign after it.  Consuming the exponent
                      * also bumps seendot, so a '.' that follows ends the
                      * number.
                      */
 166                 while (1) {
 167                     if (*buf_ptr == '.') {
 168                         if (seendot)
 169                             break;
 170                         else
 171                             seendot++;
 172                     }
 173                     CHECK_SIZE_TOKEN;
 174                     *e_token++ = *buf_ptr++;
 175                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 176                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 177                             break;
 178                         else {
 179                             seenexp++;
 180                             seendot++;
 181                             CHECK_SIZE_TOKEN;
 182                             *e_token++ = *buf_ptr++;
 183                             if (*buf_ptr == '+' || *buf_ptr == '-')
 184                                 *e_token++ = *buf_ptr++;
 185                         }
 186                     }
 187                 }
                 /*
                  * Suffixes: at most one U/u and at most one L/l group, in
                  * either order.  A doubled L of the same case (LL or ll) is
                  * taken as a single group.
                  */
 188             while (1) {
 189                 if (!(seensfx & 1) &&
 190                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 191                     CHECK_SIZE_TOKEN;
 192                     *e_token++ = *buf_ptr++;
 193                     seensfx |= 1;
 194                     continue;
 195                 }
 196                 if (!(seensfx & 2) &&
 197                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 198                     CHECK_SIZE_TOKEN;
 199                     if (buf_ptr[1] == buf_ptr[0])
 200                         *e_token++ = *buf_ptr++;
 201                     *e_token++ = *buf_ptr++;
 202                     seensfx |= 2;
 203                     continue;
 204                 }
 205                 break;
 206             }
 207         }
 208         else
                 /*
                  * Identifier or keyword: copy characters of class alphanum.
                  * A backslash directly followed by a newline is skipped so
                  * the word continues on the next line; any other backslash
                  * ends the word.  NOTE(review): after such a skip the next
                  * character is copied without being re-tested against
                  * chartype[].
                  */
 209             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 210                 /* fill_buffer() terminates buffer with newline */
 211                 if (*buf_ptr == BACKSLASH) {
 212                     if (*(buf_ptr + 1) == '\n') {
 213                         buf_ptr += 2;
 214                         if (buf_ptr >= buf_end)
 215                             fill_buffer();
 216                         } else
 217                             break;
 218                 }
 219                 CHECK_SIZE_TOKEN;
 220                 /* copy it over */
 221                 *e_token++ = *buf_ptr++;
 222                 if (buf_ptr >= buf_end)
 223                     fill_buffer();
 224             }
             /* terminate the word, then look past any blanks that follow it */
 225         *e_token++ = '\0';
 226         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 227             if (++buf_ptr >= buf_end)
 228                 fill_buffer();
 229         }
 230         ps.its_a_keyword = false;
 231         ps.sizeof_keyword = false;
 232         if (l_struct && !ps.p_l_follow) {
 233                                 /* if last token was 'struct' and we're not
 234                                  * in parentheses, then this token
 235                                  * should be treated as a declaration */
 236             l_struct = false;
 237             last_code = ident;
 238             ps.last_u_d = true;
 239             return (decl);
 240         }
 241         ps.last_u_d = l_struct; /* Operator after identifier is binary
 242                                  * unless last token was 'struct' */
 243         l_struct = false;
 244         last_code = ident;      /* Remember that this is the code we will
 245                                  * return */
 246
             /*
              * With auto_typedefs set, any word longer than two characters
              * that ends in "_t" is treated as a type name: control jumps to
              * the found_auto_typedef label inside the switch below.
              */
 247         if (auto_typedefs) {
 248             const char *q = s_token;
 249             size_t q_len = strlen(q);
 250             /* Check if we have an "_t" in the end */
 251             if (q_len > 2 &&
 252                 (strcmp(q + q_len - 2, "_t") == 0)) {
 253                 ps.its_a_keyword = true;
 254                 ps.last_u_d = true;
 255                 goto found_auto_typedef;
 256             }
 257         }
 258
 259         /*
 260          * This loop will check if the token is a keyword.
 261          */
 262         for (p = specials; (j = p->rwd) != NULL; p++) {
 263             const char *q = s_token;    /* point at scanned token */
 264             if (*j++ != *q++ || *j++ != *q++)
 265                 continue;       /* This test depends on the fact that
 266                                  * identifiers are always at least 1 character
 267                                  * long (ie. the first two bytes of the
 268                                  * identifier are always meaningful) */
 269             if (q[-1] == 0)
 270                 break;          /* If it's a one-character identifier */
                 /* compare the remaining bytes up to and including the NUL */
 271             while (*q++ == *j)
 272                 if (*j++ == 0)
 273                     goto found_keyword; /* I wish that C had a multi-level
 274                                          * break... */
 275         }
             /* p->rwd is non-NULL here only if the loop was left by the break */
 276         if (p->rwd) {           /* we have a keyword */
 277     found_keyword:
 278             ps.its_a_keyword = true;
 279             ps.last_u_d = true;
 280             switch (p->rwcode) {
 281             case 1:             /* it is a switch */
 282                 return (swstmt);
 283             case 2:             /* a case or default */
 284                 return (casestmt);
 285
 286             case 3:             /* a "struct" */
 287                 /*
 288                  * Next time around, we will want to know that we have had a
 289                  * 'struct'
 290                  */
 291                 l_struct = true;
 292                 /* FALLTHROUGH */
 293
 294             case 4:             /* one of the declaration keywords */
 295             found_auto_typedef:
                     /*
                      * Inside parentheses the bit for the current paren level
                      * is set in ps.cast_mask unless that bit is set in
                      * ps.sizeof_mask.  The break leaves the switch, so the
                      * token continues through the checks that follow it.
                      */
 296                 if (ps.p_l_follow) {
 297                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 298                     break;      /* inside parens: cast, param list or sizeof */
 299                 }
 300                 last_code = decl;
 301                 return (decl);
 302
 303             case 5:             /* if, while, for */
 304                 return (sp_paren);
 305
 306             case 6:             /* do, else */
 307                 return (sp_nparen);
 308
 309             case 7:
 310                 ps.sizeof_keyword = true;
                     /* FALLTHROUGH */
 311             default:            /* all others are treated like any other
 312                                  * identifier */
 313                 return (ident);
 314             }                   /* end of switch */
 315         }                       /* end of if (found_it) */
             /*
              * A word followed by '(' while ps.tos <= 1 and ps.ind_level == 0
              * may name a function being defined.  The rest of the buffer is
              * searched for a ')' directly followed by ';' or ','; if one is
              * found the word is not recorded.  Otherwise it is copied to
              * ps.procname and parameter-declaration state is entered.
              * NOTE(review): *tp is read after the increment, so the last
              * iteration can look at the byte at buf_end.  strncpy() copies at
              * most sizeof ps.procname - 1 bytes and adds no terminator of its
              * own when the token is that long; presumably the final byte of
              * ps.procname is already NUL -- confirm.
              */
 316         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 317             char *tp = buf_ptr;
 318             while (tp < buf_end)
 319                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 320                     goto not_proc;
 321             strncpy(ps.procname, token, sizeof ps.procname - 1);
 322             ps.in_parameter_declaration = 1;
 323             rparen_count = 1;
 324     not_proc:;
 325         }
 326         /*
 327          * The following hack attempts to guess whether or not the current
 328          * token is in fact a declaration keyword -- one that has been
 329          * typedef'd
 330          */
 331         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 332                 && !ps.p_l_follow
 333                 && !ps.block_init
 334                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 335                     ps.last_token == decl ||
 336                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 337             ps.its_a_keyword = true;
 338             ps.last_u_d = true;
 339             last_code = decl;
 340             return decl;
 341         }
 342         if (last_code == decl)  /* if this is a declared variable, then
 343                                  * following sign is unary */
 344             ps.last_u_d = true; /* will make "int a -1" work */
 345         last_code = ident;
 346         return (ident);         /* the ident is not in the list */
 347     }                           /* end of processing for alphanum character */
 348
 349     /* Scan a non-alphanumeric token */
 350
 351     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 352                                  * moved here */
 353     *e_token = '\0';
 354     if (++buf_ptr >= buf_end)
 355         fill_buffer();
 356
         /* dispatch on the character just stored; buf_ptr is already past it */
 357     switch (*token) {
 358     case '\n':
 359         unary_delim = ps.last_u_d;
 360         ps.last_nl = true;      /* remember that we just had a newline */
 361         code = (had_eof ? 0 : newline);
 362
 363         /*
 364          * if data has been exhausted, the newline is a dummy, and we should
 365          * return code to stop
 366          */
 367         break;
 368
 369     case '\'':                  /* start of quoted character */
 370     case '"':                   /* start of string */
 371         qchar = *token;
             /*
              * In troff mode the opening quote already stored is replaced by
              * a backquote (two of them for a string) and chfont() is called;
              * its return value becomes the new e_token.  Presumably chfont()
              * writes a font-change sequence into the token -- confirm.
              */
 372         if (troff) {
 373             e_token[-1] = '`';
 374             if (qchar == '"')
 375                 *e_token++ = '`';
 376             e_token = chfont(&bodyf, &stringf, e_token);
 377         }
             /*
              * The inner loop stores characters at *e_token; after a backslash
              * it also copies the character that follows and keeps going.  It
              * stops once a non-backslash character has been stored, leaving
              * e_token pointing at it.  The outer condition then steps past
              * that character and repeats until it was the closing delimiter.
              * A newline that is not preceded by a backslash ends the literal
              * with a diagnostic.
              */
 378         do {                    /* copy the string */
 379             while (1) {         /* move one character or [/<char>]<char> */
 380                 if (*buf_ptr == '\n') {
 381                     diag2(1, "Unterminated literal");
 382                     goto stop_lit;
 383                 }
 384                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 385                                          * since CHECK_SIZE guarantees that there
 386                                          * are at least 5 entries left */
 387                 *e_token = *buf_ptr++;
 388                 if (buf_ptr >= buf_end)
 389                     fill_buffer();
 390                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 391                     if (*buf_ptr == '\n')       /* check for escaped newline */
 392                         ++line_no;
 393                     if (troff) {
 394                         *++e_token = BACKSLASH;
 395                         if (*buf_ptr == BACKSLASH)
 396                             *++e_token = BACKSLASH;
 397                     }
 398                     *++e_token = *buf_ptr++;
 399                     ++e_token;  /* we must increment this again because we
 400                                  * copied two chars */
 401                     if (buf_ptr >= buf_end)
 402                         fill_buffer();
 403                 }
 404                 else
 405                     break;      /* we copied one character */
 406             }                   /* end of while (1) */
 407         } while (*e_token++ != qchar);
             /*
              * troff mode: chfont() is called at the position of the closing
              * quote (e_token - 1), and a string gets a trailing apostrophe.
              */
 408         if (troff) {
 409             e_token = chfont(&stringf, &bodyf, e_token - 1);
 410             if (qchar == '"')
 411                 *e_token++ = '\'';
 412         }
 413 stop_lit:
             /* character and string literals are reported as ident */
 414         code = ident;
 415         break;
 416
 417     case ('('):
 418     case ('['):
 419         unary_delim = true;
 420         code = lparen;
 421         break;
 422
 423     case (')'):
 424     case (']'):
 425         code = rparen;
 426         break;
 427
 428     case '#':
 429         unary_delim = ps.last_u_d;
 430         code = preesc;
 431         break;
 432
 433     case '?':
 434         unary_delim = true;
 435         code = question;
 436         break;
 437
 438     case (':'):
 439         code = colon;
 440         unary_delim = true;
 441         break;
 442
 443     case (';'):
 444         unary_delim = true;
 445         code = semicolon;
 446         break;
 447
 448     case ('{'):
 449         unary_delim = true;
 450
 451         /*
 452          * if (ps.in_or_st) ps.block_init = 1;
 453          */
 454         /* ?    code = ps.block_init ? lparen : lbrace; */
 455         code = lbrace;
 456         break;
 457
 458     case ('}'):
 459         unary_delim = true;
 460         /* ?    code = ps.block_init ? rparen : rbrace; */
 461         code = rbrace;
 462         break;
 463
 464     case 014:                   /* a form feed */
 465         unary_delim = ps.last_u_d;
 466         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 467                                  * right */
 468         code = form_feed;
 469         break;
 470
 471     case (','):
 472         unary_delim = true;
 473         code = comma;
 474         break;
 475
 476     case '.':
 477         unary_delim = false;
 478         code = period;
 479         break;
 480
 481     case '-':
 482     case '+':                   /* check for -, +, --, ++ */
 483         code = (ps.last_u_d ? unary_op : binary_op);
 484         unary_delim = true;
 485
 486         if (*buf_ptr == token[0]) {
 487             /* check for doubled character */
 488             *e_token++ = *buf_ptr++;
 489             /* buffer overflow will be checked at end of loop */
                 /*
                  * after an identifier or a closing paren/bracket the doubled
                  * operator is a postfix op, unless ps.last_u_d says a unary
                  * operator is expected here
                  */
 490             if (last_code == ident || last_code == rparen) {
 491                 code = (ps.last_u_d ? unary_op : postop);
 492                 /* check for following ++ or -- */
 493                 unary_delim = false;
 494             }
 495         }
 496         else if (*buf_ptr == '=')
 497             /* check for operator += */
 498             *e_token++ = *buf_ptr++;
 499         else if (*buf_ptr == '>') {
 500             /* check for operator -> */
                 /* NOTE(review): this branch is also reached for "+>" */
 501             *e_token++ = *buf_ptr++;
 502             if (!pointer_as_binop) {
 503                 unary_delim = false;
 504                 code = unary_op;
 505                 ps.want_blank = false;
 506             }
 507         }
 508         break;                  /* buffer overflow will be checked at end of
 509                                  * switch */
 510
 511     case '=':
 512         if (ps.in_or_st)
 513             ps.block_init = 1;
             /*
              * The first branch below is compiled only if a macro named undef
              * is defined; otherwise only "==" is recognised as a
              * two-character token here.
              */
 514 #ifdef undef
 515         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 516             e_token[-1] = *buf_ptr++;
 517             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 518                 *e_token++ = *buf_ptr++;
 519             *e_token++ = '=';   /* Flip =+ to += */
 520             *e_token = 0;
 521         }
 522 #else
 523         if (*buf_ptr == '=') {/* == */
 524             *e_token++ = '=';   /* append the second '=' of "==" */
 525             buf_ptr++;
 526             *e_token = 0;
 527         }
 528 #endif
 529         code = binary_op;
 530         unary_delim = true;
 531         break;
 532         /* not reached: the break above always leaves the switch */
 533
 534     case '>':
 535     case '<':
 536     case '!':                   /* ops like <, <<, <=, !=, etc */
             /* optional second character, then an optional trailing '=' */
 537         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 538             *e_token++ = *buf_ptr;
 539             if (++buf_ptr >= buf_end)
 540                 fill_buffer();
 541         }
 542         if (*buf_ptr == '=')
 543             *e_token++ = *buf_ptr++;
 544         code = (ps.last_u_d ? unary_op : binary_op);
 545         unary_delim = true;
 546         break;
 547
 548     default:
 549         if (token[0] == '/' && *buf_ptr == '*') {
 550             /* it is start of comment */
 551             *e_token++ = '*';
 552
 553             if (++buf_ptr >= buf_end)
 554                 fill_buffer();
 555
 556             code = comment;
 557             unary_delim = ps.last_u_d;
 558             break;
 559         }
             /*
              * Any other character: keep appending while the next input
              * character repeats the one just stored or is '='.
              * NOTE(review): no CHECK_SIZE_TOKEN is done inside this loop.
              */
 560         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 561             /*
 562              * handle ||, &&, etc, and also things as in int *****i
 563              */
 564             *e_token++ = *buf_ptr;
 565             if (++buf_ptr >= buf_end)
 566                 fill_buffer();
 567         }
 568         code = (ps.last_u_d ? unary_op : binary_op);
 569         unary_delim = true;
 570
 571
 572     }                           /* end of switch */
         /* a newline token leaves l_struct and last_code as they were */
 573     if (code != newline) {
 574         l_struct = false;
 575         last_code = code;
 576     }
 577     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 578         fill_buffer();
 579     ps.last_u_d = unary_delim;
 580     *e_token = '\0';            /* null terminate the token */
 581     return (code);
 582 }
 583
 584 /*
 585  * Add the given keyword to the keyword table, using val as the keyword type
 586  */
 587 void
 588 addkey(char *key, int val)
 589 {
 590     struct templ *p = specials;
 591     while (p->rwd)
 592         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 593             return;
 594         else
 595             p++;
 596     if (p >= specials + sizeof specials / sizeof specials[0])
 597         return;                 /* For now, table overflows are silently
 598                                  * ignored */
 599     p->rwd = key;
 600     p->rwcode = val;
 601     p[1].rwd = NULL;
 602     p[1].rwcode = 0;
 603 }