usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  * @(#)lexi.c   8.1 (Berkeley) 6/6/93
  32  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
  33  */
  34
  35 /*
  36  * Here we have the token scanner for indent.  It scans off one token and puts
  37  * it in the global variable "token".  It returns a code, indicating the type
  38  * of token scanned.
  39  */
  40
  41 #include <err.h>
  42 #include <stdio.h>
  43 #include <ctype.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46 #include "indent_globs.h"
  47 #include "indent_codes.h"
  48 #include "indent.h"
  49
  50 #define alphanum 1
  51 #define opchar 3
  52
  53 struct templ {
  54     const char *rwd;
  55     int         rwcode;
  56 };
  57
  58 struct templ specials[1000] =
  59 {
  60     {"switch", 1},
  61     {"case", 2},
  62     {"break", 0},
  63     {"struct", 3},
  64     {"union", 3},
  65     {"enum", 3},
  66     {"default", 2},
  67     {"int", 4},
  68     {"char", 4},
  69     {"float", 4},
  70     {"double", 4},
  71     {"long", 4},
  72     {"short", 4},
  73     {"typdef", 4},
  74     {"unsigned", 4},
  75     {"register", 4},
  76     {"static", 4},
  77     {"global", 4},
  78     {"extern", 4},
  79     {"void", 4},
  80     {"const", 4},
  81     {"volatile", 4},
  82     {"goto", 0},
  83     {"return", 0},
  84     {"if", 5},
  85     {"while", 5},
  86     {"for", 5},
  87     {"else", 6},
  88     {"do", 6},
  89     {"sizeof", 7},
  90     {0, 0}
  91 };
  92
  93 char        chartype[128] =
  94 {                               /* this is used to facilitate the decision of
  95                                  * what type (alphanumeric, operator) each
  96                                  * character is */
  97     0, 0, 0, 0, 0, 0, 0, 0,
  98     0, 0, 0, 0, 0, 0, 0, 0,
  99     0, 0, 0, 0, 0, 0, 0, 0,
 100     0, 0, 0, 0, 0, 0, 0, 0,
 101     0, 3, 0, 0, 1, 3, 3, 0,
 102     0, 0, 3, 3, 0, 3, 0, 3,
 103     1, 1, 1, 1, 1, 1, 1, 1,
 104     1, 1, 0, 0, 3, 3, 3, 3,
 105     0, 1, 1, 1, 1, 1, 1, 1,
 106     1, 1, 1, 1, 1, 1, 1, 1,
 107     1, 1, 1, 1, 1, 1, 1, 1,
 108     1, 1, 1, 0, 0, 0, 3, 1,
 109     0, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1,
 111     1, 1, 1, 1, 1, 1, 1, 1,
 112     1, 1, 1, 0, 3, 0, 3, 0
 113 };
 114
 115 int
 116 lexi(void)
 117 {
 118     int         unary_delim;    /* this is set to 1 if the current token
 119                                  * forces a following operator to be unary */
 120     static int  last_code;      /* the last token type returned */
 121     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 122     int         code;           /* internal code to be returned */
 123     char        qchar;          /* the delimiter character for a string */
 124
 125     e_token = s_token;          /* point to start of place to save token */
 126     unary_delim = false;
 127     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 128                                  * column 1 iff the last thing scanned was nl */
 129     ps.last_nl = false;
 130
 131     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 132         ps.col_1 = false;       /* leading blanks imply token is not in column
 133                                  * 1 */
 134         if (++buf_ptr >= buf_end)
 135             fill_buffer();
 136     }
 137
 138     /* Scan an alphanumeric token */
 139     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 140         /*
 141          * we have a character or number
 142          */
 143         const char *j;          /* used for searching thru list of
 144                                  *
 145                                  * reserved words */
 146         struct templ *p;
 147
 148         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149             int         seendot = 0,
 150                         seenexp = 0,
 151                         seensfx = 0;
 152             if (*buf_ptr == '0' &&
 153                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 154                 *e_token++ = *buf_ptr++;
 155                 *e_token++ = *buf_ptr++;
 156                 while (isxdigit(*buf_ptr)) {
 157                     CHECK_SIZE_TOKEN;
 158                     *e_token++ = *buf_ptr++;
 159                 }
 160             }
 161             else
 162                 while (1) {
 163                     if (*buf_ptr == '.') {
 164                         if (seendot)
 165                             break;
 166                         else
 167                             seendot++;
 168                     }
 169                     CHECK_SIZE_TOKEN;
 170                     *e_token++ = *buf_ptr++;
 171                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 172                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 173                             break;
 174                         else {
 175                             seenexp++;
 176                             seendot++;
 177                             CHECK_SIZE_TOKEN;
 178                             *e_token++ = *buf_ptr++;
 179                             if (*buf_ptr == '+' || *buf_ptr == '-')
 180                                 *e_token++ = *buf_ptr++;
 181                         }
 182                     }
 183                 }
 184             while (1) {
 185                 if (!(seensfx & 1) &&
 186                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 187                     CHECK_SIZE_TOKEN;
 188                     *e_token++ = *buf_ptr++;
 189                     seensfx |= 1;
 190                     continue;
 191                 }
 192                 if (!(seensfx & 2) &&
 193                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 194                     CHECK_SIZE_TOKEN;
 195                     if (buf_ptr[1] == buf_ptr[0])
 196                         *e_token++ = *buf_ptr++;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 2;
 199                     continue;
 200                 }
 201                 break;
 202             }
 203         }
 204         else
 205             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 206                 /* fill_buffer() terminates buffer with newline */
 207                 if (*buf_ptr == BACKSLASH) {
 208                     if (*(buf_ptr + 1) == '\n') {
 209                         buf_ptr += 2;
 210                         if (buf_ptr >= buf_end)
 211                             fill_buffer();
 212                         } else
 213                             break;
 214                 }
 215                 CHECK_SIZE_TOKEN;
 216                 /* copy it over */
 217                 *e_token++ = *buf_ptr++;
 218                 if (buf_ptr >= buf_end)
 219                     fill_buffer();
 220             }
 221         *e_token++ = '\0';
 222         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 223             if (++buf_ptr >= buf_end)
 224                 fill_buffer();
 225         }
 226         ps.its_a_keyword = false;
 227         ps.sizeof_keyword = false;
 228         if (l_struct && !ps.p_l_follow) {
 229                                 /* if last token was 'struct' and we're not
 230                                  * in parentheses, then this token
 231                                  * should be treated as a declaration */
 232             l_struct = false;
 233             last_code = ident;
 234             ps.last_u_d = true;
 235             return (decl);
 236         }
 237         ps.last_u_d = l_struct; /* Operator after identifier is binary
 238                                  * unless last token was 'struct' */
 239         l_struct = false;
 240         last_code = ident;      /* Remember that this is the code we will
 241                                  * return */
 242
 243         if (auto_typedefs) {
 244             const char *q = s_token;
 245             size_t q_len = strlen(q);
 246             /* Check if we have an "_t" in the end */
 247             if (q_len > 2 &&
 248                 (strcmp(q + q_len - 2, "_t") == 0)) {
 249                 ps.its_a_keyword = true;
 250                 ps.last_u_d = true;
 251                 goto found_auto_typedef;
 252             }
 253         }
 254
 255         /*
 256          * This loop will check if the token is a keyword.
 257          */
 258         for (p = specials; (j = p->rwd) != NULL; p++) {
 259             const char *q = s_token;    /* point at scanned token */
 260             if (*j++ != *q++ || *j++ != *q++)
 261                 continue;       /* This test depends on the fact that
 262                                  * identifiers are always at least 1 character
 263                                  * long (ie. the first two bytes of the
 264                                  * identifier are always meaningful) */
 265             if (q[-1] == 0)
 266                 break;          /* If its a one-character identifier */
 267             while (*q++ == *j)
 268                 if (*j++ == 0)
 269                     goto found_keyword; /* I wish that C had a multi-level
 270                                          * break... */
 271         }
 272         if (p->rwd) {           /* we have a keyword */
 273     found_keyword:
 274             ps.its_a_keyword = true;
 275             ps.last_u_d = true;
 276             switch (p->rwcode) {
 277             case 1:             /* it is a switch */
 278                 return (swstmt);
 279             case 2:             /* a case or default */
 280                 return (casestmt);
 281
 282             case 3:             /* a "struct" */
 283                 /*
 284                  * Next time around, we will want to know that we have had a
 285                  * 'struct'
 286                  */
 287                 l_struct = true;
 288                 /* FALLTHROUGH */
 289
 290             case 4:             /* one of the declaration keywords */
 291             found_auto_typedef:
 292                 if (ps.p_l_follow) {
 293                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 294                     break;      /* inside parens: cast, param list or sizeof */
 295                 }
 296                 last_code = decl;
 297                 return (decl);
 298
 299             case 5:             /* if, while, for */
 300                 return (sp_paren);
 301
 302             case 6:             /* do, else */
 303                 return (sp_nparen);
 304
 305             case 7:
 306                 ps.sizeof_keyword = true;
 307                 /* FALLTHROUGH */
 308             default:            /* all others are treated like any other
 309                                  * identifier */
 310                 return (ident);
 311             }                   /* end of switch */
 312         }                       /* end of if (found_it) */
 313         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 314             char *tp = buf_ptr;
 315             while (tp < buf_end)
 316                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 317                     goto not_proc;
 318             strncpy(ps.procname, token, sizeof ps.procname - 1);
 319             ps.in_parameter_declaration = 1;
 320             rparen_count = 1;
 321     not_proc:;
 322         }
 323         /*
 324          * The following hack attempts to guess whether or not the current
 325          * token is in fact a declaration keyword -- one that has been
 326          * typedefd
 327          */
 328         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 329                 && !ps.p_l_follow
 330                 && !ps.block_init
 331                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 332                     ps.last_token == decl ||
 333                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 334             ps.its_a_keyword = true;
 335             ps.last_u_d = true;
 336             last_code = decl;
 337             return decl;
 338         }
 339         if (last_code == decl)  /* if this is a declared variable, then
 340                                  * following sign is unary */
 341             ps.last_u_d = true; /* will make "int a -1" work */
 342         last_code = ident;
 343         return (ident);         /* the ident is not in the list */
 344     }                           /* end of procesing for alpanum character */
 345
 346     /* Scan a non-alphanumeric token */
 347
 348     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 349                                  * moved here */
 350     *e_token = '\0';
 351     if (++buf_ptr >= buf_end)
 352         fill_buffer();
 353
 354     switch (*token) {
 355     case '\n':
 356         unary_delim = ps.last_u_d;
 357         ps.last_nl = true;      /* remember that we just had a newline */
 358         code = (had_eof ? 0 : newline);
 359
 360         /*
 361          * if data has been exhausted, the newline is a dummy, and we should
 362          * return code to stop
 363          */
 364         break;
 365
 366     case '\'':                  /* start of quoted character */
 367     case '"':                   /* start of string */
 368         qchar = *token;
 369         if (troff) {
 370             e_token[-1] = '`';
 371             if (qchar == '"')
 372                 *e_token++ = '`';
 373             e_token = chfont(&bodyf, &stringf, e_token);
 374         }
 375         do {                    /* copy the string */
 376             while (1) {         /* move one character or [/<char>]<char> */
 377                 if (*buf_ptr == '\n') {
 378                     diag2(1, "Unterminated literal");
 379                     goto stop_lit;
 380                 }
 381                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 382                                          * since CHECK_SIZE guarantees that there
 383                                          * are at least 5 entries left */
 384                 *e_token = *buf_ptr++;
 385                 if (buf_ptr >= buf_end)
 386                     fill_buffer();
 387                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 388                     if (*buf_ptr == '\n')       /* check for escaped newline */
 389                         ++line_no;
 390                     if (troff) {
 391                         *++e_token = BACKSLASH;
 392                         if (*buf_ptr == BACKSLASH)
 393                             *++e_token = BACKSLASH;
 394                     }
 395                     *++e_token = *buf_ptr++;
 396                     ++e_token;  /* we must increment this again because we
 397                                  * copied two chars */
 398                     if (buf_ptr >= buf_end)
 399                         fill_buffer();
 400                 }
 401                 else
 402                     break;      /* we copied one character */
 403             }                   /* end of while (1) */
 404         } while (*e_token++ != qchar);
 405         if (troff) {
 406             e_token = chfont(&stringf, &bodyf, e_token - 1);
 407             if (qchar == '"')
 408                 *e_token++ = '\'';
 409         }
 410 stop_lit:
 411         code = ident;
 412         break;
 413
 414     case ('('):
 415     case ('['):
 416         unary_delim = true;
 417         code = lparen;
 418         break;
 419
 420     case (')'):
 421     case (']'):
 422         code = rparen;
 423         break;
 424
 425     case '#':
 426         unary_delim = ps.last_u_d;
 427         code = preesc;
 428         break;
 429
 430     case '?':
 431         unary_delim = true;
 432         code = question;
 433         break;
 434
 435     case (':'):
 436         code = colon;
 437         unary_delim = true;
 438         break;
 439
 440     case (';'):
 441         unary_delim = true;
 442         code = semicolon;
 443         break;
 444
 445     case ('{'):
 446         unary_delim = true;
 447
 448         /*
 449          * if (ps.in_or_st) ps.block_init = 1;
 450          */
 451         /* ?    code = ps.block_init ? lparen : lbrace; */
 452         code = lbrace;
 453         break;
 454
 455     case ('}'):
 456         unary_delim = true;
 457         /* ?    code = ps.block_init ? rparen : rbrace; */
 458         code = rbrace;
 459         break;
 460
 461     case 014:                   /* a form feed */
 462         unary_delim = ps.last_u_d;
 463         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 464                                  * right */
 465         code = form_feed;
 466         break;
 467
 468     case (','):
 469         unary_delim = true;
 470         code = comma;
 471         break;
 472
 473     case '.':
 474         unary_delim = false;
 475         code = period;
 476         break;
 477
 478     case '-':
 479     case '+':                   /* check for -, +, --, ++ */
 480         code = (ps.last_u_d ? unary_op : binary_op);
 481         unary_delim = true;
 482
 483         if (*buf_ptr == token[0]) {
 484             /* check for doubled character */
 485             *e_token++ = *buf_ptr++;
 486             /* buffer overflow will be checked at end of loop */
 487             if (last_code == ident || last_code == rparen) {
 488                 code = (ps.last_u_d ? unary_op : postop);
 489                 /* check for following ++ or -- */
 490                 unary_delim = false;
 491             }
 492         }
 493         else if (*buf_ptr == '=')
 494             /* check for operator += */
 495             *e_token++ = *buf_ptr++;
 496         else if (*buf_ptr == '>') {
 497             /* check for operator -> */
 498             *e_token++ = *buf_ptr++;
 499             if (!pointer_as_binop) {
 500                 unary_delim = false;
 501                 code = unary_op;
 502                 ps.want_blank = false;
 503             }
 504         }
 505         break;                  /* buffer overflow will be checked at end of
 506                                  * switch */
 507
 508     case '=':
 509         if (ps.in_or_st)
 510             ps.block_init = 1;
 511 #ifdef undef
 512         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 513             e_token[-1] = *buf_ptr++;
 514             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 515                 *e_token++ = *buf_ptr++;
 516             *e_token++ = '=';   /* Flip =+ to += */
 517             *e_token = 0;
 518         }
 519 #else
 520         if (*buf_ptr == '=') {/* == */
 521             *e_token++ = '=';   /* Flip =+ to += */
 522             buf_ptr++;
 523             *e_token = 0;
 524         }
 525 #endif
 526         code = binary_op;
 527         unary_delim = true;
 528         break;
 529         /* can drop thru!!! */
 530
 531     case '>':
 532     case '<':
 533     case '!':                   /* ops like <, <<, <=, !=, etc */
 534         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 535             *e_token++ = *buf_ptr;
 536             if (++buf_ptr >= buf_end)
 537                 fill_buffer();
 538         }
 539         if (*buf_ptr == '=')
 540             *e_token++ = *buf_ptr++;
 541         code = (ps.last_u_d ? unary_op : binary_op);
 542         unary_delim = true;
 543         break;
 544
 545     default:
 546         if (token[0] == '/' && *buf_ptr == '*') {
 547             /* it is start of comment */
 548             *e_token++ = '*';
 549
 550             if (++buf_ptr >= buf_end)
 551                 fill_buffer();
 552
 553             code = comment;
 554             unary_delim = ps.last_u_d;
 555             break;
 556         }
 557         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 558             /*
 559              * handle ||, &&, etc, and also things as in int *****i
 560              */
 561             *e_token++ = *buf_ptr;
 562             if (++buf_ptr >= buf_end)
 563                 fill_buffer();
 564         }
 565         code = (ps.last_u_d ? unary_op : binary_op);
 566         unary_delim = true;
 567
 568
 569     }                           /* end of switch */
 570     if (code != newline) {
 571         l_struct = false;
 572         last_code = code;
 573     }
 574     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 575         fill_buffer();
 576     ps.last_u_d = unary_delim;
 577     *e_token = '\0';            /* null terminate the token */
 578     return (code);
 579 }
 580
 581 /*
 582  * Add the given keyword to the keyword table, using val as the keyword type
 583  */
 584 void
 585 addkey(char *key, int val)
 586 {
 587     struct templ *p = specials;
 588     while (p->rwd)
 589         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 590             return;
 591         else
 592             p++;
 593     if (p >= specials + sizeof specials / sizeof specials[0])
 594         return;                 /* For now, table overflows are silently
 595                                  * ignored */
 596     p->rwd = key;
 597     p->rwcode = val;
 598     p[1].rwd = NULL;
 599     p[1].rwcode = 0;
 600 }