usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  * @(#)lexi.c   8.1 (Berkeley) 6/6/93
  32  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
  33  */
  34
  35 /*
  36  * Here we have the token scanner for indent.  It scans off one token and puts
  37  * it in the global variable "token".  It returns a code, indicating the type
  38  * of token scanned.
  39  */
  40
  41 #include <err.h>
  42 #include <stdio.h>
  43 #include <ctype.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46 #include "indent_globs.h"
  47 #include "indent_codes.h"
  48 #include "indent.h"
  49
  50 #define alphanum 1
  51 #define opchar 3
  52
  53 struct templ {
  54     const char *rwd;
  55     int         rwcode;
  56 };
  57
  58 struct templ specials[1000] =
  59 {
  60     {"switch", 1},
  61     {"case", 2},
  62     {"break", 0},
  63     {"struct", 3},
  64     {"union", 3},
  65     {"enum", 3},
  66     {"default", 2},
  67     {"int", 4},
  68     {"char", 4},
  69     {"float", 4},
  70     {"double", 4},
  71     {"long", 4},
  72     {"short", 4},
  73     {"typdef", 4},
  74     {"unsigned", 4},
  75     {"register", 4},
  76     {"static", 4},
  77     {"global", 4},
  78     {"extern", 4},
  79     {"void", 4},
  80     {"const", 4},
  81     {"volatile", 4},
  82     {"goto", 0},
  83     {"return", 0},
  84     {"if", 5},
  85     {"while", 5},
  86     {"for", 5},
  87     {"else", 6},
  88     {"do", 6},
  89     {"sizeof", 7},
  90     {0, 0}
  91 };
  92
  93 char        chartype[128] =
  94 {                               /* this is used to facilitate the decision of
  95                                  * what type (alphanumeric, operator) each
  96                                  * character is */
  97     0, 0, 0, 0, 0, 0, 0, 0,
  98     0, 0, 0, 0, 0, 0, 0, 0,
  99     0, 0, 0, 0, 0, 0, 0, 0,
 100     0, 0, 0, 0, 0, 0, 0, 0,
 101     0, 3, 0, 0, 1, 3, 3, 0,
 102     0, 0, 3, 3, 0, 3, 0, 3,
 103     1, 1, 1, 1, 1, 1, 1, 1,
 104     1, 1, 0, 0, 3, 3, 3, 3,
 105     0, 1, 1, 1, 1, 1, 1, 1,
 106     1, 1, 1, 1, 1, 1, 1, 1,
 107     1, 1, 1, 1, 1, 1, 1, 1,
 108     1, 1, 1, 0, 0, 0, 3, 1,
 109     0, 1, 1, 1, 1, 1, 1, 1,
 110     1, 1, 1, 1, 1, 1, 1, 1,
 111     1, 1, 1, 1, 1, 1, 1, 1,
 112     1, 1, 1, 0, 3, 0, 3, 0
 113 };
 114
 115 int
 116 lexi(void)
 117 {
 118     int         unary_delim;    /* this is set to 1 if the current token
 119                                  * forces a following operator to be unary */
 120     static int  last_code;      /* the last token type returned */
 121     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 122     int         code;           /* internal code to be returned */
 123     char        qchar;          /* the delimiter character for a string */
 124
 125     e_token = s_token;          /* point to start of place to save token */
 126     unary_delim = false;
 127     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 128                                  * column 1 iff the last thing scanned was nl */
 129     ps.last_nl = false;
 130
 131     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 132         ps.col_1 = false;       /* leading blanks imply token is not in column
 133                                  * 1 */
 134         if (++buf_ptr >= buf_end)
 135             fill_buffer();
 136     }
 137
 138     /* Scan an alphanumeric token */
 139     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 140         /*
 141          * we have a character or number
 142          */
 143         const char *j;          /* used for searching thru list of
 144                                  *
 145                                  * reserved words */
 146         struct templ *p;
 147
 148         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149             int         seendot = 0,
 150                         seenexp = 0,
 151                         seensfx = 0;
 152             if (*buf_ptr == '0' &&
 153                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 154                 *e_token++ = *buf_ptr++;
 155                 *e_token++ = *buf_ptr++;
 156                 while (isxdigit(*buf_ptr)) {
 157                     CHECK_SIZE_TOKEN;
 158                     *e_token++ = *buf_ptr++;
 159                 }
 160             }
 161             else
 162                 while (1) {
 163                     if (*buf_ptr == '.') {
 164                         if (seendot)
 165                             break;
 166                         else
 167                             seendot++;
 168                     }
 169                     CHECK_SIZE_TOKEN;
 170                     *e_token++ = *buf_ptr++;
 171                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 172                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 173                             break;
 174                         else {
 175                             seenexp++;
 176                             seendot++;
 177                             CHECK_SIZE_TOKEN;
 178                             *e_token++ = *buf_ptr++;
 179                             if (*buf_ptr == '+' || *buf_ptr == '-')
 180                                 *e_token++ = *buf_ptr++;
 181                         }
 182                     }
 183                 }
 184             while (1) {
 185                 if (!(seensfx & 1) &&
 186                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 187                     CHECK_SIZE_TOKEN;
 188                     *e_token++ = *buf_ptr++;
 189                     seensfx |= 1;
 190                     continue;
 191                 }
 192                 if (!(seensfx & 2) &&
 193                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 194                     CHECK_SIZE_TOKEN;
 195                     if (buf_ptr[1] == buf_ptr[0])
 196                         *e_token++ = *buf_ptr++;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 2;
 199                     continue;
 200                 }
 201                 break;
 202             }
 203         }
 204         else
 205             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 206                 /* fill_buffer() terminates buffer with newline */
 207                 if (*buf_ptr == BACKSLASH) {
 208                     if (*(buf_ptr + 1) == '\n') {
 209                         buf_ptr += 2;
 210                         if (buf_ptr >= buf_end)
 211                             fill_buffer();
 212                         } else
 213                             break;
 214                 }
 215                 CHECK_SIZE_TOKEN;
 216                 /* copy it over */
 217                 *e_token++ = *buf_ptr++;
 218                 if (buf_ptr >= buf_end)
 219                     fill_buffer();
 220             }
 221         *e_token++ = '\0';
 222         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 223             if (++buf_ptr >= buf_end)
 224                 fill_buffer();
 225         }
 226         ps.its_a_keyword = false;
 227         ps.sizeof_keyword = false;
 228         if (l_struct && !ps.p_l_follow) {
 229                                 /* if last token was 'struct' and we're not
 230                                  * in parentheses, then this token
 231                                  * should be treated as a declaration */
 232             l_struct = false;
 233             last_code = ident;
 234             ps.last_u_d = true;
 235             return (decl);
 236         }
 237         ps.last_u_d = l_struct; /* Operator after identifier is binary
 238                                  * unless last token was 'struct' */
 239         l_struct = false;
 240         last_code = ident;      /* Remember that this is the code we will
 241                                  * return */
 242
 243         if (auto_typedefs) {
 244             const char *q = s_token;
 245             size_t q_len = strlen(q);
 246             /* Check if we have an "_t" in the end */
 247             if (q_len > 2 &&
 248                 (strcmp(q + q_len - 2, "_t") == 0)) {
 249                 ps.its_a_keyword = true;
 250                 ps.last_u_d = true;
 251                 goto found_auto_typedef;
 252             }
 253         }
 254
 255         /*
 256          * This loop will check if the token is a keyword.
 257          */
 258         for (p = specials; (j = p->rwd) != NULL; p++) {
 259             const char *q = s_token;    /* point at scanned token */
 260             if (*j++ != *q++ || *j++ != *q++)
 261                 continue;       /* This test depends on the fact that
 262                                  * identifiers are always at least 1 character
 263                                  * long (ie. the first two bytes of the
 264                                  * identifier are always meaningful) */
 265             if (q[-1] == 0)
 266                 break;          /* If its a one-character identifier */
 267             while (*q++ == *j)
 268                 if (*j++ == 0)
 269                     goto found_keyword; /* I wish that C had a multi-level
 270                                          * break... */
 271         }
 272         if (p->rwd) {           /* we have a keyword */
 273     found_keyword:
 274             ps.its_a_keyword = true;
 275             ps.last_u_d = true;
 276             switch (p->rwcode) {
 277             case 1:             /* it is a switch */
 278                 return (swstmt);
 279             case 2:             /* a case or default */
 280                 return (casestmt);
 281
 282             case 3:             /* a "struct" */
 283                 /*
 284                  * Next time around, we will want to know that we have had a
 285                  * 'struct'
 286                  */
 287                 l_struct = true;
 288                 /* FALLTHROUGH */
 289
 290             case 4:             /* one of the declaration keywords */
 291             found_auto_typedef:
 292                 if (ps.p_l_follow) {
 293                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 294                     break;      /* inside parens: cast, param list or sizeof */
 295                 }
 296                 last_code = decl;
 297                 return (decl);
 298
 299             case 5:             /* if, while, for */
 300                 return (sp_paren);
 301
 302             case 6:             /* do, else */
 303                 return (sp_nparen);
 304
 305             case 7:
 306                 ps.sizeof_keyword = true;
 307             default:            /* all others are treated like any other
 308                                  * identifier */
 309                 return (ident);
 310             }                   /* end of switch */
 311         }                       /* end of if (found_it) */
 312         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 313             char *tp = buf_ptr;
 314             while (tp < buf_end)
 315                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 316                     goto not_proc;
 317             strncpy(ps.procname, token, sizeof ps.procname - 1);
 318             ps.in_parameter_declaration = 1;
 319             rparen_count = 1;
 320     not_proc:;
 321         }
 322         /*
 323          * The following hack attempts to guess whether or not the current
 324          * token is in fact a declaration keyword -- one that has been
 325          * typedefd
 326          */
 327         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 328                 && !ps.p_l_follow
 329                 && !ps.block_init
 330                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 331                     ps.last_token == decl ||
 332                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 333             ps.its_a_keyword = true;
 334             ps.last_u_d = true;
 335             last_code = decl;
 336             return decl;
 337         }
 338         if (last_code == decl)  /* if this is a declared variable, then
 339                                  * following sign is unary */
 340             ps.last_u_d = true; /* will make "int a -1" work */
 341         last_code = ident;
 342         return (ident);         /* the ident is not in the list */
 343     }                           /* end of procesing for alpanum character */
 344
 345     /* Scan a non-alphanumeric token */
 346
 347     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 348                                  * moved here */
 349     *e_token = '\0';
 350     if (++buf_ptr >= buf_end)
 351         fill_buffer();
 352
 353     switch (*token) {
 354     case '\n':
 355         unary_delim = ps.last_u_d;
 356         ps.last_nl = true;      /* remember that we just had a newline */
 357         code = (had_eof ? 0 : newline);
 358
 359         /*
 360          * if data has been exhausted, the newline is a dummy, and we should
 361          * return code to stop
 362          */
 363         break;
 364
 365     case '\'':                  /* start of quoted character */
 366     case '"':                   /* start of string */
 367         qchar = *token;
 368         if (troff) {
 369             e_token[-1] = '`';
 370             if (qchar == '"')
 371                 *e_token++ = '`';
 372             e_token = chfont(&bodyf, &stringf, e_token);
 373         }
 374         do {                    /* copy the string */
 375             while (1) {         /* move one character or [/<char>]<char> */
 376                 if (*buf_ptr == '\n') {
 377                     diag2(1, "Unterminated literal");
 378                     goto stop_lit;
 379                 }
 380                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 381                                          * since CHECK_SIZE guarantees that there
 382                                          * are at least 5 entries left */
 383                 *e_token = *buf_ptr++;
 384                 if (buf_ptr >= buf_end)
 385                     fill_buffer();
 386                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 387                     if (*buf_ptr == '\n')       /* check for escaped newline */
 388                         ++line_no;
 389                     if (troff) {
 390                         *++e_token = BACKSLASH;
 391                         if (*buf_ptr == BACKSLASH)
 392                             *++e_token = BACKSLASH;
 393                     }
 394                     *++e_token = *buf_ptr++;
 395                     ++e_token;  /* we must increment this again because we
 396                                  * copied two chars */
 397                     if (buf_ptr >= buf_end)
 398                         fill_buffer();
 399                 }
 400                 else
 401                     break;      /* we copied one character */
 402             }                   /* end of while (1) */
 403         } while (*e_token++ != qchar);
 404         if (troff) {
 405             e_token = chfont(&stringf, &bodyf, e_token - 1);
 406             if (qchar == '"')
 407                 *e_token++ = '\'';
 408         }
 409 stop_lit:
 410         code = ident;
 411         break;
 412
 413     case ('('):
 414     case ('['):
 415         unary_delim = true;
 416         code = lparen;
 417         break;
 418
 419     case (')'):
 420     case (']'):
 421         code = rparen;
 422         break;
 423
 424     case '#':
 425         unary_delim = ps.last_u_d;
 426         code = preesc;
 427         break;
 428
 429     case '?':
 430         unary_delim = true;
 431         code = question;
 432         break;
 433
 434     case (':'):
 435         code = colon;
 436         unary_delim = true;
 437         break;
 438
 439     case (';'):
 440         unary_delim = true;
 441         code = semicolon;
 442         break;
 443
 444     case ('{'):
 445         unary_delim = true;
 446
 447         /*
 448          * if (ps.in_or_st) ps.block_init = 1;
 449          */
 450         /* ?    code = ps.block_init ? lparen : lbrace; */
 451         code = lbrace;
 452         break;
 453
 454     case ('}'):
 455         unary_delim = true;
 456         /* ?    code = ps.block_init ? rparen : rbrace; */
 457         code = rbrace;
 458         break;
 459
 460     case 014:                   /* a form feed */
 461         unary_delim = ps.last_u_d;
 462         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 463                                  * right */
 464         code = form_feed;
 465         break;
 466
 467     case (','):
 468         unary_delim = true;
 469         code = comma;
 470         break;
 471
 472     case '.':
 473         unary_delim = false;
 474         code = period;
 475         break;
 476
 477     case '-':
 478     case '+':                   /* check for -, +, --, ++ */
 479         code = (ps.last_u_d ? unary_op : binary_op);
 480         unary_delim = true;
 481
 482         if (*buf_ptr == token[0]) {
 483             /* check for doubled character */
 484             *e_token++ = *buf_ptr++;
 485             /* buffer overflow will be checked at end of loop */
 486             if (last_code == ident || last_code == rparen) {
 487                 code = (ps.last_u_d ? unary_op : postop);
 488                 /* check for following ++ or -- */
 489                 unary_delim = false;
 490             }
 491         }
 492         else if (*buf_ptr == '=')
 493             /* check for operator += */
 494             *e_token++ = *buf_ptr++;
 495         else if (*buf_ptr == '>') {
 496             /* check for operator -> */
 497             *e_token++ = *buf_ptr++;
 498             if (!pointer_as_binop) {
 499                 unary_delim = false;
 500                 code = unary_op;
 501                 ps.want_blank = false;
 502             }
 503         }
 504         break;                  /* buffer overflow will be checked at end of
 505                                  * switch */
 506
 507     case '=':
 508         if (ps.in_or_st)
 509             ps.block_init = 1;
 510 #ifdef undef
 511         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 512             e_token[-1] = *buf_ptr++;
 513             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 514                 *e_token++ = *buf_ptr++;
 515             *e_token++ = '=';   /* Flip =+ to += */
 516             *e_token = 0;
 517         }
 518 #else
 519         if (*buf_ptr == '=') {/* == */
 520             *e_token++ = '=';   /* Flip =+ to += */
 521             buf_ptr++;
 522             *e_token = 0;
 523         }
 524 #endif
 525         code = binary_op;
 526         unary_delim = true;
 527         break;
 528         /* can drop thru!!! */
 529
 530     case '>':
 531     case '<':
 532     case '!':                   /* ops like <, <<, <=, !=, etc */
 533         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 534             *e_token++ = *buf_ptr;
 535             if (++buf_ptr >= buf_end)
 536                 fill_buffer();
 537         }
 538         if (*buf_ptr == '=')
 539             *e_token++ = *buf_ptr++;
 540         code = (ps.last_u_d ? unary_op : binary_op);
 541         unary_delim = true;
 542         break;
 543
 544     default:
 545         if (token[0] == '/' && *buf_ptr == '*') {
 546             /* it is start of comment */
 547             *e_token++ = '*';
 548
 549             if (++buf_ptr >= buf_end)
 550                 fill_buffer();
 551
 552             code = comment;
 553             unary_delim = ps.last_u_d;
 554             break;
 555         }
 556         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 557             /*
 558              * handle ||, &&, etc, and also things as in int *****i
 559              */
 560             *e_token++ = *buf_ptr;
 561             if (++buf_ptr >= buf_end)
 562                 fill_buffer();
 563         }
 564         code = (ps.last_u_d ? unary_op : binary_op);
 565         unary_delim = true;
 566
 567
 568     }                           /* end of switch */
 569     if (code != newline) {
 570         l_struct = false;
 571         last_code = code;
 572     }
 573     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 574         fill_buffer();
 575     ps.last_u_d = unary_delim;
 576     *e_token = '\0';            /* null terminate the token */
 577     return (code);
 578 }
 579
 580 /*
 581  * Add the given keyword to the keyword table, using val as the keyword type
 582  */
 583 void
 584 addkey(char *key, int val)
 585 {
 586     struct templ *p = specials;
 587     while (p->rwd)
 588         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 589             return;
 590         else
 591             p++;
 592     if (p >= specials + sizeof specials / sizeof specials[0])
 593         return;                 /* For now, table overflows are silently
 594                                  * ignored */
 595     p->rwd = key;
 596     p->rwcode = val;
 597     p[1].rwd = NULL;
 598     p[1].rwcode = 0;
 599 }