usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  *
  35  * @(#)lexi.c   8.1 (Berkeley) 6/6/93
  36  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.19 2005/11/20 13:48:15 dds Exp $
  37  * $DragonFly: src/usr.bin/indent/lexi.c,v 1.3 2005/04/10 20:55:38 drhodus Exp $
  38  */
  39
  40 /*
  41  * Here we have the token scanner for indent.  It scans off one token and puts
  42  * it in the global variable "token".  It returns a code, indicating the type
  43  * of token scanned.
  44  */
  45
  46 #include <err.h>
  47 #include <stdio.h>
  48 #include <ctype.h>
  49 #include <stdlib.h>
  50 #include <string.h>
  51 #include "indent_globs.h"
  52 #include "indent_codes.h"
  53 #include "indent.h"
  54
/*
 * Character class codes stored in chartype[]; characters that belong to
 * neither class carry 0 in that table.
 */
#define alphanum 1              /* may appear in an identifier or number */
#define opchar 3                /* operator character */
  57
  58 struct templ {
  59     const char *rwd;
  60     int         rwcode;
  61 };
  62
  63 struct templ specials[1000] =
  64 {
  65     {"switch", 1},
  66     {"case", 2},
  67     {"break", 0},
  68     {"struct", 3},
  69     {"union", 3},
  70     {"enum", 3},
  71     {"default", 2},
  72     {"int", 4},
  73     {"char", 4},
  74     {"float", 4},
  75     {"double", 4},
  76     {"long", 4},
  77     {"short", 4},
  78     {"typdef", 4},
  79     {"unsigned", 4},
  80     {"register", 4},
  81     {"static", 4},
  82     {"global", 4},
  83     {"extern", 4},
  84     {"void", 4},
  85     {"const", 4},
  86     {"volatile", 4},
  87     {"goto", 0},
  88     {"return", 0},
  89     {"if", 5},
  90     {"while", 5},
  91     {"for", 5},
  92     {"else", 6},
  93     {"do", 6},
  94     {"sizeof", 7},
  95     {0, 0}
  96 };
  97
  98 char        chartype[128] =
  99 {                               /* this is used to facilitate the decision of
 100                                  * what type (alphanumeric, operator) each
 101                                  * character is */
 102     0, 0, 0, 0, 0, 0, 0, 0,
 103     0, 0, 0, 0, 0, 0, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0,
 105     0, 0, 0, 0, 0, 0, 0, 0,
 106     0, 3, 0, 0, 1, 3, 3, 0,
 107     0, 0, 3, 3, 0, 3, 0, 3,
 108     1, 1, 1, 1, 1, 1, 1, 1,
 109     1, 1, 0, 0, 3, 3, 3, 3,
 110     0, 1, 1, 1, 1, 1, 1, 1,
 111     1, 1, 1, 1, 1, 1, 1, 1,
 112     1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 1, 0, 0, 0, 3, 1,
 114     0, 1, 1, 1, 1, 1, 1, 1,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 1, 1, 1, 1, 1,
 117     1, 1, 1, 0, 3, 0, 3, 0
 118 };
 119
 120 int
 121 lexi(void)
 122 {
 123     int         unary_delim;    /* this is set to 1 if the current token
 124                                  * forces a following operator to be unary */
 125     static int  last_code;      /* the last token type returned */
 126     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 127     int         code;           /* internal code to be returned */
 128     char        qchar;          /* the delimiter character for a string */
 129
 130     e_token = s_token;          /* point to start of place to save token */
 131     unary_delim = false;
 132     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 133                                  * column 1 iff the last thing scanned was nl */
 134     ps.last_nl = false;
 135
 136     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 137         ps.col_1 = false;       /* leading blanks imply token is not in column
 138                                  * 1 */
 139         if (++buf_ptr >= buf_end)
 140             fill_buffer();
 141     }
 142
 143     /* Scan an alphanumeric token */
 144     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 145         /*
 146          * we have a character or number
 147          */
 148         const char *j;          /* used for searching thru list of
 149                                  *
 150                                  * reserved words */
 151         struct templ *p;
 152
 153         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 154             int         seendot = 0,
 155                         seenexp = 0,
 156                         seensfx = 0;
 157             if (*buf_ptr == '0' &&
 158                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 159                 *e_token++ = *buf_ptr++;
 160                 *e_token++ = *buf_ptr++;
 161                 while (isxdigit(*buf_ptr)) {
 162                     CHECK_SIZE_TOKEN;
 163                     *e_token++ = *buf_ptr++;
 164                 }
 165             }
 166             else
 167                 while (1) {
 168                     if (*buf_ptr == '.') {
 169                         if (seendot)
 170                             break;
 171                         else
 172                             seendot++;
 173                     }
 174                     CHECK_SIZE_TOKEN;
 175                     *e_token++ = *buf_ptr++;
 176                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 177                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 178                             break;
 179                         else {
 180                             seenexp++;
 181                             seendot++;
 182                             CHECK_SIZE_TOKEN;
 183                             *e_token++ = *buf_ptr++;
 184                             if (*buf_ptr == '+' || *buf_ptr == '-')
 185                                 *e_token++ = *buf_ptr++;
 186                         }
 187                     }
 188                 }
 189             while (1) {
 190                 if (!(seensfx & 1) &&
 191                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 192                     CHECK_SIZE_TOKEN;
 193                     *e_token++ = *buf_ptr++;
 194                     seensfx |= 1;
 195                     continue;
 196                 }
 197                 if (!(seensfx & 2) &&
 198                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 199                     CHECK_SIZE_TOKEN;
 200                     if (buf_ptr[1] == buf_ptr[0])
 201                         *e_token++ = *buf_ptr++;
 202                     *e_token++ = *buf_ptr++;
 203                     seensfx |= 2;
 204                     continue;
 205                 }
 206                 break;
 207             }
 208         }
 209         else
 210             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 211                 /* fill_buffer() terminates buffer with newline */
 212                 if (*buf_ptr == BACKSLASH) {
 213                     if (*(buf_ptr + 1) == '\n') {
 214                         buf_ptr += 2;
 215                         if (buf_ptr >= buf_end)
 216                             fill_buffer();
 217                         } else
 218                             break;
 219                 }
 220                 CHECK_SIZE_TOKEN;
 221                 /* copy it over */
 222                 *e_token++ = *buf_ptr++;
 223                 if (buf_ptr >= buf_end)
 224                     fill_buffer();
 225             }
 226         *e_token++ = '\0';
 227         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 228             if (++buf_ptr >= buf_end)
 229                 fill_buffer();
 230         }
 231         ps.its_a_keyword = false;
 232         ps.sizeof_keyword = false;
 233         if (l_struct && !ps.p_l_follow) {
 234                                 /* if last token was 'struct' and we're not
 235                                  * in parentheses, then this token
 236                                  * should be treated as a declaration */
 237             l_struct = false;
 238             last_code = ident;
 239             ps.last_u_d = true;
 240             return (decl);
 241         }
 242         ps.last_u_d = l_struct; /* Operator after identifier is binary
 243                                  * unless last token was 'struct' */
 244         l_struct = false;
 245         last_code = ident;      /* Remember that this is the code we will
 246                                  * return */
 247
 248         /*
 249          * This loop will check if the token is a keyword.
 250          */
 251         for (p = specials; (j = p->rwd) != 0; p++) {
 252             const char *q = s_token;    /* point at scanned token */
 253             if (*j++ != *q++ || *j++ != *q++)
 254                 continue;       /* This test depends on the fact that
 255                                  * identifiers are always at least 1 character
 256                                  * long (ie. the first two bytes of the
 257                                  * identifier are always meaningful) */
 258             if (q[-1] == 0)
 259                 break;          /* If its a one-character identifier */
 260             while (*q++ == *j)
 261                 if (*j++ == 0)
 262                     goto found_keyword; /* I wish that C had a multi-level
 263                                          * break... */
 264         }
 265         if (p->rwd) {           /* we have a keyword */
 266     found_keyword:
 267             ps.its_a_keyword = true;
 268             ps.last_u_d = true;
 269             switch (p->rwcode) {
 270             case 1:             /* it is a switch */
 271                 return (swstmt);
 272             case 2:             /* a case or default */
 273                 return (casestmt);
 274
 275             case 3:             /* a "struct" */
 276                 /*
 277                  * Next time around, we will want to know that we have had a
 278                  * 'struct'
 279                  */
 280                 l_struct = true;
 281                 /* FALLTHROUGH */
 282
 283             case 4:             /* one of the declaration keywords */
 284                 if (ps.p_l_follow) {
 285                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 286                     break;      /* inside parens: cast, param list or sizeof */
 287                 }
 288                 last_code = decl;
 289                 return (decl);
 290
 291             case 5:             /* if, while, for */
 292                 return (sp_paren);
 293
 294             case 6:             /* do, else */
 295                 return (sp_nparen);
 296
 297             case 7:
 298                 ps.sizeof_keyword = true;
 299             default:            /* all others are treated like any other
 300                                  * identifier */
 301                 return (ident);
 302             }                   /* end of switch */
 303         }                       /* end of if (found_it) */
 304         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 305             char *tp = buf_ptr;
 306             while (tp < buf_end)
 307                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 308                     goto not_proc;
 309             strncpy(ps.procname, token, sizeof ps.procname - 1);
 310             ps.in_parameter_declaration = 1;
 311             rparen_count = 1;
 312     not_proc:;
 313         }
 314         /*
 315          * The following hack attempts to guess whether or not the current
 316          * token is in fact a declaration keyword -- one that has been
 317          * typedefd
 318          */
 319         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 320                 && !ps.p_l_follow
 321                 && !ps.block_init
 322                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 323                     ps.last_token == decl ||
 324                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 325             ps.its_a_keyword = true;
 326             ps.last_u_d = true;
 327             last_code = decl;
 328             return decl;
 329         }
 330         if (last_code == decl)  /* if this is a declared variable, then
 331                                  * following sign is unary */
 332             ps.last_u_d = true; /* will make "int a -1" work */
 333         last_code = ident;
 334         return (ident);         /* the ident is not in the list */
 335     }                           /* end of procesing for alpanum character */
 336
 337     /* Scan a non-alphanumeric token */
 338
 339     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 340                                  * moved here */
 341     *e_token = '\0';
 342     if (++buf_ptr >= buf_end)
 343         fill_buffer();
 344
 345     switch (*token) {
 346     case '\n':
 347         unary_delim = ps.last_u_d;
 348         ps.last_nl = true;      /* remember that we just had a newline */
 349         code = (had_eof ? 0 : newline);
 350
 351         /*
 352          * if data has been exhausted, the newline is a dummy, and we should
 353          * return code to stop
 354          */
 355         break;
 356
 357     case '\'':                  /* start of quoted character */
 358     case '"':                   /* start of string */
 359         qchar = *token;
 360         if (troff) {
 361             e_token[-1] = '`';
 362             if (qchar == '"')
 363                 *e_token++ = '`';
 364             e_token = chfont(&bodyf, &stringf, e_token);
 365         }
 366         do {                    /* copy the string */
 367             while (1) {         /* move one character or [/<char>]<char> */
 368                 if (*buf_ptr == '\n') {
 369                     diag2(1, "Unterminated literal");
 370                     goto stop_lit;
 371                 }
 372                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 373                                          * since CHECK_SIZE guarantees that there
 374                                          * are at least 5 entries left */
 375                 *e_token = *buf_ptr++;
 376                 if (buf_ptr >= buf_end)
 377                     fill_buffer();
 378                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 379                     if (*buf_ptr == '\n')       /* check for escaped newline */
 380                         ++line_no;
 381                     if (troff) {
 382                         *++e_token = BACKSLASH;
 383                         if (*buf_ptr == BACKSLASH)
 384                             *++e_token = BACKSLASH;
 385                     }
 386                     *++e_token = *buf_ptr++;
 387                     ++e_token;  /* we must increment this again because we
 388                                  * copied two chars */
 389                     if (buf_ptr >= buf_end)
 390                         fill_buffer();
 391                 }
 392                 else
 393                     break;      /* we copied one character */
 394             }                   /* end of while (1) */
 395         } while (*e_token++ != qchar);
 396         if (troff) {
 397             e_token = chfont(&stringf, &bodyf, e_token - 1);
 398             if (qchar == '"')
 399                 *e_token++ = '\'';
 400         }
 401 stop_lit:
 402         code = ident;
 403         break;
 404
 405     case ('('):
 406     case ('['):
 407         unary_delim = true;
 408         code = lparen;
 409         break;
 410
 411     case (')'):
 412     case (']'):
 413         code = rparen;
 414         break;
 415
 416     case '#':
 417         unary_delim = ps.last_u_d;
 418         code = preesc;
 419         break;
 420
 421     case '?':
 422         unary_delim = true;
 423         code = question;
 424         break;
 425
 426     case (':'):
 427         code = colon;
 428         unary_delim = true;
 429         break;
 430
 431     case (';'):
 432         unary_delim = true;
 433         code = semicolon;
 434         break;
 435
 436     case ('{'):
 437         unary_delim = true;
 438
 439         /*
 440          * if (ps.in_or_st) ps.block_init = 1;
 441          */
 442         /* ?    code = ps.block_init ? lparen : lbrace; */
 443         code = lbrace;
 444         break;
 445
 446     case ('}'):
 447         unary_delim = true;
 448         /* ?    code = ps.block_init ? rparen : rbrace; */
 449         code = rbrace;
 450         break;
 451
 452     case 014:                   /* a form feed */
 453         unary_delim = ps.last_u_d;
 454         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 455                                  * right */
 456         code = form_feed;
 457         break;
 458
 459     case (','):
 460         unary_delim = true;
 461         code = comma;
 462         break;
 463
 464     case '.':
 465         unary_delim = false;
 466         code = period;
 467         break;
 468
 469     case '-':
 470     case '+':                   /* check for -, +, --, ++ */
 471         code = (ps.last_u_d ? unary_op : binary_op);
 472         unary_delim = true;
 473
 474         if (*buf_ptr == token[0]) {
 475             /* check for doubled character */
 476             *e_token++ = *buf_ptr++;
 477             /* buffer overflow will be checked at end of loop */
 478             if (last_code == ident || last_code == rparen) {
 479                 code = (ps.last_u_d ? unary_op : postop);
 480                 /* check for following ++ or -- */
 481                 unary_delim = false;
 482             }
 483         }
 484         else if (*buf_ptr == '=')
 485             /* check for operator += */
 486             *e_token++ = *buf_ptr++;
 487         else if (*buf_ptr == '>') {
 488             /* check for operator -> */
 489             *e_token++ = *buf_ptr++;
 490             if (!pointer_as_binop) {
 491                 unary_delim = false;
 492                 code = unary_op;
 493                 ps.want_blank = false;
 494             }
 495         }
 496         break;                  /* buffer overflow will be checked at end of
 497                                  * switch */
 498
 499     case '=':
 500         if (ps.in_or_st)
 501             ps.block_init = 1;
 502 #ifdef undef
 503         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 504             e_token[-1] = *buf_ptr++;
 505             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 506                 *e_token++ = *buf_ptr++;
 507             *e_token++ = '=';   /* Flip =+ to += */
 508             *e_token = 0;
 509         }
 510 #else
 511         if (*buf_ptr == '=') {/* == */
 512             *e_token++ = '=';   /* Flip =+ to += */
 513             buf_ptr++;
 514             *e_token = 0;
 515         }
 516 #endif
 517         code = binary_op;
 518         unary_delim = true;
 519         break;
 520         /* can drop thru!!! */
 521
 522     case '>':
 523     case '<':
 524     case '!':                   /* ops like <, <<, <=, !=, etc */
 525         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 526             *e_token++ = *buf_ptr;
 527             if (++buf_ptr >= buf_end)
 528                 fill_buffer();
 529         }
 530         if (*buf_ptr == '=')
 531             *e_token++ = *buf_ptr++;
 532         code = (ps.last_u_d ? unary_op : binary_op);
 533         unary_delim = true;
 534         break;
 535
 536     default:
 537         if (token[0] == '/' && *buf_ptr == '*') {
 538             /* it is start of comment */
 539             *e_token++ = '*';
 540
 541             if (++buf_ptr >= buf_end)
 542                 fill_buffer();
 543
 544             code = comment;
 545             unary_delim = ps.last_u_d;
 546             break;
 547         }
 548         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 549             /*
 550              * handle ||, &&, etc, and also things as in int *****i
 551              */
 552             *e_token++ = *buf_ptr;
 553             if (++buf_ptr >= buf_end)
 554                 fill_buffer();
 555         }
 556         code = (ps.last_u_d ? unary_op : binary_op);
 557         unary_delim = true;
 558
 559
 560     }                           /* end of switch */
 561     if (code != newline) {
 562         l_struct = false;
 563         last_code = code;
 564     }
 565     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 566         fill_buffer();
 567     ps.last_u_d = unary_delim;
 568     *e_token = '\0';            /* null terminate the token */
 569     return (code);
 570 }
 571
 572 /*
 573  * Add the given keyword to the keyword table, using val as the keyword type
 574  */
 575 void
 576 addkey(char *key, int val)
 577 {
 578     struct templ *p = specials;
 579     while (p->rwd)
 580         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 581             return;
 582         else
 583             p++;
 584     if (p >= specials + sizeof specials / sizeof specials[0])
 585         return;                 /* For now, table overflows are silently
 586                                  * ignored */
 587     p->rwd = key;
 588     p->rwcode = val;
 589     p[1].rwd = 0;
 590     p[1].rwcode = 0;
 591 }