tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <sys/stat.h>
  18
  19 #include "lib.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28
  29 #define BUFSIZE (8192)
  30
  31 typedef struct {
  32         int fd, offset, size;
  33         struct position pos;
  34         struct token **tokenlist;
  35         struct token *token;
  36         unsigned char *buffer;
  37 } stream_t;
  38
  39
  40 const char *show_special(int val)
  41 {
  42         static const char *combinations[] = COMBINATION_STRINGS;
  43         static char buffer[4];
  44
  45         buffer[0] = val;
  46         buffer[1] = 0;
  47         if (val >= SPECIAL_BASE)
  48                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  49         return buffer;
  50 }
  51
  52 const char *show_ident(const struct ident *ident)
  53 {
  54         static char buffer[256];
  55         if (!ident)
  56                 return "<noident>";
  57         sprintf(buffer, "%.*s", ident->len, ident->name);
  58         return buffer;
  59 }
  60
  61 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  62 {
  63         if (isprint(c)) {
  64                 if (c == escape || c == '\\')
  65                         *ptr++ = '\\';
  66                 *ptr++ = c;
  67                 return ptr;
  68         }
  69         *ptr++ = '\\';
  70         switch (c) {
  71         case '\n':
  72                 *ptr++ = 'n';
  73                 return ptr;
  74         case '\t':
  75                 *ptr++ = 't';
  76                 return ptr;
  77         }
  78         if (!isdigit(next))
  79                 return ptr + sprintf(ptr, "%o", c);
  80
  81         return ptr + sprintf(ptr, "%03o", c);
  82 }
  83
  84 const char *show_string(const struct string *string)
  85 {
  86         static char buffer[256];
  87         char *ptr;
  88         int i;
  89
  90         ptr = buffer;
  91         *ptr++ = '"';
  92         for (i = 0; i < string->length-1; i++) {
  93                 const unsigned char *p = string->data + i;
  94                 ptr = charstr(ptr, p[0], '"', p[1]);
  95         }
  96         *ptr++ = '"';
  97         *ptr = '\0';
  98         return buffer;
  99 }
 100
 101 const char *show_token(const struct token *token)
 102 {
 103         static char buffer[256];
 104
 105         if (!token)
 106                 return "<no token>";
 107         switch (token_type(token)) {
 108         case TOKEN_ERROR:
 109                 return "syntax error";
 110
 111         case TOKEN_EOF:
 112                 return "end-of-input";
 113
 114         case TOKEN_IDENT:
 115                 return show_ident(token->ident);
 116
 117         case TOKEN_STRING:
 118                 return show_string(token->string);
 119
 120         case TOKEN_NUMBER:
 121                 return token->number;
 122
 123         case TOKEN_SPECIAL:
 124                 return show_special(token->special);
 125
 126         case TOKEN_CHAR: {
 127                 char *ptr = buffer;
 128                 int c = token->character;
 129                 *ptr++ = '\'';
 130                 ptr = charstr(ptr, c, '\'', 0);
 131                 *ptr++ = '\'';
 132                 *ptr++ = '\0';
 133                 return buffer;
 134         }
 135
 136         case TOKEN_STREAMBEGIN:
 137                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
 138                 return buffer;
 139
 140         case TOKEN_STREAMEND:
 141                 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
 142                 return buffer;
 143
 144         default:
 145                 return "WTF???";
 146         }
 147 }
 148
 149 int init_stream(const char *name, int fd)
 150 {
 151         int stream = input_stream_nr;
 152         struct stream *current;
 153
 154         if (stream >= input_streams_allocated) {
 155                 int newalloc = stream * 4 / 3 + 10;
 156                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 157                 if (!input_streams)
 158                         die("Unable to allocate more streams space");
 159                 input_streams_allocated = newalloc;
 160         }
 161         current = input_streams + stream;
 162         memset(current, 0, sizeof(*current));
 163         current->name = name;
 164         current->fd = fd;
 165         current->constant = -1; // "unknown"
 166         if (fd > 0) {
 167                 int i;
 168                 struct stat st;
 169
 170                 fstat(fd, &st);
 171                 current->dev = st.st_dev;
 172                 current->ino = st.st_ino;
 173                 for (i = 0; i < stream; i++) {
 174                         struct stream *s = input_streams + i;
 175                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 176                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 177                                         return -1;
 178                         }
 179                 }
 180         }
 181         input_stream_nr = stream+1;
 182         return stream;
 183 }
 184
 185 static struct token * alloc_token(stream_t *stream)
 186 {
 187         struct token *token = __alloc_token(0);
 188         token->pos = stream->pos;
 189         return token;
 190 }
 191
 192 static int nextchar(stream_t *stream)
 193 {
 194         int offset = stream->offset;
 195         int size = stream->size;
 196         int c;
 197         int complain = -1;
 198
 199 repeat:
 200         complain++;
 201         if (offset >= size) {
 202                 size = read(stream->fd, stream->buffer, BUFSIZE);
 203                 if (size <= 0)
 204                         return EOF;
 205                 stream->size = size;
 206                 stream->offset = 0;
 207                 offset = 0;
 208         }
 209         c = stream->buffer[offset];
 210         stream->offset = ++offset;
 211
 212         stream->pos.pos++;
 213
 214         /* Ignore DOS-stype '\r' characters */
 215         if (c == '\r')
 216                 goto repeat;
 217
 218         if (c == '\n') {
 219                 stream->pos.line++;
 220                 stream->pos.newline = 1;
 221                 stream->pos.pos = 0;
 222                 complain = 0;
 223         }
 224
 225         if (complain)
 226                 warn(stream->pos, "non-ASCII data stream");
 227
 228         return c;
 229 }
 230
 231 struct token eof_token_entry;
 232
 233 static void mark_eof(stream_t *stream, struct token *end_token)
 234 {
 235         struct token *end;
 236
 237         end = alloc_token(stream);
 238         token_type(end) = TOKEN_STREAMEND;
 239         end->pos.newline = 1;
 240
 241         eof_token_entry.next = &eof_token_entry;
 242         eof_token_entry.pos.newline = 1;
 243
 244         if (!end_token)
 245                 end_token =  &eof_token_entry;
 246         end->next = end_token;
 247         *stream->tokenlist = end;
 248         stream->tokenlist = NULL;
 249 }
 250
 251 static void add_token(stream_t *stream)
 252 {
 253         struct token *token = stream->token;
 254
 255         stream->token = NULL;
 256         token->next = NULL;
 257         *stream->tokenlist = token;
 258         stream->tokenlist = &token->next;
 259 }
 260
 261 static void drop_token(stream_t *stream)
 262 {
 263         stream->pos.newline |= stream->token->pos.newline;
 264         stream->pos.whitespace |= stream->token->pos.whitespace;
 265         stream->token = NULL;
 266 }
 267
 268
 269 /*
 270  * pp-number:
 271  *      digit
 272  *      . digit
 273  *      pp-number digit
 274  *      pp-number identifier-nodigit
 275  *      pp-number e sign
 276  *      pp-number E sign
 277  *      pp-number p sign
 278  *      pp-number P sign
 279  *      pp-number .
 280  */
 281 static int get_one_number(int c, int next, stream_t *stream)
 282 {
 283         struct token *token;
 284         static char buffer[256];
 285         char *p = buffer, *buf;
 286         int len;
 287
 288         *p++ = c;
 289         for (;;) {
 290                 switch (next) {
 291                 case 'e': case 'E':
 292                 case 'p': case 'P':
 293                         *p++ = next;
 294                         next = nextchar(stream);
 295                         if (next != '-' && next != '+')
 296                                 continue;
 297                 /* Fallthrough for sign of 'e'/'p' */
 298                 case '0'...'9':
 299                 case '.': case '_':
 300                 case 'a'...'d': case 'A'...'D':
 301                 case 'f'...'o': case 'F'...'O':
 302                 case 'q'...'z': case 'Q'...'Z':
 303                         *p++ = next;
 304                         next = nextchar(stream);
 305                         continue;
 306                 }
 307                 break;
 308         }
 309         *p++ = 0;
 310         len = p - buffer;
 311         buf = __alloc_bytes(len);
 312         memcpy(buf, buffer, len);
 313
 314         token = stream->token;
 315         token_type(token) = TOKEN_NUMBER;
 316         token->number = buf;
 317         add_token(stream);
 318
 319         return next;
 320 }
 321
 322 static int escapechar(int first, int type, stream_t *stream, int *valp)
 323 {
 324         int next, value;
 325
 326         next = nextchar(stream);
 327         value = first;
 328
 329         if (first == '\n')
 330                 warn(stream->pos, "Newline in string or character constant");
 331
 332         if (first == '\\' && next != EOF) {
 333                 value = next;
 334                 next = nextchar(stream);
 335                 if (value != type) {
 336                         switch (value) {
 337                         case 'a':
 338                                 value = '\a';
 339                                 break;
 340                         case 'b':
 341                                 value = '\b';
 342                                 break;
 343                         case 't':
 344                                 value = '\t';
 345                                 break;
 346                         case 'n':
 347                                 value = '\n';
 348                                 break;
 349                         case 'v':
 350                                 value = '\v';
 351                                 break;
 352                         case 'f':
 353                                 value = '\f';
 354                                 break;
 355                         case 'r':
 356                                 value = '\r';
 357                                 break;
 358                         case 'e':
 359                                 value = '\e';
 360                                 break;
 361                         case '\\':
 362                                 break;
 363                         case '\'':
 364                                 break;
 365                         case '"':
 366                                 break;
 367                         case '\n':
 368                                 next = escapechar(next, type, stream, &value);
 369                                 break;
 370                         case '0'...'7': {
 371                                 int nr = 2;
 372                                 value -= '0';
 373                                 while (next >= '0' && next <= '9') {
 374                                         value = (value << 3) + (next-'0');
 375                                         next = nextchar(stream);
 376                                         if (!--nr)
 377                                                 break;
 378                                 }
 379                                 value &= 0xff;
 380                                 break;
 381                         }
 382                         case 'x': {
 383                                 int hex = hexval(next);
 384                                 if (hex < 16) {
 385                                         value = hex;
 386                                         next = nextchar(stream);
 387                                         while ((hex = hexval(next)) < 16) {
 388                                                 value = (value << 4) + hex;
 389                                                 next = nextchar(stream);
 390                                         }
 391                                         value &= 0xff;
 392                                         break;
 393                                 }
 394                         }
 395                         /* Fallthrough */
 396                         default:
 397                                 warn(stream->pos, "Unknown escape '%c'", value);
 398                         }
 399                 }
 400                 /* Mark it as escaped */
 401                 value |= 0x100;
 402         }
 403         *valp = value;
 404         return next;
 405 }
 406
 407 static int get_char_token(int next, stream_t *stream)
 408 {
 409         int value;
 410         struct token *token;
 411
 412         next = escapechar(next, '\'', stream, &value);
 413         if (value == '\'' || next != '\'') {
 414                 warn(stream->pos, "Bad character constant");
 415                 drop_token(stream);
 416                 return next;
 417         }
 418
 419         token = stream->token;
 420         token_type(token) = TOKEN_CHAR;
 421         token->character = value & 0xff;
 422
 423         add_token(stream);
 424         return nextchar(stream);
 425 }
 426
 427 static int get_string_token(int next, stream_t *stream)
 428 {
 429         static char buffer[512];
 430         struct string *string;
 431         struct token *token;
 432         int len = 0;
 433
 434         for (;;) {
 435                 int val;
 436                 next = escapechar(next, '"', stream, &val);
 437                 if (val == '"')
 438                         break;
 439                 if (next == EOF) {
 440                         warn(stream->pos, "Enf of file in middle of string");
 441                         return next;
 442                 }
 443                 if (len < sizeof(buffer)) {
 444                         buffer[len] = val;
 445                         len++;
 446                 }
 447
 448         }
 449
 450         if (len > 256)
 451                 warn(stream->pos, "String too long");
 452
 453         string = __alloc_string(len+1);
 454         memcpy(string->data, buffer, len);
 455         string->data[len] = '\0';
 456         string->length = len+1;
 457
 458         /* Pass it on.. */
 459         token = stream->token;
 460         token_type(token) = TOKEN_STRING;
 461         token->string = string;
 462         add_token(stream);
 463
 464         return next;
 465 }
 466
 467 static int drop_stream_eoln(stream_t *stream)
 468 {
 469         int next = nextchar(stream);
 470         drop_token(stream);
 471         for (;;) {
 472                 int curr = next;
 473                 if (curr == EOF)
 474                         return next;
 475                 next = nextchar(stream);
 476                 if (curr == '\n')
 477                         return next;
 478         }
 479 }
 480
 481 static int drop_stream_comment(stream_t *stream)
 482 {
 483         int newline;
 484         int next;
 485         drop_token(stream);
 486         newline = stream->pos.newline;
 487
 488         next = nextchar(stream);
 489         for (;;) {
 490                 int curr = next;
 491                 if (curr == EOF) {
 492                         warn(stream->pos, "End of file in the middle of a comment");
 493                         return curr;
 494                 }
 495                 next = nextchar(stream);
 496                 if (curr == '*' && next == '/')
 497                         break;
 498         }
 499         stream->pos.newline = newline;
 500         return nextchar(stream);
 501 }
 502
 503 unsigned char combinations[][3] = COMBINATION_STRINGS;
 504
 505 #define NR_COMBINATIONS (sizeof(combinations)/3)
 506
 507 static int get_one_special(int c, stream_t *stream)
 508 {
 509         struct token *token;
 510         unsigned char c1, c2, c3;
 511         int next, value, i;
 512         char *comb;
 513
 514         next = nextchar(stream);
 515
 516         /*
 517          * Check for numbers, strings, character constants, and comments
 518          */
 519         switch (c) {
 520         case '.':
 521                 if (next >= '0' && next <= '9')
 522                         return get_one_number(c, next, stream);
 523                 break;
 524         case '"':
 525                 return get_string_token(next, stream);
 526         case '\'':
 527                 return get_char_token(next, stream);
 528         case '/':
 529                 if (next == '/')
 530                         return drop_stream_eoln(stream);
 531                 if (next == '*')
 532                         return drop_stream_comment(stream);
 533         }
 534
 535         /*
 536          * Check for combinations
 537          */
 538         value = c;
 539         comb = combinations[0];
 540         c1 = c; c2 = next; c3 = 0;
 541         for (i = 0; i < NR_COMBINATIONS; i++) {
 542                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 543                         value = i + SPECIAL_BASE;
 544                         next = nextchar(stream);
 545                         if (c3)
 546                                 break;
 547                         c3 = next;
 548                 }
 549                 comb += 3;
 550         }
 551
 552         /* Pass it on.. */
 553         token = stream->token;
 554         token_type(token) = TOKEN_SPECIAL;
 555         token->special = value;
 556         add_token(stream);
 557         return next;
 558 }
 559
 560 #define IDENT_HASH_BITS (10)
 561 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 562 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 563
 564 #define ident_hash_init(c)              (c)
 565 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 566 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 567
 568 static struct ident *hash_table[IDENT_HASH_SIZE];
 569 int ident_hit, ident_miss;
 570
 571 void show_identifier_stats(void)
 572 {
 573         int i;
 574         int distribution[100];
 575
 576         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 577                 ident_hit, ident_miss);
 578
 579         for (i = 0; i < 100; i++)
 580                 distribution[i] = 0;
 581
 582         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 583                 struct ident * ident = hash_table[i];
 584                 int count = 0;
 585
 586                 while (ident) {
 587                         count++;
 588                         ident = ident->next;
 589                 }
 590                 if (count > 99)
 591                         count = 99;
 592                 distribution[count]++;
 593         }
 594
 595         for (i = 0; i < 100; i++) {
 596                 if (distribution[i])
 597                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 598         }
 599 }
 600
 601 static struct ident *alloc_ident(const char *name, int len)
 602 {
 603         struct ident *ident = __alloc_ident(len);
 604         ident->symbols = NULL;
 605         ident->len = len;
 606         ident->tainted = 0;
 607         memcpy(ident->name, name, len);
 608         return ident;
 609 }
 610
 611 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 612 {
 613         ident->next = hash_table[hash];
 614         hash_table[hash] = ident;
 615         ident_miss++;
 616         return ident;
 617 }
 618
 619 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 620 {
 621         struct ident *ident;
 622
 623         ident = hash_table[hash];
 624         while (ident) {
 625                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 626                         ident_hit++;
 627                         return ident;
 628                 }
 629                 ident = ident->next;
 630         }
 631
 632         return insert_hash(alloc_ident(name, len), hash);
 633 }
 634
 635 static unsigned long hash_name(const char *name, int len)
 636 {
 637         unsigned long hash;
 638         const unsigned char *p = (const unsigned char *)name;
 639
 640         hash = ident_hash_init(*p++);
 641         while (--len) {
 642                 unsigned int i = *p++;
 643                 hash = ident_hash_add(hash, i);
 644         }
 645         return ident_hash_end(hash);
 646 }
 647
 648 struct ident *hash_ident(struct ident *ident)
 649 {
 650         return insert_hash(ident, hash_name(ident->name, ident->len));
 651 }
 652
 653 struct ident *built_in_ident(const char *name)
 654 {
 655         int len = strlen(name);
 656         return create_hashed_ident(name, len, hash_name(name, len));
 657 }
 658
 659 struct token *built_in_token(int stream, const char *name)
 660 {
 661         struct token *token;
 662
 663         token = __alloc_token(0);
 664         token->pos.stream = stream;
 665         token_type(token) = TOKEN_IDENT;
 666         token->ident = built_in_ident(name);
 667         return token;
 668 }
 669
 670 static int get_one_identifier(int c, stream_t *stream)
 671 {
 672         struct token *token;
 673         struct ident *ident;
 674         unsigned long hash;
 675         char buf[256];
 676         int len = 1;
 677         int next;
 678
 679         hash = ident_hash_init(c);
 680         buf[0] = c;
 681         for (;;) {
 682                 next = nextchar(stream);
 683                 switch (next) {
 684                 case '0'...'9':
 685                 case 'a'...'z':
 686                 case 'A'...'Z':
 687                 case '_':
 688                         if (len < sizeof(buf)) {
 689                                 hash = ident_hash_add(hash, next);
 690                                 buf[len] = next;
 691                                 len++;
 692                         }
 693                         continue;
 694                 }
 695                 break;
 696         };
 697         hash = ident_hash_end(hash);
 698
 699         ident = create_hashed_ident(buf, len, hash);
 700
 701         /* Pass it on.. */
 702         token = stream->token;
 703         token_type(token) = TOKEN_IDENT;
 704         token->ident = ident;
 705         add_token(stream);
 706         return next;
 707 }
 708
 709 static int get_one_token(int c, stream_t *stream)
 710 {
 711         switch (c) {
 712         case '0'...'9':
 713                 return get_one_number(c, nextchar(stream), stream);
 714         case 'a'...'z':
 715         case 'A'...'Z':
 716         case '_':
 717                 return get_one_identifier(c, stream);
 718         default:
 719                 return get_one_special(c, stream);
 720         }
 721 }
 722
 723 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 724         unsigned char *buf, unsigned int buf_size)
 725 {
 726         struct token *begin;
 727
 728         stream->pos.stream = idx;
 729         stream->pos.line = 1;
 730         stream->pos.newline = 1;
 731         stream->pos.whitespace = 0;
 732         stream->pos.pos = 0;
 733         stream->pos.noexpand = 0;
 734
 735         stream->token = NULL;
 736         stream->fd = fd;
 737         stream->offset = 0;
 738         stream->size = buf_size;
 739         stream->buffer = buf;
 740
 741         begin = alloc_token(stream);
 742         token_type(begin) = TOKEN_STREAMBEGIN;
 743         stream->tokenlist = &begin->next;
 744         return begin;
 745 }
 746
 747 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 748 {
 749         int c = nextchar(stream);
 750         while (c != EOF) {
 751                 if (c == '\\') {
 752                         c = nextchar(stream);
 753                         stream->pos.newline = 0;
 754                         stream->pos.whitespace = 1;
 755                         continue;
 756                 }
 757                 if (!isspace(c)) {
 758                         struct token *token = alloc_token(stream);
 759                         stream->token = token;
 760                         stream->pos.newline = 0;
 761                         stream->pos.whitespace = 0;
 762                         c = get_one_token(c, stream);
 763                         continue;
 764                 }
 765                 stream->pos.whitespace = 1;
 766                 c = nextchar(stream);
 767         }
 768         mark_eof(stream, endtoken);
 769 }
 770
 771 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
 772 {
 773         stream_t stream;
 774         struct token *begin;
 775
 776         begin = setup_stream(&stream, 0, -1, buffer, size);
 777         tokenize_stream(&stream, endtoken);
 778         return begin;
 779 }
 780
 781 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 782 {
 783         struct token *begin;
 784         stream_t stream;
 785         unsigned char buffer[BUFSIZE];
 786         int idx;
 787
 788         idx = init_stream(name, fd);
 789         if (idx < 0)
 790                 return endtoken;
 791
 792         begin = setup_stream(&stream, idx, fd, buffer, 0);
 793         tokenize_stream(&stream, endtoken);
 794         return begin;
 795 }