tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *
   7  *  Licensed under the Open Software License version 1.1
   8  */
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <stdarg.h>
  12 #include <stddef.h>
  13 #include <string.h>
  14 #include <ctype.h>
  15 #include <unistd.h>
  16 #include <sys/stat.h>
  17
  18 #include "lib.h"
  19 #include "token.h"
  20 #include "symbol.h"
  21
  22 #define EOF (-1)
  23
  24 int input_stream_nr = 0;
  25 struct stream *input_streams;
  26 static int input_streams_allocated;
  27
  28 #define BUFSIZE (8192)
  29
  30 typedef struct {
  31         int fd, offset, size;
  32         struct position pos;
  33         struct token **tokenlist;
  34         struct token *token;
  35         unsigned char *buffer;
  36 } stream_t;
  37
  38
  39 const char *show_special(int val)
  40 {
  41         static const char *combinations[] = COMBINATION_STRINGS;
  42         static char buffer[4];
  43
  44         buffer[0] = val;
  45         buffer[1] = 0;
  46         if (val >= SPECIAL_BASE)
  47                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  48         return buffer;
  49 }
  50
  51 const char *show_ident(const struct ident *ident)
  52 {
  53         static char buffer[256];
  54         if (!ident)
  55                 return "<noident>";
  56         sprintf(buffer, "%.*s", ident->len, ident->name);
  57         return buffer;
  58 }
  59
  60 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  61 {
  62         if (isprint(c)) {
  63                 if (c == escape || c == '\\')
  64                         *ptr++ = '\\';
  65                 *ptr++ = c;
  66                 return ptr;
  67         }
  68         *ptr++ = '\\';
  69         switch (c) {
  70         case '\n':
  71                 *ptr++ = 'n';
  72                 return ptr;
  73         case '\t':
  74                 *ptr++ = 't';
  75                 return ptr;
  76         }
  77         if (!isdigit(next))
  78                 return ptr + sprintf(ptr, "%o", c);
  79
  80         return ptr + sprintf(ptr, "%03o", c);
  81 }
  82
  83 const char *show_string(const struct string *string)
  84 {
  85         static char buffer[256];
  86         char *ptr;
  87         int i;
  88
  89         ptr = buffer;
  90         *ptr++ = '"';
  91         for (i = 0; i < string->length-1; i++) {
  92                 const unsigned char *p = string->data + i;
  93                 ptr = charstr(ptr, p[0], '"', p[1]);
  94         }
  95         *ptr++ = '"';
  96         *ptr = '\0';
  97         return buffer;
  98 }
  99
 100 const char *show_token(const struct token *token)
 101 {
 102         static char buffer[256];
 103
 104         if (!token)
 105                 return "<no token>";
 106         switch (token_type(token)) {
 107         case TOKEN_ERROR:
 108                 return "syntax error";
 109
 110         case TOKEN_EOF:
 111                 return "end-of-input";
 112
 113         case TOKEN_IDENT:
 114                 return show_ident(token->ident);
 115
 116         case TOKEN_STRING:
 117                 return show_string(token->string);
 118
 119         case TOKEN_INTEGER: {
 120                 const char *p = token->integer;
 121                 switch (*p) {
 122                 case 'o':       // octal
 123                 case 'x':       // hex
 124                         buffer[0] = '0';
 125                         strcpy(buffer+1, p+1);
 126                         return buffer;
 127                 default:
 128                         return p;
 129                 }
 130         }
 131
 132         case TOKEN_FP:
 133                 return token->fp;
 134
 135         case TOKEN_SPECIAL:
 136                 return show_special(token->special);
 137
 138         case TOKEN_CHAR: {
 139                 char *ptr = buffer;
 140                 int c = token->character;
 141                 *ptr++ = '\'';
 142                 ptr = charstr(ptr, c, '\'', 0);
 143                 *ptr++ = '\'';
 144                 *ptr++ = '\0';
 145                 return buffer;
 146         }
 147
 148         case TOKEN_STREAMBEGIN:
 149                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
 150                 return buffer;
 151
 152         case TOKEN_STREAMEND:
 153                 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
 154                 return buffer;
 155
 156         default:
 157                 return "WTF???";
 158         }
 159 }
 160
 161 int init_stream(const char *name, int fd)
 162 {
 163         int stream = input_stream_nr;
 164         struct stream *current;
 165
 166         if (stream >= input_streams_allocated) {
 167                 int newalloc = stream * 4 / 3 + 10;
 168                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 169                 if (!input_streams)
 170                         die("Unable to allocate more streams space");
 171                 input_streams_allocated = newalloc;
 172         }
 173         current = input_streams + stream;
 174         memset(current, 0, sizeof(*current));
 175         current->name = name;
 176         current->fd = fd;
 177         current->constant = -1; // "unknown"
 178         if (fd > 0) {
 179                 int i;
 180                 struct stat st;
 181
 182                 fstat(fd, &st);
 183                 current->dev = st.st_dev;
 184                 current->ino = st.st_ino;
 185                 for (i = 0; i < stream; i++) {
 186                         struct stream *s = input_streams + i;
 187                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 188                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 189                                         return -1;
 190                         }
 191                 }
 192         }
 193         input_stream_nr = stream+1;
 194         return stream;
 195 }
 196
 197 static struct token * alloc_token(stream_t *stream)
 198 {
 199         struct token *token = __alloc_token(0);
 200         token->pos = stream->pos;
 201         return token;
 202 }
 203
 204 static int nextchar(stream_t *stream)
 205 {
 206         int offset = stream->offset;
 207         int size = stream->size;
 208         int c;
 209
 210         if (offset >= size) {
 211                 size = read(stream->fd, stream->buffer, BUFSIZE);
 212                 if (size <= 0)
 213                         return EOF;
 214                 stream->size = size;
 215                 stream->offset = 0;
 216                 offset = 0;
 217         }
 218         c = stream->buffer[offset];
 219         stream->offset = offset + 1;
 220         stream->pos.pos++;
 221         if (c == '\n') {
 222                 stream->pos.line++;
 223                 stream->pos.newline = 1;
 224                 stream->pos.pos = 0;
 225         }
 226         return c;
 227 }
 228
 229 struct token eof_token_entry;
 230
 231 static void mark_eof(stream_t *stream, struct token *end_token)
 232 {
 233         struct token *end;
 234
 235         end = alloc_token(stream);
 236         token_type(end) = TOKEN_STREAMEND;
 237         end->pos.newline = 1;
 238
 239         eof_token_entry.next = &eof_token_entry;
 240         eof_token_entry.pos.newline = 1;
 241
 242         if (!end_token)
 243                 end_token =  &eof_token_entry;
 244         end->next = end_token;
 245         *stream->tokenlist = end;
 246         stream->tokenlist = NULL;
 247 }
 248
 249 static void add_token(stream_t *stream)
 250 {
 251         struct token *token = stream->token;
 252
 253         stream->token = NULL;
 254         token->next = NULL;
 255         *stream->tokenlist = token;
 256         stream->tokenlist = &token->next;
 257 }
 258
 259 static void drop_token(stream_t *stream)
 260 {
 261         stream->pos.newline |= stream->token->pos.newline;
 262         stream->pos.whitespace |= stream->token->pos.whitespace;
 263         stream->token = NULL;
 264 }
 265
 266 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
 267 {
 268         char *buf = *p;
 269
 270         *buf++ = next;
 271         for (;;) {
 272                 unsigned int n;
 273                 next = nextchar(stream);
 274                 n = hexval(next);
 275                 if (n >= base)
 276                         break;
 277                 *buf++ = next;
 278         }
 279         *p = buf;
 280         return next;
 281 }
 282
 283 static int do_fp(char *buffer, int len, int next, stream_t *stream)
 284 {
 285         struct token *token = stream->token;
 286         void *buf;
 287
 288         /* Get the decimal part */
 289         if (next == '.') {
 290                 buffer[len++] = next;
 291                 next = nextchar(stream);
 292                 while (next >= '0' && next <= '9') {
 293                         buffer[len++] = next;
 294                         next = nextchar(stream);
 295                 }
 296         }
 297
 298         /* Get the exponential part */
 299         if (next == 'e' || next == 'E') {
 300                 buffer[len++] = next;
 301                 next = nextchar(stream);
 302                 while (next >= '0' && next <= '9') {
 303                         buffer[len++] = next;
 304                         next = nextchar(stream);
 305                 }
 306         }
 307
 308         /* Get the 'lf' type specifiers */
 309         while (next == 'f' || next == 'F' || next == 'l' || next == 'L') {
 310                 buffer[len++] = next;
 311                 next = nextchar(stream);
 312         }
 313
 314         buffer[len++] = '\0';
 315         buf = __alloc_bytes(len);
 316         memcpy(buf, buffer, len);
 317         token_type(token) = TOKEN_FP;
 318         token->fp = buf;
 319         add_token(stream);
 320         return next;
 321 }
 322
 323 static int do_integer(char *buffer, int len, int next, stream_t *stream)
 324 {
 325         struct token *token = stream->token;
 326         void *buf;
 327
 328         if (next == '.' || next == 'e' || next == 'E')
 329                 return do_fp(buffer, len, next, stream);
 330
 331         while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
 332                 buffer[len++] = next;
 333                 next = nextchar(stream);
 334         }
 335         buffer[len++] = '\0';
 336         buf = __alloc_bytes(len);
 337         memcpy(buf, buffer, len);
 338         token_type(token) = TOKEN_INTEGER;
 339         token->integer = buf;
 340         add_token(stream);
 341         return next;
 342 }
 343
 344 static int get_one_number(int c, stream_t *stream)
 345 {
 346         static char buffer[256];
 347         int next = nextchar(stream);
 348         char *p = buffer;
 349
 350         *p++ = c;
 351         switch (next) {
 352         case '0'...'7':
 353                 if (c == '0') {
 354                         buffer[0] = 'o';
 355                         next = get_base_number(8, &p, next, stream);
 356                         break;
 357                 }
 358                 /* fallthrough */
 359         case '8'...'9':
 360                 next = get_base_number(10, &p, next, stream);
 361                 break;
 362         case 'x': case 'X':
 363                 if (c == '0') {
 364                         buffer[0] = 'x';
 365                         next = get_base_number(16, &p, next, stream);
 366                 }
 367         }
 368         return do_integer(buffer, p - buffer, next, stream);
 369 }
 370
 371 static int escapechar(int first, int type, stream_t *stream, int *valp)
 372 {
 373         int next, value;
 374
 375         next = nextchar(stream);
 376         value = first;
 377
 378         if (first == '\n')
 379                 warn(stream->pos, "Newline in string or character constant");
 380
 381         if (first == '\\' && next != EOF) {
 382                 value = next;
 383                 next = nextchar(stream);
 384                 if (value != type) {
 385                         switch (value) {
 386                         case 'a':
 387                                 value = '\a';
 388                                 break;
 389                         case 'b':
 390                                 value = '\b';
 391                                 break;
 392                         case 't':
 393                                 value = '\t';
 394                                 break;
 395                         case 'n':
 396                                 value = '\n';
 397                                 break;
 398                         case 'v':
 399                                 value = '\v';
 400                                 break;
 401                         case 'f':
 402                                 value = '\f';
 403                                 break;
 404                         case 'r':
 405                                 value = '\r';
 406                                 break;
 407                         case 'e':
 408                                 value = '\e';
 409                                 break;
 410                         case '\\':
 411                                 break;
 412                         case '\'':
 413                                 break;
 414                         case '"':
 415                                 break;
 416                         case '\n':
 417                                 next = escapechar(next, type, stream, &value);
 418                                 break;
 419                         case '0'...'7': {
 420                                 int nr = 2;
 421                                 value -= '0';
 422                                 while (next >= '0' && next <= '9') {
 423                                         value = (value << 3) + (next-'0');
 424                                         next = nextchar(stream);
 425                                         if (!--nr)
 426                                                 break;
 427                                 }
 428                                 value &= 0xff;
 429                                 break;
 430                         }
 431                         case 'x': {
 432                                 int hex = hexval(next);
 433                                 if (hex < 16) {
 434                                         value = hex;
 435                                         next = nextchar(stream);
 436                                         while ((hex = hexval(next)) < 16) {
 437                                                 value = (value << 4) + hex;
 438                                                 next = nextchar(stream);
 439                                         }
 440                                         value &= 0xff;
 441                                         break;
 442                                 }
 443                         }
 444                         /* Fallthrough */
 445                         default:
 446                                 warn(stream->pos, "Unknown escape '%c'", value);
 447                         }
 448                 }
 449                 /* Mark it as escaped */
 450                 value |= 0x100;
 451         }
 452         *valp = value;
 453         return next;
 454 }
 455
 456 static int get_char_token(int next, stream_t *stream)
 457 {
 458         int value;
 459         struct token *token;
 460
 461         next = escapechar(next, '\'', stream, &value);
 462         if (value == '\'' || next != '\'') {
 463                 warn(stream->pos, "Bad character constant");
 464                 drop_token(stream);
 465                 return next;
 466         }
 467
 468         token = stream->token;
 469         token_type(token) = TOKEN_CHAR;
 470         token->character = value & 0xff;
 471
 472         add_token(stream);
 473         return nextchar(stream);
 474 }
 475
 476 static int get_string_token(int next, stream_t *stream)
 477 {
 478         static char buffer[512];
 479         struct string *string;
 480         struct token *token;
 481         int len = 0;
 482
 483         for (;;) {
 484                 int val;
 485                 next = escapechar(next, '"', stream, &val);
 486                 if (val == '"')
 487                         break;
 488                 if (next == EOF) {
 489                         warn(stream->pos, "Enf of file in middle of string");
 490                         return next;
 491                 }
 492                 if (len < sizeof(buffer)) {
 493                         buffer[len] = val;
 494                         len++;
 495                 }
 496
 497         }
 498
 499         if (len > 256)
 500                 warn(stream->pos, "String too long");
 501
 502         string = __alloc_string(len+1);
 503         memcpy(string->data, buffer, len);
 504         string->data[len] = '\0';
 505         string->length = len+1;
 506
 507         /* Pass it on.. */
 508         token = stream->token;
 509         token_type(token) = TOKEN_STRING;
 510         token->string = string;
 511         add_token(stream);
 512
 513         return next;
 514 }
 515
 516 static int drop_stream_eoln(stream_t *stream)
 517 {
 518         int next = nextchar(stream);
 519         drop_token(stream);
 520         for (;;) {
 521                 int curr = next;
 522                 if (curr == EOF)
 523                         return next;
 524                 next = nextchar(stream);
 525                 if (curr == '\n')
 526                         return next;
 527         }
 528 }
 529
 530 static int drop_stream_comment(stream_t *stream)
 531 {
 532         int next = nextchar(stream);
 533         drop_token(stream);
 534         for (;;) {
 535                 int curr = next;
 536                 if (curr == EOF) {
 537                         warn(stream->pos, "End of file in the middle of a comment");
 538                         return curr;
 539                 }
 540                 next = nextchar(stream);
 541                 if (curr == '*' && next == '/')
 542                         break;
 543         }
 544         return nextchar(stream);
 545 }
 546
 547 unsigned char combinations[][3] = COMBINATION_STRINGS;
 548
 549 #define NR_COMBINATIONS (sizeof(combinations)/3)
 550
 551 static int get_one_special(int c, stream_t *stream)
 552 {
 553         struct token *token;
 554         unsigned char c1, c2, c3;
 555         int next, value, i;
 556         char *comb;
 557
 558         next = nextchar(stream);
 559
 560         /*
 561          * Check for strings, character constants, and comments
 562          */
 563         switch (c) {
 564         case '"':
 565                 return get_string_token(next, stream);
 566         case '\'':
 567                 return get_char_token(next, stream);
 568         case '/':
 569                 if (next == '/')
 570                         return drop_stream_eoln(stream);
 571                 if (next == '*')
 572                         return drop_stream_comment(stream);
 573         }
 574
 575         /*
 576          * Check for combinations
 577          */
 578         value = c;
 579         comb = combinations[0];
 580         c1 = c; c2 = next; c3 = 0;
 581         for (i = 0; i < NR_COMBINATIONS; i++) {
 582                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 583                         value = i + SPECIAL_BASE;
 584                         next = nextchar(stream);
 585                         if (c3)
 586                                 break;
 587                         c3 = next;
 588                 }
 589                 comb += 3;
 590         }
 591
 592         /* Pass it on.. */
 593         token = stream->token;
 594         token_type(token) = TOKEN_SPECIAL;
 595         token->special = value;
 596         add_token(stream);
 597         return next;
 598 }
 599
 600 #define IDENT_HASH_BITS (10)
 601 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 602 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 603
 604 #define ident_hash_init(c)              (c)
 605 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 606 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 607
 608 static struct ident *hash_table[IDENT_HASH_SIZE];
 609 int ident_hit, ident_miss;
 610
 611 void show_identifier_stats(void)
 612 {
 613         int i;
 614         int distribution[100];
 615
 616         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 617                 ident_hit, ident_miss);
 618
 619         for (i = 0; i < 100; i++)
 620                 distribution[i] = 0;
 621
 622         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 623                 struct ident * ident = hash_table[i];
 624                 int count = 0;
 625
 626                 while (ident) {
 627                         count++;
 628                         ident = ident->next;
 629                 }
 630                 if (count > 99)
 631                         count = 99;
 632                 distribution[count]++;
 633         }
 634
 635         for (i = 0; i < 100; i++) {
 636                 if (distribution[i])
 637                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 638         }
 639 }
 640
 641 static struct ident *alloc_ident(const char *name, int len)
 642 {
 643         struct ident *ident = __alloc_ident(len);
 644         ident->symbols = NULL;
 645         ident->len = len;
 646         memcpy(ident->name, name, len);
 647         return ident;
 648 }
 649
 650 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 651 {
 652         ident->next = hash_table[hash];
 653         hash_table[hash] = ident;
 654         ident_miss++;
 655         return ident;
 656 }
 657
 658 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 659 {
 660         struct ident *ident;
 661
 662         ident = hash_table[hash];
 663         while (ident) {
 664                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 665                         ident_hit++;
 666                         return ident;
 667                 }
 668                 ident = ident->next;
 669         }
 670
 671         return insert_hash(alloc_ident(name, len), hash);
 672 }
 673
 674 static unsigned long hash_name(const char *name, int len)
 675 {
 676         unsigned long hash;
 677         const unsigned char *p = (const unsigned char *)name;
 678
 679         hash = ident_hash_init(*p++);
 680         while (--len) {
 681                 unsigned int i = *p++;
 682                 hash = ident_hash_add(hash, i);
 683         }
 684         return ident_hash_end(hash);
 685 }
 686
 687 struct ident *hash_ident(struct ident *ident)
 688 {
 689         return insert_hash(ident, hash_name(ident->name, ident->len));
 690 }
 691
 692 struct ident *built_in_ident(const char *name)
 693 {
 694         int len = strlen(name);
 695         return create_hashed_ident(name, len, hash_name(name, len));
 696 }
 697
 698 struct token *built_in_token(int stream, const char *name)
 699 {
 700         struct token *token;
 701
 702         token = __alloc_token(0);
 703         token->pos.stream = stream;
 704         token_type(token) = TOKEN_IDENT;
 705         token->ident = built_in_ident(name);
 706         return token;
 707 }
 708
 709 static int get_one_identifier(int c, stream_t *stream)
 710 {
 711         struct token *token;
 712         struct ident *ident;
 713         unsigned long hash;
 714         char buf[256];
 715         int len = 1;
 716         int next;
 717
 718         hash = ident_hash_init(c);
 719         buf[0] = c;
 720         for (;;) {
 721                 next = nextchar(stream);
 722                 switch (next) {
 723                 case '0'...'9':
 724                 case 'a'...'z':
 725                 case 'A'...'Z':
 726                 case '_':
 727                         if (len < sizeof(buf)) {
 728                                 hash = ident_hash_add(hash, next);
 729                                 buf[len] = next;
 730                                 len++;
 731                         }
 732                         continue;
 733                 }
 734                 break;
 735         };
 736         hash = ident_hash_end(hash);
 737
 738         ident = create_hashed_ident(buf, len, hash);
 739
 740         /* Pass it on.. */
 741         token = stream->token;
 742         token_type(token) = TOKEN_IDENT;
 743         token->ident = ident;
 744         add_token(stream);
 745         return next;
 746 }
 747
 748 static int get_one_token(int c, stream_t *stream)
 749 {
 750         switch (c) {
 751         case '0'...'9':
 752                 return get_one_number(c, stream);
 753         case 'a'...'z':
 754         case 'A'...'Z':
 755         case '_':
 756                 return get_one_identifier(c, stream);
 757         default:
 758                 return get_one_special(c, stream);
 759         }
 760 }
 761
 762 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 763         unsigned char *buf, unsigned int buf_size)
 764 {
 765         struct token *begin;
 766
 767         stream->pos.stream = idx;
 768         stream->pos.line = 1;
 769         stream->pos.newline = 1;
 770         stream->pos.whitespace = 0;
 771         stream->pos.pos = 0;
 772
 773         stream->token = NULL;
 774         stream->fd = fd;
 775         stream->offset = 0;
 776         stream->size = buf_size;
 777         stream->buffer = buf;
 778
 779         begin = alloc_token(stream);
 780         token_type(begin) = TOKEN_STREAMBEGIN;
 781         stream->tokenlist = &begin->next;
 782         return begin;
 783 }
 784
 785 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 786 {
 787         int c = nextchar(stream);
 788         while (c != EOF) {
 789                 if (c == '\\') {
 790                         c = nextchar(stream);
 791                         stream->pos.newline = 0;
 792                         stream->pos.whitespace = 1;
 793                         continue;
 794                 }
 795                 if (!isspace(c)) {
 796                         struct token *token = alloc_token(stream);
 797                         stream->token = token;
 798                         stream->pos.newline = 0;
 799                         stream->pos.whitespace = 0;
 800                         c = get_one_token(c, stream);
 801                         continue;
 802                 }
 803                 stream->pos.whitespace = 1;
 804                 c = nextchar(stream);
 805         }
 806         mark_eof(stream, endtoken);
 807 }
 808
 809 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
 810 {
 811         stream_t stream;
 812         struct token *begin;
 813
 814         begin = setup_stream(&stream, 0, -1, buffer, size);
 815         tokenize_stream(&stream, endtoken);
 816         return begin;
 817 }
 818
 819 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 820 {
 821         struct token *begin;
 822         stream_t stream;
 823         unsigned char buffer[BUFSIZE];
 824         int idx;
 825
 826         idx = init_stream(name, fd);
 827         if (idx < 0)
 828                 return endtoken;
 829
 830         begin = setup_stream(&stream, idx, fd, buffer, 0);
 831         tokenize_stream(&stream, endtoken);
 832         return begin;
 833 }