tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Linus Torvalds, all rights reserved.
   6  */
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <stdarg.h>
  10 #include <stddef.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13 #include <unistd.h>
  14 #include <sys/stat.h>
  15
  16 #include "lib.h"
  17 #include "token.h"
  18 #include "symbol.h"
  19
  20 #define EOF (-1)
  21
  22 int input_stream_nr = 0;
  23 struct stream *input_streams;
  24 static int input_streams_allocated;
  25
  26 #define BUFSIZE (8192)
  27
  28 typedef struct {
  29         int fd, stream, line, pos, offset, size;
  30         unsigned int newline:1, whitespace:1;
  31         struct token **tokenlist;
  32         struct token *token;
  33         unsigned char *buffer;
  34 } stream_t;
  35
  36
  37 const char *show_special(int val)
  38 {
  39         static const char *combinations[] = COMBINATION_STRINGS;
  40         static char buffer[4];
  41
  42         buffer[0] = val;
  43         buffer[1] = 0;
  44         if (val >= SPECIAL_BASE)
  45                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  46         return buffer;
  47 }
  48
  49 const char *show_ident(const struct ident *ident)
  50 {
  51         static char buffer[256];
  52         sprintf(buffer, "%.*s", ident->len, ident->name);
  53         return buffer;
  54 }
  55
  56 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  57 {
  58         if (isprint(c)) {
  59                 if (c == escape || c == '\\')
  60                         *ptr++ = '\\';
  61                 *ptr++ = c;
  62                 return ptr;
  63         }
  64         *ptr++ = '\\';
  65         switch (c) {
  66         case '\n':
  67                 *ptr++ = 'n';
  68                 return ptr;
  69         case '\t':
  70                 *ptr++ = 't';
  71                 return ptr;
  72         }
  73         if (!isdigit(next))
  74                 return ptr + sprintf(ptr, "%o", c);
  75
  76         return ptr + sprintf(ptr, "%03o", c);
  77 }
  78
  79 const char *show_token(const struct token *token)
  80 {
  81         static char buffer[256];
  82
  83         if (!token)
  84                 return "<no token>";
  85         switch (token->type) {
  86         case TOKEN_ERROR:
  87                 return "syntax error";
  88
  89         case TOKEN_EOF:
  90                 return "end-of-input";
  91
  92         case TOKEN_IDENT:
  93                 return show_ident(token->ident);
  94
  95         case TOKEN_STRING: {
  96                 char *ptr;
  97                 int i;
  98                 struct string *string = token->string;
  99
 100                 ptr = buffer;
 101                 *ptr++ = '"';
 102                 for (i = 0; i < string->length-1; i++) {
 103                         unsigned char *p = string->data + i;
 104                         ptr = charstr(ptr, p[0], '"', p[1]);
 105                 }
 106                 *ptr++ = '"';
 107                 *ptr = '\0';
 108                 return buffer;
 109         }
 110
 111         case TOKEN_INTEGER: {
 112                 const char *p = token->integer;
 113                 switch (*p) {
 114                 case 'o':       // octal
 115                 case 'x':       // hex
 116                         buffer[0] = '0';
 117                         strcpy(buffer+1, p+1);
 118                         return buffer;
 119                 default:
 120                         return p;
 121                 }
 122         }
 123
 124         case TOKEN_FP:
 125                 return token->fp;
 126
 127         case TOKEN_SPECIAL:
 128                 return show_special(token->special);
 129
 130         case TOKEN_CHAR: {
 131                 char *ptr = buffer;
 132                 int c = token->character;
 133                 *ptr++ = '\'';
 134                 ptr = charstr(ptr, c, '\'', 0);
 135                 *ptr++ = '\'';
 136                 *ptr++ = '\0';
 137                 return buffer;
 138         }
 139
 140         case TOKEN_STREAMBEGIN:
 141                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
 142                 return buffer;
 143
 144         case TOKEN_STREAMEND:
 145                 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
 146                 return buffer;
 147
 148         default:
 149                 return "WTF???";
 150         }
 151 }
 152
 153 int init_stream(const char *name, int fd)
 154 {
 155         int stream = input_stream_nr;
 156         struct stream *current;
 157
 158         if (stream >= input_streams_allocated) {
 159                 int newalloc = stream * 4 / 3 + 10;
 160                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 161                 if (!input_streams)
 162                         die("Unable to allocate more streams space");
 163                 input_streams_allocated = newalloc;
 164         }
 165         current = input_streams + stream;
 166         memset(current, 0, sizeof(*current));
 167         current->name = name;
 168         current->fd = fd;
 169         current->constant = -1; // "unknown"
 170         if (fd > 0) {
 171                 int i;
 172                 struct stat st;
 173
 174                 fstat(fd, &st);
 175                 current->dev = st.st_dev;
 176                 current->ino = st.st_ino;
 177                 for (i = 0; i < stream; i++) {
 178                         struct stream *s = input_streams + i;
 179                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 180                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 181                                         return -1;
 182                         }
 183                 }
 184         }
 185         input_stream_nr = stream+1;
 186         return stream;
 187 }
 188
 189 static struct token * alloc_token(stream_t *stream)
 190 {
 191         struct token *token = __alloc_token(0);
 192         token->line = stream->line;
 193         token->pos = stream->pos;
 194         token->stream = stream->stream;
 195         token->newline = stream->newline;
 196         token->whitespace = stream->whitespace;
 197         return token;
 198 }
 199
 200 static int nextchar(stream_t *stream)
 201 {
 202         int offset = stream->offset;
 203         int size = stream->size;
 204         int c;
 205
 206         if (offset >= size) {
 207                 size = read(stream->fd, stream->buffer, BUFSIZE);
 208                 if (size <= 0)
 209                         return EOF;
 210                 stream->size = size;
 211                 stream->offset = 0;
 212                 offset = 0;
 213         }
 214         c = stream->buffer[offset];
 215         stream->offset = offset + 1;
 216         stream->pos++;
 217         if (c == '\n') {
 218                 stream->line++;
 219                 stream->newline = 1;
 220                 stream->pos = 0;
 221         }
 222         return c;
 223 }
 224
 225 struct token eof_token_entry;
 226
 227 static void mark_eof(stream_t *stream, struct token *end_token)
 228 {
 229         struct token *end;
 230
 231         end = alloc_token(stream);
 232         end->type = TOKEN_STREAMEND;
 233         end->newline = 1;
 234
 235         eof_token_entry.next = &eof_token_entry;
 236         eof_token_entry.newline = 1;
 237
 238         if (!end_token)
 239                 end_token =  &eof_token_entry;
 240         end->next = end_token;
 241         *stream->tokenlist = end;
 242         stream->tokenlist = NULL;
 243 }
 244
 245 static void add_token(stream_t *stream)
 246 {
 247         struct token *token = stream->token;
 248
 249         stream->token = NULL;
 250         token->next = NULL;
 251         *stream->tokenlist = token;
 252         stream->tokenlist = &token->next;
 253 }
 254
 255 static void drop_token(stream_t *stream)
 256 {
 257         stream->newline |= stream->token->newline;
 258         stream->whitespace |= stream->token->whitespace;
 259         stream->token = NULL;
 260 }
 261
 262 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
 263 {
 264         char *buf = *p;
 265
 266         *buf++ = next;
 267         for (;;) {
 268                 unsigned int n;
 269                 next = nextchar(stream);
 270                 n = hexval(next);
 271                 if (n >= base)
 272                         break;
 273                 *buf++ = next;
 274         }
 275         *p = buf;
 276         return next;
 277 }
 278
 279 static int do_integer(char *buffer, int len, int next, stream_t *stream)
 280 {
 281         struct token *token = stream->token;
 282         void *buf;
 283
 284         while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
 285                 buffer[len++] = next;
 286                 next = nextchar(stream);
 287         }
 288         buffer[len++] = '\0';
 289         buf = __alloc_bytes(len);
 290         memcpy(buf, buffer, len);
 291         token->type = TOKEN_INTEGER;
 292         token->integer = buf;
 293         add_token(stream);
 294         return next;
 295 }
 296
 297 static int get_one_number(int c, stream_t *stream)
 298 {
 299         static char buffer[256];
 300         int next = nextchar(stream);
 301         char *p = buffer;
 302
 303         *p++ = c;
 304         switch (next) {
 305         case '0'...'7':
 306                 if (c == '0') {
 307                         buffer[0] = 'o';
 308                         next = get_base_number(8, &p, next, stream);
 309                         break;
 310                 }
 311                 /* fallthrough */
 312         case '8'...'9':
 313                 next = get_base_number(10, &p, next, stream);
 314                 break;
 315         case 'x': case 'X':
 316                 if (c == '0') {
 317                         buffer[0] = 'x';
 318                         next = get_base_number(16, &p, next, stream);
 319                 }
 320         }
 321         return do_integer(buffer, p - buffer, next, stream);
 322 }
 323
 324 static int escapechar(int first, int type, stream_t *stream, int *valp)
 325 {
 326         int next, value;
 327
 328         next = nextchar(stream);
 329         value = first;
 330
 331         if (first == '\n')
 332                 warn(stream->token, "Newline in string or character constant");
 333
 334         if (first == '\\' && next != EOF) {
 335                 value = next;
 336                 next = nextchar(stream);
 337                 if (value != type) {
 338                         switch (value) {
 339                         case 'n':
 340                                 value = '\n';
 341                                 break;
 342                         case 't':
 343                                 value = '\t';
 344                                 break;
 345                         case '\\':
 346                                 break;
 347                         case '\'':
 348                                 break;
 349                         case '"':
 350                                 break;
 351                         case '0'...'7': {
 352                                 int nr = 2;
 353                                 value -= '0';
 354                                 while (next >= '0' && next <= '9') {
 355                                         value = (value << 3) + (next-'0');
 356                                         next = nextchar(stream);
 357                                         if (!--nr)
 358                                                 break;
 359                                 }
 360                                 value &= 0xff;
 361                                 break;
 362                         }
 363                         case 'x': {
 364                                 int hex = hexval(next);
 365                                 if (hex < 16) {
 366                                         value = hex;
 367                                         next = nextchar(stream);
 368                                         while ((hex = hexval(next)) < 16) {
 369                                                 value = (value << 4) + hex;
 370                                                 next = nextchar(stream);
 371                                         }
 372                                         value &= 0xff;
 373                                         break;
 374                                 }
 375                         }
 376                         /* Fallthrough */
 377                         default:
 378                                 warn(stream->token, "Unknown escape '%c'", value);
 379                         }
 380                 }
 381                 /* Mark it as escaped */
 382                 value |= 0x100;
 383         }
 384         *valp = value;
 385         return next;
 386 }
 387
 388 static int get_char_token(int next, stream_t *stream)
 389 {
 390         int value;
 391         struct token *token;
 392
 393         next = escapechar(next, '\'', stream, &value);
 394         if (value == '\'' || next != '\'') {
 395                 warn(stream->token, "Bad character constant");
 396                 drop_token(stream);
 397                 return next;
 398         }
 399
 400         token = stream->token;
 401         token->type = TOKEN_CHAR;
 402         token->character = value & 0xff;
 403
 404         add_token(stream);
 405         return nextchar(stream);
 406 }
 407
 408 static int get_string_token(int next, stream_t *stream)
 409 {
 410         static char buffer[512];
 411         struct string *string;
 412         struct token *token;
 413         int len = 0;
 414
 415         for (;;) {
 416                 int val;
 417                 next = escapechar(next, '"', stream, &val);
 418                 if (val == '"')
 419                         break;
 420                 if (next == EOF) {
 421                         warn(stream->token, "Enf of file in middle of string");
 422                         return next;
 423                 }
 424                 if (len < sizeof(buffer)) {
 425                         buffer[len] = val;
 426                         len++;
 427                 }
 428
 429         }
 430
 431         if (len > 256)
 432                 warn(stream->token, "String too long");
 433
 434         string = __alloc_string(len+1);
 435         memcpy(string->data, buffer, len);
 436         string->data[len] = '\0';
 437         string->length = len+1;
 438
 439         /* Pass it on.. */
 440         token = stream->token;
 441         token->type = TOKEN_STRING;
 442         token->string = string;
 443         add_token(stream);
 444
 445         return next;
 446 }
 447
 448 static int drop_stream_eoln(stream_t *stream)
 449 {
 450         int next = nextchar(stream);
 451         drop_token(stream);
 452         for (;;) {
 453                 int curr = next;
 454                 if (curr == EOF)
 455                         return next;
 456                 next = nextchar(stream);
 457                 if (curr == '\n')
 458                         return next;
 459         }
 460 }
 461
 462 static int drop_stream_comment(stream_t *stream)
 463 {
 464         int next = nextchar(stream);
 465         drop_token(stream);
 466         for (;;) {
 467                 int curr = next;
 468                 if (curr == EOF) {
 469                         warn(stream->token, "End of file in the middle of a comment");
 470                         return curr;
 471                 }
 472                 next = nextchar(stream);
 473                 if (curr == '*' && next == '/')
 474                         break;
 475         }
 476         return nextchar(stream);
 477 }
 478
 479 unsigned char combinations[][3] = COMBINATION_STRINGS;
 480
 481 #define NR_COMBINATIONS (sizeof(combinations)/3)
 482
 483 static int get_one_special(int c, stream_t *stream)
 484 {
 485         struct token *token;
 486         unsigned char c1, c2, c3;
 487         int next, value, i;
 488         char *comb;
 489
 490         next = nextchar(stream);
 491
 492         /*
 493          * Check for strings, character constants, and comments
 494          */
 495         switch (c) {
 496         case '"':
 497                 return get_string_token(next, stream);
 498         case '\'':
 499                 return get_char_token(next, stream);
 500         case '/':
 501                 if (next == '/')
 502                         return drop_stream_eoln(stream);
 503                 if (next == '*')
 504                         return drop_stream_comment(stream);
 505         }
 506
 507         /*
 508          * Check for combinations
 509          */
 510         value = c;
 511         comb = combinations[0];
 512         c1 = c; c2 = next; c3 = 0;
 513         for (i = 0; i < NR_COMBINATIONS; i++) {
 514                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 515                         value = i + SPECIAL_BASE;
 516                         next = nextchar(stream);
 517                         if (c3)
 518                                 break;
 519                         c3 = next;
 520                 }
 521                 comb += 3;
 522         }
 523
 524         /* Pass it on.. */
 525         token = stream->token;
 526         token->type = TOKEN_SPECIAL;
 527         token->special = value;
 528         add_token(stream);
 529         return next;
 530 }
 531
 532 #define IDENT_HASH_BITS (10)
 533 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 534 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 535
 536 #define ident_hash_init(c)              (c)
 537 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 538 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 539
 540 static struct ident *hash_table[IDENT_HASH_SIZE];
 541 int ident_hit, ident_miss;
 542
 543 void show_identifier_stats(void)
 544 {
 545         int i;
 546         int distribution[100];
 547
 548         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 549                 ident_hit, ident_miss);
 550
 551         for (i = 0; i < 100; i++)
 552                 distribution[i] = 0;
 553
 554         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 555                 struct ident * ident = hash_table[i];
 556                 int count = 0;
 557
 558                 while (ident) {
 559                         count++;
 560                         ident = ident->next;
 561                 }
 562                 if (count > 99)
 563                         count = 99;
 564                 distribution[count]++;
 565         }
 566
 567         for (i = 0; i < 100; i++) {
 568                 if (distribution[i])
 569                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 570         }
 571 }
 572
 573 static struct ident *alloc_ident(const char *name, int len)
 574 {
 575         struct ident *ident = __alloc_ident(len);
 576         ident->symbols = NULL;
 577         ident->len = len;
 578         memcpy(ident->name, name, len);
 579         return ident;
 580 }
 581
 582 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 583 {
 584         ident->next = hash_table[hash];
 585         hash_table[hash] = ident;
 586         ident_miss++;
 587         return ident;
 588 }
 589
 590 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 591 {
 592         struct ident *ident;
 593
 594         ident = hash_table[hash];
 595         while (ident) {
 596                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 597                         ident_hit++;
 598                         return ident;
 599                 }
 600                 ident = ident->next;
 601         }
 602
 603         return insert_hash(alloc_ident(name, len), hash);
 604 }
 605
 606 static unsigned long hash_name(const char *name, int len)
 607 {
 608         unsigned long hash;
 609         const unsigned char *p = (const unsigned char *)name;
 610
 611         hash = ident_hash_init(*p++);
 612         while (--len) {
 613                 unsigned int i = *p++;
 614                 hash = ident_hash_add(hash, i);
 615         }
 616         return ident_hash_end(hash);
 617 }
 618
 619 struct ident *hash_ident(struct ident *ident)
 620 {
 621         return insert_hash(ident, hash_name(ident->name, ident->len));
 622 }
 623
 624 struct ident *built_in_ident(const char *name)
 625 {
 626         int len = strlen(name);
 627         return create_hashed_ident(name, len, hash_name(name, len));
 628 }
 629
 630 struct token *built_in_token(int stream, const char *name)
 631 {
 632         struct token *token;
 633
 634         token = __alloc_token(0);
 635         token->stream = stream;
 636         token->type = TOKEN_IDENT;
 637         token->ident = built_in_ident(name);
 638         return token;
 639 }
 640
 641 static int get_one_identifier(int c, stream_t *stream)
 642 {
 643         struct token *token;
 644         struct ident *ident;
 645         unsigned long hash;
 646         char buf[256];
 647         int len = 1;
 648         int next;
 649
 650         hash = ident_hash_init(c);
 651         buf[0] = c;
 652         for (;;) {
 653                 next = nextchar(stream);
 654                 switch (next) {
 655                 case '0'...'9':
 656                 case 'a'...'z':
 657                 case 'A'...'Z':
 658                 case '_':
 659                         if (len < sizeof(buf)) {
 660                                 hash = ident_hash_add(hash, next);
 661                                 buf[len] = next;
 662                                 len++;
 663                         }
 664                         continue;
 665                 }
 666                 break;
 667         };
 668         hash = ident_hash_end(hash);
 669
 670         ident = create_hashed_ident(buf, len, hash);
 671
 672         /* Pass it on.. */
 673         token = stream->token;
 674         token->type = TOKEN_IDENT;
 675         token->ident = ident;
 676         add_token(stream);
 677         return next;
 678 }
 679
 680 static int get_one_token(int c, stream_t *stream)
 681 {
 682         switch (c) {
 683         case '0'...'9':
 684                 return get_one_number(c, stream);
 685         case 'a'...'z':
 686         case 'A'...'Z':
 687         case '_':
 688                 return get_one_identifier(c, stream);
 689         default:
 690                 return get_one_special(c, stream);
 691         }
 692 }
 693
 694 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 695         unsigned char *buf, unsigned int buf_size)
 696 {
 697         struct token *begin;
 698
 699         stream->stream = idx;
 700         stream->token = NULL;
 701         stream->line = 1;
 702         stream->newline = 1;
 703         stream->whitespace = 0;
 704         stream->pos = 0;
 705         stream->fd = fd;
 706         stream->offset = 0;
 707         stream->size = buf_size;
 708         stream->buffer = buf;
 709
 710         begin = alloc_token(stream);
 711         begin->type = TOKEN_STREAMBEGIN;
 712         stream->tokenlist = &begin->next;
 713         return begin;
 714 }
 715
 716 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 717 {
 718         int c = nextchar(stream);
 719         while (c != EOF) {
 720                 if (c == '\\') {
 721                         c = nextchar(stream);
 722                         stream->newline = 0;
 723                         stream->whitespace = 1;
 724                         continue;
 725                 }
 726                 if (!isspace(c)) {
 727                         struct token *token = alloc_token(stream);
 728                         token->newline = stream->newline;
 729                         token->whitespace = stream->whitespace;
 730                         stream->newline = 0;
 731                         stream->whitespace = 0;
 732                         stream->token = token;
 733                         c = get_one_token(c, stream);
 734                         continue;
 735                 }
 736                 stream->whitespace = 1;
 737                 c = nextchar(stream);
 738         }
 739         mark_eof(stream, endtoken);
 740 }
 741
 742 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
 743 {
 744         stream_t stream;
 745         struct token *begin;
 746
 747         begin = setup_stream(&stream, 0, -1, buffer, size);
 748         tokenize_stream(&stream, endtoken);
 749         return begin;
 750 }
 751
 752 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 753 {
 754         struct token *begin;
 755         stream_t stream;
 756         unsigned char buffer[BUFSIZE];
 757         int idx;
 758
 759         idx = init_stream(name, fd);
 760         if (idx < 0)
 761                 return endtoken;
 762
 763         begin = setup_stream(&stream, idx, fd, buffer, 0);
 764         tokenize_stream(&stream, endtoken);
 765         return begin;
 766 }