tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Linus Torvalds, all rights reserved.
   6  */
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <stdarg.h>
  10 #include <stddef.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13 #include <unistd.h>
  14 #include <sys/stat.h>
  15
  16 #include "lib.h"
  17 #include "token.h"
  18 #include "symbol.h"
  19
  20 #define EOF (-1)
  21
  22 int input_stream_nr = 0;
  23 struct stream *input_streams;
  24 static int input_streams_allocated;
  25
  26 #define BUFSIZE (8192)
  27 typedef struct {
  28         int fd, stream, line, pos, offset, size;
  29         unsigned int newline:1, whitespace:1;
  30         struct token **tokenlist;
  31         struct token *token;
  32         unsigned char buffer[BUFSIZE];
  33 } stream_t;
  34
  35
  36 const char *show_special(int val)
  37 {
  38         static const char *combinations[] = COMBINATION_STRINGS;
  39         static char buffer[4];
  40
  41         buffer[0] = val;
  42         buffer[1] = 0;
  43         if (val >= SPECIAL_BASE)
  44                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  45         return buffer;
  46 }
  47
  48 const char *show_ident(const struct ident *ident)
  49 {
  50         static char buffer[256];
  51         sprintf(buffer, "%.*s", ident->len, ident->name);
  52         return buffer;
  53 }
  54
  55 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  56 {
  57         if (isprint(c)) {
  58                 if (c == escape || c == '\\')
  59                         *ptr++ = '\\';
  60                 *ptr++ = c;
  61                 return ptr;
  62         }
  63         *ptr++ = '\\';
  64         switch (c) {
  65         case '\n':
  66                 *ptr++ = 'n';
  67                 return ptr;
  68         case '\t':
  69                 *ptr++ = 't';
  70                 return ptr;
  71         }
  72         if (!isdigit(next))
  73                 return ptr + sprintf(ptr, "%o", c);
  74
  75         return ptr + sprintf(ptr, "%03o", c);
  76 }
  77
  78 const char *show_token(const struct token *token)
  79 {
  80         static char buffer[256];
  81
  82         if (!token)
  83                 return "<no token>";
  84         switch (token->type) {
  85         case TOKEN_ERROR:
  86                 return "syntax error";
  87
  88         case TOKEN_EOF:
  89                 return "end-of-input";
  90
  91         case TOKEN_IDENT:
  92                 return show_ident(token->ident);
  93
  94         case TOKEN_STRING: {
  95                 char *ptr;
  96                 int i;
  97                 struct string *string = token->string;
  98
  99                 ptr = buffer;
 100                 *ptr++ = '"';
 101                 for (i = 0; i < string->length-1; i++) {
 102                         unsigned char *p = string->data + i;
 103                         ptr = charstr(ptr, p[0], '"', p[1]);
 104                 }
 105                 *ptr++ = '"';
 106                 *ptr = '\0';
 107                 return buffer;
 108         }
 109
 110         case TOKEN_INTEGER: {
 111                 const char *p = token->integer;
 112                 switch (*p) {
 113                 case 'o':       // octal
 114                 case 'x':       // hex
 115                         buffer[0] = '0';
 116                         strcpy(buffer+1, p+1);
 117                         return buffer;
 118                 default:
 119                         return p;
 120                 }
 121         }
 122
 123         case TOKEN_FP:
 124                 return token->fp;
 125
 126         case TOKEN_SPECIAL:
 127                 return show_special(token->special);
 128
 129         case TOKEN_CHAR: {
 130                 char *ptr = buffer;
 131                 int c = token->character;
 132                 *ptr++ = '\'';
 133                 ptr = charstr(ptr, c, '\'', 0);
 134                 *ptr++ = '\'';
 135                 *ptr++ = '\0';
 136                 return buffer;
 137         }
 138
 139         case TOKEN_STREAMBEGIN:
 140                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
 141                 return buffer;
 142
 143         case TOKEN_STREAMEND:
 144                 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
 145                 return buffer;
 146
 147         default:
 148                 return "WTF???";
 149         }
 150 }
 151
 152 int init_stream(const char *name, int fd)
 153 {
 154         int stream = input_stream_nr;
 155         struct stream *current;
 156
 157         if (stream >= input_streams_allocated) {
 158                 int newalloc = stream * 4 / 3 + 10;
 159                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 160                 if (!input_streams)
 161                         die("Unable to allocate more streams space");
 162                 input_streams_allocated = newalloc;
 163         }
 164         current = input_streams + stream;
 165         memset(current, 0, sizeof(*current));
 166         current->name = name;
 167         current->fd = fd;
 168         current->constant = -1; // "unknown"
 169         if (fd > 0) {
 170                 int i;
 171                 struct stat st;
 172
 173                 fstat(fd, &st);
 174                 current->dev = st.st_dev;
 175                 current->ino = st.st_ino;
 176                 for (i = 0; i < stream; i++) {
 177                         struct stream *s = input_streams + i;
 178                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 179                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 180                                         return -1;
 181                         }
 182                 }
 183         }
 184         input_stream_nr = stream+1;
 185         return stream;
 186 }
 187
 188 static struct token * alloc_token(stream_t *stream)
 189 {
 190         struct token *token = __alloc_token(0);
 191         token->line = stream->line;
 192         token->pos = stream->pos;
 193         token->stream = stream->stream;
 194         token->newline = stream->newline;
 195         token->whitespace = stream->whitespace;
 196         return token;
 197 }
 198
 199 static int nextchar(stream_t *stream)
 200 {
 201         int offset = stream->offset;
 202         int size = stream->size;
 203         int c;
 204
 205         if (offset >= size) {
 206                 size = read(stream->fd, stream->buffer, sizeof(stream->buffer));
 207                 if (size <= 0)
 208                         return EOF;
 209                 stream->size = size;
 210                 stream->offset = 0;
 211                 offset = 0;
 212         }
 213         c = stream->buffer[offset];
 214         stream->offset = offset + 1;
 215         stream->pos++;
 216         if (c == '\n') {
 217                 stream->line++;
 218                 stream->newline = 1;
 219                 stream->pos = 0;
 220         }
 221         return c;
 222 }
 223
 224 struct token eof_token_entry;
 225
 226 static void mark_eof(stream_t *stream, struct token *end_token)
 227 {
 228         struct token *end;
 229
 230         end = alloc_token(stream);
 231         end->type = TOKEN_STREAMEND;
 232         end->newline = 1;
 233
 234         eof_token_entry.next = &eof_token_entry;
 235         eof_token_entry.newline = 1;
 236
 237         if (!end_token)
 238                 end_token =  &eof_token_entry;
 239         end->next = end_token;
 240         *stream->tokenlist = end;
 241         stream->tokenlist = NULL;
 242 }
 243
 244 static void add_token(stream_t *stream)
 245 {
 246         struct token *token = stream->token;
 247
 248         stream->token = NULL;
 249         token->next = NULL;
 250         *stream->tokenlist = token;
 251         stream->tokenlist = &token->next;
 252 }
 253
 254 static void drop_token(stream_t *stream)
 255 {
 256         stream->newline |= stream->token->newline;
 257         stream->whitespace |= stream->token->whitespace;
 258         stream->token = NULL;
 259 }
 260
 261 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
 262 {
 263         char *buf = *p;
 264
 265         *buf++ = next;
 266         for (;;) {
 267                 unsigned int n;
 268                 next = nextchar(stream);
 269                 n = hexval(next);
 270                 if (n >= base)
 271                         break;
 272                 *buf++ = next;
 273         }
 274         *p = buf;
 275         return next;
 276 }
 277
 278 static int do_integer(char *buffer, int len, int next, stream_t *stream)
 279 {
 280         struct token *token = stream->token;
 281         void *buf;
 282
 283         while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
 284                 buffer[len++] = next;
 285                 next = nextchar(stream);
 286         }
 287         buffer[len++] = '\0';
 288         buf = __alloc_bytes(len);
 289         memcpy(buf, buffer, len);
 290         token->type = TOKEN_INTEGER;
 291         token->integer = buf;
 292         add_token(stream);
 293         return next;
 294 }
 295
 296 static int get_one_number(int c, stream_t *stream)
 297 {
 298         static char buffer[256];
 299         int next = nextchar(stream);
 300         char *p = buffer;
 301
 302         *p++ = c;
 303         switch (next) {
 304         case '0'...'7':
 305                 if (c == '0') {
 306                         buffer[0] = 'o';
 307                         next = get_base_number(8, &p, next, stream);
 308                         break;
 309                 }
 310                 /* fallthrough */
 311         case '8'...'9':
 312                 next = get_base_number(10, &p, next, stream);
 313                 break;
 314         case 'x': case 'X':
 315                 if (c == '0') {
 316                         buffer[0] = 'x';
 317                         next = get_base_number(16, &p, next, stream);
 318                 }
 319         }
 320         return do_integer(buffer, p - buffer, next, stream);
 321 }
 322
 323 static int escapechar(int first, int type, stream_t *stream, int *valp)
 324 {
 325         int next, value;
 326
 327         next = nextchar(stream);
 328         value = first;
 329
 330         if (first == '\n')
 331                 warn(stream->token, "Newline in string or character constant");
 332
 333         if (first == '\\' && next != EOF) {
 334                 value = next;
 335                 next = nextchar(stream);
 336                 if (value != type) {
 337                         switch (value) {
 338                         case 'n':
 339                                 value = '\n';
 340                                 break;
 341                         case 't':
 342                                 value = '\t';
 343                                 break;
 344                         case '\\':
 345                                 break;
 346                         case '\'':
 347                                 break;
 348                         case '"':
 349                                 break;
 350                         case '0'...'7': {
 351                                 int nr = 2;
 352                                 value -= '0';
 353                                 while (next >= '0' && next <= '9') {
 354                                         value = (value << 3) + (next-'0');
 355                                         next = nextchar(stream);
 356                                         if (!--nr)
 357                                                 break;
 358                                 }
 359                                 value &= 0xff;
 360                                 break;
 361                         }
 362                         case 'x': {
 363                                 int hex = hexval(next);
 364                                 if (hex < 16) {
 365                                         value = hex;
 366                                         next = nextchar(stream);
 367                                         while ((hex = hexval(next)) < 16) {
 368                                                 value = (value << 4) + hex;
 369                                                 next = nextchar(stream);
 370                                         }
 371                                         value &= 0xff;
 372                                         break;
 373                                 }
 374                         }
 375                         /* Fallthrough */
 376                         default:
 377                                 warn(stream->token, "Unknown escape '%c'", value);
 378                         }
 379                 }
 380                 /* Mark it as escaped */
 381                 value |= 0x100;
 382         }
 383         *valp = value;
 384         return next;
 385 }
 386
 387 static int get_char_token(int next, stream_t *stream)
 388 {
 389         int value;
 390         struct token *token;
 391
 392         next = escapechar(next, '\'', stream, &value);
 393         if (value == '\'' || next != '\'') {
 394                 warn(stream->token, "Bad character constant");
 395                 drop_token(stream);
 396                 return next;
 397         }
 398
 399         token = stream->token;
 400         token->type = TOKEN_CHAR;
 401         token->character = value & 0xff;
 402
 403         add_token(stream);
 404         return nextchar(stream);
 405 }
 406
 407 static int get_string_token(int next, stream_t *stream)
 408 {
 409         static char buffer[512];
 410         struct string *string;
 411         struct token *token;
 412         int len = 0;
 413
 414         for (;;) {
 415                 int val;
 416                 next = escapechar(next, '"', stream, &val);
 417                 if (val == '"')
 418                         break;
 419                 if (next == EOF) {
 420                         warn(stream->token, "Enf of file in middle of string");
 421                         return next;
 422                 }
 423                 if (len < sizeof(buffer)) {
 424                         buffer[len] = val;
 425                         len++;
 426                 }
 427
 428         }
 429
 430         if (len > 256)
 431                 warn(stream->token, "String too long");
 432
 433         string = __alloc_string(len+1);
 434         memcpy(string->data, buffer, len);
 435         string->data[len] = '\0';
 436         string->length = len+1;
 437
 438         /* Pass it on.. */
 439         token = stream->token;
 440         token->type = TOKEN_STRING;
 441         token->string = string;
 442         add_token(stream);
 443
 444         return next;
 445 }
 446
 447 static int drop_stream_eoln(stream_t *stream)
 448 {
 449         int next = nextchar(stream);
 450         drop_token(stream);
 451         for (;;) {
 452                 int curr = next;
 453                 if (curr == EOF)
 454                         return next;
 455                 next = nextchar(stream);
 456                 if (curr == '\n')
 457                         return next;
 458         }
 459 }
 460
 461 static int drop_stream_comment(stream_t *stream)
 462 {
 463         int next = nextchar(stream);
 464         drop_token(stream);
 465         for (;;) {
 466                 int curr = next;
 467                 if (curr == EOF) {
 468                         warn(stream->token, "End of file in the middle of a comment");
 469                         return curr;
 470                 }
 471                 next = nextchar(stream);
 472                 if (curr == '*' && next == '/')
 473                         break;
 474         }
 475         return nextchar(stream);
 476 }
 477
 478 unsigned char combinations[][3] = COMBINATION_STRINGS;
 479
 480 #define NR_COMBINATIONS (sizeof(combinations)/3)
 481
 482 static int get_one_special(int c, stream_t *stream)
 483 {
 484         struct token *token;
 485         unsigned char c1, c2, c3;
 486         int next, value, i;
 487         char *comb;
 488
 489         next = nextchar(stream);
 490
 491         /*
 492          * Check for strings, character constants, and comments
 493          */
 494         switch (c) {
 495         case '"':
 496                 return get_string_token(next, stream);
 497         case '\'':
 498                 return get_char_token(next, stream);
 499         case '/':
 500                 if (next == '/')
 501                         return drop_stream_eoln(stream);
 502                 if (next == '*')
 503                         return drop_stream_comment(stream);
 504         }
 505
 506         /*
 507          * Check for combinations
 508          */
 509         value = c;
 510         comb = combinations[0];
 511         c1 = c; c2 = next; c3 = 0;
 512         for (i = 0; i < NR_COMBINATIONS; i++) {
 513                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 514                         value = i + SPECIAL_BASE;
 515                         next = nextchar(stream);
 516                         if (c3)
 517                                 break;
 518                         c3 = next;
 519                 }
 520                 comb += 3;
 521         }
 522
 523         /* Pass it on.. */
 524         token = stream->token;
 525         token->type = TOKEN_SPECIAL;
 526         token->special = value;
 527         add_token(stream);
 528         return next;
 529 }
 530
 531 #define IDENT_HASH_BITS (10)
 532 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 533 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 534
 535 #define ident_hash_init(c)              (c)
 536 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 537 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 538
 539 static struct ident *hash_table[IDENT_HASH_SIZE];
 540 int ident_hit, ident_miss;
 541
 542 void show_identifier_stats(void)
 543 {
 544         int i;
 545         int distribution[100];
 546
 547         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 548                 ident_hit, ident_miss);
 549
 550         for (i = 0; i < 100; i++)
 551                 distribution[i] = 0;
 552
 553         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 554                 struct ident * ident = hash_table[i];
 555                 int count = 0;
 556
 557                 while (ident) {
 558                         count++;
 559                         ident = ident->next;
 560                 }
 561                 if (count > 99)
 562                         count = 99;
 563                 distribution[count]++;
 564         }
 565
 566         for (i = 0; i < 100; i++) {
 567                 if (distribution[i])
 568                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 569         }
 570 }
 571
 572 static struct ident *alloc_ident(const char *name, int len)
 573 {
 574         struct ident *ident = __alloc_ident(len);
 575         ident->symbols = NULL;
 576         ident->len = len;
 577         memcpy(ident->name, name, len);
 578         return ident;
 579 }
 580
 581 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 582 {
 583         ident->next = hash_table[hash];
 584         hash_table[hash] = ident;
 585         ident_miss++;
 586         return ident;
 587 }
 588
 589 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 590 {
 591         struct ident *ident;
 592
 593         ident = hash_table[hash];
 594         while (ident) {
 595                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 596                         ident_hit++;
 597                         return ident;
 598                 }
 599                 ident = ident->next;
 600         }
 601
 602         return insert_hash(alloc_ident(name, len), hash);
 603 }
 604
 605 static unsigned long hash_name(const char *name, int len)
 606 {
 607         unsigned long hash;
 608         const unsigned char *p = (const unsigned char *)name;
 609
 610         hash = ident_hash_init(*p++);
 611         while (--len) {
 612                 unsigned int i = *p++;
 613                 hash = ident_hash_add(hash, i);
 614         }
 615         return ident_hash_end(hash);
 616 }
 617
 618 struct ident *hash_ident(struct ident *ident)
 619 {
 620         return insert_hash(ident, hash_name(ident->name, ident->len));
 621 }
 622
 623 struct ident *built_in_ident(const char *name)
 624 {
 625         int len = strlen(name);
 626         return create_hashed_ident(name, len, hash_name(name, len));
 627 }
 628
 629 struct token *built_in_token(int stream, const char *name)
 630 {
 631         struct token *token;
 632
 633         token = __alloc_token(0);
 634         token->stream = stream;
 635         token->type = TOKEN_IDENT;
 636         token->ident = built_in_ident(name);
 637         return token;
 638 }
 639
 640 static int get_one_identifier(int c, stream_t *stream)
 641 {
 642         struct token *token;
 643         struct ident *ident;
 644         unsigned long hash;
 645         char buf[256];
 646         int len = 1;
 647         int next;
 648
 649         hash = ident_hash_init(c);
 650         buf[0] = c;
 651         for (;;) {
 652                 next = nextchar(stream);
 653                 switch (next) {
 654                 case '0'...'9':
 655                 case 'a'...'z':
 656                 case 'A'...'Z':
 657                 case '_':
 658                         if (len < sizeof(buf)) {
 659                                 hash = ident_hash_add(hash, next);
 660                                 buf[len] = next;
 661                                 len++;
 662                         }
 663                         continue;
 664                 }
 665                 break;
 666         };
 667         hash = ident_hash_end(hash);
 668
 669         ident = create_hashed_ident(buf, len, hash);
 670
 671         /* Pass it on.. */
 672         token = stream->token;
 673         token->type = TOKEN_IDENT;
 674         token->ident = ident;
 675         add_token(stream);
 676         return next;
 677 }
 678
 679 static int get_one_token(int c, stream_t *stream)
 680 {
 681         switch (c) {
 682         case '0'...'9':
 683                 return get_one_number(c, stream);
 684         case 'a'...'z':
 685         case 'A'...'Z':
 686         case '_':
 687                 return get_one_identifier(c, stream);
 688         default:
 689                 return get_one_special(c, stream);
 690         }
 691 }
 692
 693 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 694 {
 695         struct token *begin;
 696         stream_t stream;
 697         int c, idx;
 698
 699         idx = init_stream(name, fd);
 700         if (idx < 0)
 701                 return endtoken;
 702
 703         stream.stream = idx;
 704         stream.token = NULL;
 705         stream.line = 1;
 706         stream.newline = 1;
 707         stream.whitespace = 0;
 708         stream.pos = 0;
 709         stream.fd = fd;
 710         stream.offset = 0;
 711         stream.size = 0;
 712
 713         begin = alloc_token(&stream);
 714         begin->type = TOKEN_STREAMBEGIN;
 715         stream.tokenlist = &begin->next;
 716
 717         c = nextchar(&stream);
 718         while (c != EOF) {
 719                 if (c == '\\') {
 720                         c = nextchar(&stream);
 721                         stream.newline = 0;
 722                         stream.whitespace = 1;
 723                         continue;
 724                 }
 725                 if (!isspace(c)) {
 726                         struct token *token = alloc_token(&stream);
 727                         token->newline = stream.newline;
 728                         token->whitespace = stream.whitespace;
 729                         stream.newline = 0;
 730                         stream.whitespace = 0;
 731                         stream.token = token;
 732                         c = get_one_token(c, &stream);
 733                         continue;
 734                 }
 735                 stream.whitespace = 1;
 736                 c = nextchar(&stream);
 737         }
 738         mark_eof(&stream, endtoken);
 739         return begin;
 740 }