tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp, all rights reserved.
   6  */
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <stdarg.h>
  10 #include <stddef.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13 #include <unistd.h>
  14 #include <sys/stat.h>
  15
  16 #include "lib.h"
  17 #include "token.h"
  18 #include "symbol.h"
  19
  20 #define EOF (-1)
  21
  22 int input_stream_nr = 0;
  23 struct stream *input_streams;
  24 static int input_streams_allocated;
  25
  26 #define BUFSIZE (8192)
  27
  28 typedef struct {
  29         int fd, offset, size;
  30         struct position pos;
  31         struct token **tokenlist;
  32         struct token *token;
  33         unsigned char *buffer;
  34 } stream_t;
  35
  36
  37 const char *show_special(int val)
  38 {
  39         static const char *combinations[] = COMBINATION_STRINGS;
  40         static char buffer[4];
  41
  42         buffer[0] = val;
  43         buffer[1] = 0;
  44         if (val >= SPECIAL_BASE)
  45                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  46         return buffer;
  47 }
  48
  49 const char *show_ident(const struct ident *ident)
  50 {
  51         static char buffer[256];
  52         if (!ident)
  53                 return "<noident>";
  54         sprintf(buffer, "%.*s", ident->len, ident->name);
  55         return buffer;
  56 }
  57
  58 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  59 {
  60         if (isprint(c)) {
  61                 if (c == escape || c == '\\')
  62                         *ptr++ = '\\';
  63                 *ptr++ = c;
  64                 return ptr;
  65         }
  66         *ptr++ = '\\';
  67         switch (c) {
  68         case '\n':
  69                 *ptr++ = 'n';
  70                 return ptr;
  71         case '\t':
  72                 *ptr++ = 't';
  73                 return ptr;
  74         }
  75         if (!isdigit(next))
  76                 return ptr + sprintf(ptr, "%o", c);
  77
  78         return ptr + sprintf(ptr, "%03o", c);
  79 }
  80
  81 const char *show_string(const struct string *string)
  82 {
  83         static char buffer[256];
  84         char *ptr;
  85         int i;
  86
  87         ptr = buffer;
  88         *ptr++ = '"';
  89         for (i = 0; i < string->length-1; i++) {
  90                 const unsigned char *p = string->data + i;
  91                 ptr = charstr(ptr, p[0], '"', p[1]);
  92         }
  93         *ptr++ = '"';
  94         *ptr = '\0';
  95         return buffer;
  96 }
  97
  98 const char *show_token(const struct token *token)
  99 {
 100         static char buffer[256];
 101
 102         if (!token)
 103                 return "<no token>";
 104         switch (token_type(token)) {
 105         case TOKEN_ERROR:
 106                 return "syntax error";
 107
 108         case TOKEN_EOF:
 109                 return "end-of-input";
 110
 111         case TOKEN_IDENT:
 112                 return show_ident(token->ident);
 113
 114         case TOKEN_STRING:
 115                 return show_string(token->string);
 116
 117         case TOKEN_INTEGER: {
 118                 const char *p = token->integer;
 119                 switch (*p) {
 120                 case 'o':       // octal
 121                 case 'x':       // hex
 122                         buffer[0] = '0';
 123                         strcpy(buffer+1, p+1);
 124                         return buffer;
 125                 default:
 126                         return p;
 127                 }
 128         }
 129
 130         case TOKEN_FP:
 131                 return token->fp;
 132
 133         case TOKEN_SPECIAL:
 134                 return show_special(token->special);
 135
 136         case TOKEN_CHAR: {
 137                 char *ptr = buffer;
 138                 int c = token->character;
 139                 *ptr++ = '\'';
 140                 ptr = charstr(ptr, c, '\'', 0);
 141                 *ptr++ = '\'';
 142                 *ptr++ = '\0';
 143                 return buffer;
 144         }
 145
 146         case TOKEN_STREAMBEGIN:
 147                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
 148                 return buffer;
 149
 150         case TOKEN_STREAMEND:
 151                 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
 152                 return buffer;
 153
 154         default:
 155                 return "WTF???";
 156         }
 157 }
 158
 159 int init_stream(const char *name, int fd)
 160 {
 161         int stream = input_stream_nr;
 162         struct stream *current;
 163
 164         if (stream >= input_streams_allocated) {
 165                 int newalloc = stream * 4 / 3 + 10;
 166                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 167                 if (!input_streams)
 168                         die("Unable to allocate more streams space");
 169                 input_streams_allocated = newalloc;
 170         }
 171         current = input_streams + stream;
 172         memset(current, 0, sizeof(*current));
 173         current->name = name;
 174         current->fd = fd;
 175         current->constant = -1; // "unknown"
 176         if (fd > 0) {
 177                 int i;
 178                 struct stat st;
 179
 180                 fstat(fd, &st);
 181                 current->dev = st.st_dev;
 182                 current->ino = st.st_ino;
 183                 for (i = 0; i < stream; i++) {
 184                         struct stream *s = input_streams + i;
 185                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 186                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 187                                         return -1;
 188                         }
 189                 }
 190         }
 191         input_stream_nr = stream+1;
 192         return stream;
 193 }
 194
 195 static struct token * alloc_token(stream_t *stream)
 196 {
 197         struct token *token = __alloc_token(0);
 198         token->pos = stream->pos;
 199         return token;
 200 }
 201
 202 static int nextchar(stream_t *stream)
 203 {
 204         int offset = stream->offset;
 205         int size = stream->size;
 206         int c;
 207
 208         if (offset >= size) {
 209                 size = read(stream->fd, stream->buffer, BUFSIZE);
 210                 if (size <= 0)
 211                         return EOF;
 212                 stream->size = size;
 213                 stream->offset = 0;
 214                 offset = 0;
 215         }
 216         c = stream->buffer[offset];
 217         stream->offset = offset + 1;
 218         stream->pos.pos++;
 219         if (c == '\n') {
 220                 stream->pos.line++;
 221                 stream->pos.newline = 1;
 222                 stream->pos.pos = 0;
 223         }
 224         return c;
 225 }
 226
 227 struct token eof_token_entry;
 228
 229 static void mark_eof(stream_t *stream, struct token *end_token)
 230 {
 231         struct token *end;
 232
 233         end = alloc_token(stream);
 234         token_type(end) = TOKEN_STREAMEND;
 235         end->pos.newline = 1;
 236
 237         eof_token_entry.next = &eof_token_entry;
 238         eof_token_entry.pos.newline = 1;
 239
 240         if (!end_token)
 241                 end_token =  &eof_token_entry;
 242         end->next = end_token;
 243         *stream->tokenlist = end;
 244         stream->tokenlist = NULL;
 245 }
 246
 247 static void add_token(stream_t *stream)
 248 {
 249         struct token *token = stream->token;
 250
 251         stream->token = NULL;
 252         token->next = NULL;
 253         *stream->tokenlist = token;
 254         stream->tokenlist = &token->next;
 255 }
 256
 257 static void drop_token(stream_t *stream)
 258 {
 259         stream->pos.newline |= stream->token->pos.newline;
 260         stream->pos.whitespace |= stream->token->pos.whitespace;
 261         stream->token = NULL;
 262 }
 263
 264 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
 265 {
 266         char *buf = *p;
 267
 268         *buf++ = next;
 269         for (;;) {
 270                 unsigned int n;
 271                 next = nextchar(stream);
 272                 n = hexval(next);
 273                 if (n >= base)
 274                         break;
 275                 *buf++ = next;
 276         }
 277         *p = buf;
 278         return next;
 279 }
 280
 281 static int do_integer(char *buffer, int len, int next, stream_t *stream)
 282 {
 283         struct token *token = stream->token;
 284         void *buf;
 285
 286         while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
 287                 buffer[len++] = next;
 288                 next = nextchar(stream);
 289         }
 290         buffer[len++] = '\0';
 291         buf = __alloc_bytes(len);
 292         memcpy(buf, buffer, len);
 293         token_type(token) = TOKEN_INTEGER;
 294         token->integer = buf;
 295         add_token(stream);
 296         return next;
 297 }
 298
 299 static int get_one_number(int c, stream_t *stream)
 300 {
 301         static char buffer[256];
 302         int next = nextchar(stream);
 303         char *p = buffer;
 304
 305         *p++ = c;
 306         switch (next) {
 307         case '0'...'7':
 308                 if (c == '0') {
 309                         buffer[0] = 'o';
 310                         next = get_base_number(8, &p, next, stream);
 311                         break;
 312                 }
 313                 /* fallthrough */
 314         case '8'...'9':
 315                 next = get_base_number(10, &p, next, stream);
 316                 break;
 317         case 'x': case 'X':
 318                 if (c == '0') {
 319                         buffer[0] = 'x';
 320                         next = get_base_number(16, &p, next, stream);
 321                 }
 322         }
 323         return do_integer(buffer, p - buffer, next, stream);
 324 }
 325
 326 static int escapechar(int first, int type, stream_t *stream, int *valp)
 327 {
 328         int next, value;
 329
 330         next = nextchar(stream);
 331         value = first;
 332
 333         if (first == '\n')
 334                 warn(stream->pos, "Newline in string or character constant");
 335
 336         if (first == '\\' && next != EOF) {
 337                 value = next;
 338                 next = nextchar(stream);
 339                 if (value != type) {
 340                         switch (value) {
 341                         case 'n':
 342                                 value = '\n';
 343                                 break;
 344                         case 't':
 345                                 value = '\t';
 346                                 break;
 347                         case '\\':
 348                                 break;
 349                         case '\'':
 350                                 break;
 351                         case '"':
 352                                 break;
 353                         case '0'...'7': {
 354                                 int nr = 2;
 355                                 value -= '0';
 356                                 while (next >= '0' && next <= '9') {
 357                                         value = (value << 3) + (next-'0');
 358                                         next = nextchar(stream);
 359                                         if (!--nr)
 360                                                 break;
 361                                 }
 362                                 value &= 0xff;
 363                                 break;
 364                         }
 365                         case 'x': {
 366                                 int hex = hexval(next);
 367                                 if (hex < 16) {
 368                                         value = hex;
 369                                         next = nextchar(stream);
 370                                         while ((hex = hexval(next)) < 16) {
 371                                                 value = (value << 4) + hex;
 372                                                 next = nextchar(stream);
 373                                         }
 374                                         value &= 0xff;
 375                                         break;
 376                                 }
 377                         }
 378                         /* Fallthrough */
 379                         default:
 380                                 warn(stream->pos, "Unknown escape '%c'", value);
 381                         }
 382                 }
 383                 /* Mark it as escaped */
 384                 value |= 0x100;
 385         }
 386         *valp = value;
 387         return next;
 388 }
 389
 390 static int get_char_token(int next, stream_t *stream)
 391 {
 392         int value;
 393         struct token *token;
 394
 395         next = escapechar(next, '\'', stream, &value);
 396         if (value == '\'' || next != '\'') {
 397                 warn(stream->pos, "Bad character constant");
 398                 drop_token(stream);
 399                 return next;
 400         }
 401
 402         token = stream->token;
 403         token_type(token) = TOKEN_CHAR;
 404         token->character = value & 0xff;
 405
 406         add_token(stream);
 407         return nextchar(stream);
 408 }
 409
 410 static int get_string_token(int next, stream_t *stream)
 411 {
 412         static char buffer[512];
 413         struct string *string;
 414         struct token *token;
 415         int len = 0;
 416
 417         for (;;) {
 418                 int val;
 419                 next = escapechar(next, '"', stream, &val);
 420                 if (val == '"')
 421                         break;
 422                 if (next == EOF) {
 423                         warn(stream->pos, "Enf of file in middle of string");
 424                         return next;
 425                 }
 426                 if (len < sizeof(buffer)) {
 427                         buffer[len] = val;
 428                         len++;
 429                 }
 430
 431         }
 432
 433         if (len > 256)
 434                 warn(stream->pos, "String too long");
 435
 436         string = __alloc_string(len+1);
 437         memcpy(string->data, buffer, len);
 438         string->data[len] = '\0';
 439         string->length = len+1;
 440
 441         /* Pass it on.. */
 442         token = stream->token;
 443         token_type(token) = TOKEN_STRING;
 444         token->string = string;
 445         add_token(stream);
 446
 447         return next;
 448 }
 449
 450 static int drop_stream_eoln(stream_t *stream)
 451 {
 452         int next = nextchar(stream);
 453         drop_token(stream);
 454         for (;;) {
 455                 int curr = next;
 456                 if (curr == EOF)
 457                         return next;
 458                 next = nextchar(stream);
 459                 if (curr == '\n')
 460                         return next;
 461         }
 462 }
 463
 464 static int drop_stream_comment(stream_t *stream)
 465 {
 466         int next = nextchar(stream);
 467         drop_token(stream);
 468         for (;;) {
 469                 int curr = next;
 470                 if (curr == EOF) {
 471                         warn(stream->pos, "End of file in the middle of a comment");
 472                         return curr;
 473                 }
 474                 next = nextchar(stream);
 475                 if (curr == '*' && next == '/')
 476                         break;
 477         }
 478         return nextchar(stream);
 479 }
 480
 481 unsigned char combinations[][3] = COMBINATION_STRINGS;
 482
 483 #define NR_COMBINATIONS (sizeof(combinations)/3)
 484
 485 static int get_one_special(int c, stream_t *stream)
 486 {
 487         struct token *token;
 488         unsigned char c1, c2, c3;
 489         int next, value, i;
 490         char *comb;
 491
 492         next = nextchar(stream);
 493
 494         /*
 495          * Check for strings, character constants, and comments
 496          */
 497         switch (c) {
 498         case '"':
 499                 return get_string_token(next, stream);
 500         case '\'':
 501                 return get_char_token(next, stream);
 502         case '/':
 503                 if (next == '/')
 504                         return drop_stream_eoln(stream);
 505                 if (next == '*')
 506                         return drop_stream_comment(stream);
 507         }
 508
 509         /*
 510          * Check for combinations
 511          */
 512         value = c;
 513         comb = combinations[0];
 514         c1 = c; c2 = next; c3 = 0;
 515         for (i = 0; i < NR_COMBINATIONS; i++) {
 516                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 517                         value = i + SPECIAL_BASE;
 518                         next = nextchar(stream);
 519                         if (c3)
 520                                 break;
 521                         c3 = next;
 522                 }
 523                 comb += 3;
 524         }
 525
 526         /* Pass it on.. */
 527         token = stream->token;
 528         token_type(token) = TOKEN_SPECIAL;
 529         token->special = value;
 530         add_token(stream);
 531         return next;
 532 }
 533
 534 #define IDENT_HASH_BITS (10)
 535 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 536 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 537
 538 #define ident_hash_init(c)              (c)
 539 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 540 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 541
 542 static struct ident *hash_table[IDENT_HASH_SIZE];
 543 int ident_hit, ident_miss;
 544
 545 void show_identifier_stats(void)
 546 {
 547         int i;
 548         int distribution[100];
 549
 550         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 551                 ident_hit, ident_miss);
 552
 553         for (i = 0; i < 100; i++)
 554                 distribution[i] = 0;
 555
 556         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 557                 struct ident * ident = hash_table[i];
 558                 int count = 0;
 559
 560                 while (ident) {
 561                         count++;
 562                         ident = ident->next;
 563                 }
 564                 if (count > 99)
 565                         count = 99;
 566                 distribution[count]++;
 567         }
 568
 569         for (i = 0; i < 100; i++) {
 570                 if (distribution[i])
 571                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 572         }
 573 }
 574
 575 static struct ident *alloc_ident(const char *name, int len)
 576 {
 577         struct ident *ident = __alloc_ident(len);
 578         ident->symbols = NULL;
 579         ident->len = len;
 580         memcpy(ident->name, name, len);
 581         return ident;
 582 }
 583
 584 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 585 {
 586         ident->next = hash_table[hash];
 587         hash_table[hash] = ident;
 588         ident_miss++;
 589         return ident;
 590 }
 591
 592 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 593 {
 594         struct ident *ident;
 595
 596         ident = hash_table[hash];
 597         while (ident) {
 598                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 599                         ident_hit++;
 600                         return ident;
 601                 }
 602                 ident = ident->next;
 603         }
 604
 605         return insert_hash(alloc_ident(name, len), hash);
 606 }
 607
 608 static unsigned long hash_name(const char *name, int len)
 609 {
 610         unsigned long hash;
 611         const unsigned char *p = (const unsigned char *)name;
 612
 613         hash = ident_hash_init(*p++);
 614         while (--len) {
 615                 unsigned int i = *p++;
 616                 hash = ident_hash_add(hash, i);
 617         }
 618         return ident_hash_end(hash);
 619 }
 620
 621 struct ident *hash_ident(struct ident *ident)
 622 {
 623         return insert_hash(ident, hash_name(ident->name, ident->len));
 624 }
 625
 626 struct ident *built_in_ident(const char *name)
 627 {
 628         int len = strlen(name);
 629         return create_hashed_ident(name, len, hash_name(name, len));
 630 }
 631
 632 struct token *built_in_token(int stream, const char *name)
 633 {
 634         struct token *token;
 635
 636         token = __alloc_token(0);
 637         token->pos.stream = stream;
 638         token_type(token) = TOKEN_IDENT;
 639         token->ident = built_in_ident(name);
 640         return token;
 641 }
 642
 643 static int get_one_identifier(int c, stream_t *stream)
 644 {
 645         struct token *token;
 646         struct ident *ident;
 647         unsigned long hash;
 648         char buf[256];
 649         int len = 1;
 650         int next;
 651
 652         hash = ident_hash_init(c);
 653         buf[0] = c;
 654         for (;;) {
 655                 next = nextchar(stream);
 656                 switch (next) {
 657                 case '0'...'9':
 658                 case 'a'...'z':
 659                 case 'A'...'Z':
 660                 case '_':
 661                         if (len < sizeof(buf)) {
 662                                 hash = ident_hash_add(hash, next);
 663                                 buf[len] = next;
 664                                 len++;
 665                         }
 666                         continue;
 667                 }
 668                 break;
 669         };
 670         hash = ident_hash_end(hash);
 671
 672         ident = create_hashed_ident(buf, len, hash);
 673
 674         /* Pass it on.. */
 675         token = stream->token;
 676         token_type(token) = TOKEN_IDENT;
 677         token->ident = ident;
 678         add_token(stream);
 679         return next;
 680 }
 681
 682 static int get_one_token(int c, stream_t *stream)
 683 {
 684         switch (c) {
 685         case '0'...'9':
 686                 return get_one_number(c, stream);
 687         case 'a'...'z':
 688         case 'A'...'Z':
 689         case '_':
 690                 return get_one_identifier(c, stream);
 691         default:
 692                 return get_one_special(c, stream);
 693         }
 694 }
 695
 696 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 697         unsigned char *buf, unsigned int buf_size)
 698 {
 699         struct token *begin;
 700
 701         stream->pos.stream = idx;
 702         stream->pos.line = 1;
 703         stream->pos.newline = 1;
 704         stream->pos.whitespace = 0;
 705         stream->pos.pos = 0;
 706
 707         stream->token = NULL;
 708         stream->fd = fd;
 709         stream->offset = 0;
 710         stream->size = buf_size;
 711         stream->buffer = buf;
 712
 713         begin = alloc_token(stream);
 714         token_type(begin) = TOKEN_STREAMBEGIN;
 715         stream->tokenlist = &begin->next;
 716         return begin;
 717 }
 718
 719 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 720 {
 721         int c = nextchar(stream);
 722         while (c != EOF) {
 723                 if (c == '\\') {
 724                         c = nextchar(stream);
 725                         stream->pos.newline = 0;
 726                         stream->pos.whitespace = 1;
 727                         continue;
 728                 }
 729                 if (!isspace(c)) {
 730                         struct token *token = alloc_token(stream);
 731                         stream->token = token;
 732                         stream->pos.newline = 0;
 733                         stream->pos.whitespace = 0;
 734                         c = get_one_token(c, stream);
 735                         continue;
 736                 }
 737                 stream->pos.whitespace = 1;
 738                 c = nextchar(stream);
 739         }
 740         mark_eof(stream, endtoken);
 741 }
 742
 743 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
 744 {
 745         stream_t stream;
 746         struct token *begin;
 747
 748         begin = setup_stream(&stream, 0, -1, buffer, size);
 749         tokenize_stream(&stream, endtoken);
 750         return begin;
 751 }
 752
 753 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 754 {
 755         struct token *begin;
 756         stream_t stream;
 757         unsigned char buffer[BUFSIZE];
 758         int idx;
 759
 760         idx = init_stream(name, fd);
 761         if (idx < 0)
 762                 return endtoken;
 763
 764         begin = setup_stream(&stream, idx, fd, buffer, 0);
 765         tokenize_stream(&stream, endtoken);
 766         return begin;
 767 }