tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Linus Torvalds, all rights reserved.
   6  */
   7 #include <stdio.h>
   8 #include <stdlib.h>
   9 #include <stdarg.h>
  10 #include <stddef.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13 #include <unistd.h>
  14 #include <sys/stat.h>
  15
  16 #include "lib.h"
  17 #include "token.h"
  18 #include "symbol.h"
  19
  20 #define EOF (-1)
  21
  22 int input_stream_nr = 0;
  23 struct stream *input_streams;
  24 static int input_streams_allocated;
  25
  26 #define BUFSIZE (8192)
  27 typedef struct {
  28         int fd, stream, line, pos, offset, size;
  29         unsigned int newline:1, whitespace:1;
  30         struct token **tokenlist;
  31         struct token *token;
  32         unsigned char buffer[BUFSIZE];
  33 } stream_t;
  34
  35
  36 const char *show_special(int val)
  37 {
  38         static const char *combinations[] = COMBINATION_STRINGS;
  39         static char buffer[4];
  40
  41         buffer[0] = val;
  42         buffer[1] = 0;
  43         if (val >= SPECIAL_BASE)
  44                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  45         return buffer;
  46 }
  47
  48 const char *show_ident(const struct ident *ident)
  49 {
  50         static char buffer[256];
  51         sprintf(buffer, "%.*s", ident->len, ident->name);
  52         return buffer;
  53 }
  54
  55 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  56 {
  57         if (isprint(c)) {
  58                 if (c == escape || c == '\\')
  59                         *ptr++ = '\\';
  60                 *ptr++ = c;
  61                 return ptr;
  62         }
  63         *ptr++ = '\\';
  64         switch (c) {
  65         case '\n':
  66                 *ptr++ = 'n';
  67                 return ptr;
  68         case '\t':
  69                 *ptr++ = 't';
  70                 return ptr;
  71         }
  72         if (!isdigit(next))
  73                 return ptr + sprintf(ptr, "%o", c);
  74
  75         return ptr + sprintf(ptr, "%03o", c);
  76 }
  77
  78 const char *show_token(const struct token *token)
  79 {
  80         static char buffer[256];
  81
  82         if (!token)
  83                 return "<no token>";
  84         switch (token->type) {
  85         case TOKEN_ERROR:
  86                 return "syntax error";
  87
  88         case TOKEN_EOF:
  89                 return "end-of-input";
  90
  91         case TOKEN_IDENT:
  92                 return show_ident(token->ident);
  93
  94         case TOKEN_STRING: {
  95                 char *ptr;
  96                 int i;
  97                 struct string *string = token->string;
  98
  99                 ptr = buffer;
 100                 *ptr++ = '"';
 101                 for (i = 0; i < string->length-1; i++) {
 102                         unsigned char *p = string->data + i;
 103                         ptr = charstr(ptr, p[0], '"', p[1]);
 104                 }
 105                 *ptr++ = '"';
 106                 *ptr = '\0';
 107                 return buffer;
 108         }
 109
 110         case TOKEN_INTEGER: {
 111                 const char *p = token->integer;
 112                 switch (*p) {
 113                 case 'o':       // octal
 114                 case 'x':       // hex
 115                         buffer[0] = '0';
 116                         strcpy(buffer+1, p+1);
 117                         return buffer;
 118                 default:
 119                         return p;
 120                 }
 121         }
 122
 123         case TOKEN_FP:
 124                 return token->fp;
 125
 126         case TOKEN_SPECIAL:
 127                 return show_special(token->special);
 128
 129         case TOKEN_CHAR: {
 130                 char *ptr = buffer;
 131                 int c = token->character;
 132                 *ptr++ = '\'';
 133                 ptr = charstr(ptr, c, '\'', 0);
 134                 *ptr++ = '\'';
 135                 *ptr++ = '\0';
 136                 return buffer;
 137         }
 138
 139         case TOKEN_STREAMBEGIN:
 140                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
 141                 return buffer;
 142
 143         case TOKEN_STREAMEND:
 144                 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
 145                 return buffer;
 146
 147         default:
 148                 return "WTF???";
 149         }
 150 }
 151
 152 int init_stream(const char *name, int fd)
 153 {
 154         int stream = input_stream_nr;
 155         struct stream *current;
 156
 157         if (stream >= input_streams_allocated) {
 158                 int newalloc = stream * 4 / 3 + 10;
 159                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 160                 if (!input_streams)
 161                         die("Unable to allocate more streams space");
 162                 input_streams_allocated = newalloc;
 163         }
 164         current = input_streams + stream;
 165         memset(current, 0, sizeof(*current));
 166         current->name = name;
 167         current->fd = fd;
 168         current->constant = -1; // "unknown"
 169         if (fd > 0) {
 170                 int i;
 171                 struct stat st;
 172
 173                 fstat(fd, &st);
 174                 current->dev = st.st_dev;
 175                 current->ino = st.st_ino;
 176                 for (i = 0; i < stream; i++) {
 177                         struct stream *s = input_streams + i;
 178                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 179                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 180                                         return -1;
 181                         }
 182                 }
 183         }
 184         input_stream_nr = stream+1;
 185         return stream;
 186 }
 187
 188 static struct token * alloc_token(stream_t *stream)
 189 {
 190         struct token *token = __alloc_token(0);
 191         token->line = stream->line;
 192         token->pos = stream->pos;
 193         token->stream = stream->stream;
 194         token->newline = stream->newline;
 195         token->whitespace = stream->whitespace;
 196         return token;
 197 }
 198
 199 static int nextchar(stream_t *stream)
 200 {
 201         int offset = stream->offset;
 202         int size = stream->size;
 203         int c;
 204
 205         if (offset >= size) {
 206                 size = read(stream->fd, stream->buffer, sizeof(stream->buffer));
 207                 if (size <= 0)
 208                         return EOF;
 209                 stream->size = size;
 210                 stream->offset = 0;
 211                 offset = 0;
 212         }
 213         c = stream->buffer[offset];
 214         stream->offset = offset + 1;
 215         stream->pos++;
 216         if (c == '\n') {
 217                 stream->line++;
 218                 stream->newline = 1;
 219                 stream->pos = 0;
 220         }
 221         return c;
 222 }
 223
 224 struct token eof_token_entry;
 225
 226 static void mark_eof(stream_t *stream, struct token *end_token)
 227 {
 228         struct token *end;
 229
 230         end = alloc_token(stream);
 231         end->type = TOKEN_STREAMEND;
 232         end->newline = 1;
 233
 234         eof_token_entry.next = &eof_token_entry;
 235         eof_token_entry.newline = 1;
 236
 237         if (!end_token)
 238                 end_token =  &eof_token_entry;
 239         end->next = end_token;
 240         *stream->tokenlist = end;
 241         stream->tokenlist = NULL;
 242 }
 243
 244 static void add_token(stream_t *stream)
 245 {
 246         struct token *token = stream->token;
 247
 248         stream->token = NULL;
 249         token->next = NULL;
 250         *stream->tokenlist = token;
 251         stream->tokenlist = &token->next;
 252 }
 253
 254 static void drop_token(stream_t *stream)
 255 {
 256         stream->newline |= stream->token->newline;
 257         stream->whitespace |= stream->token->whitespace;
 258         stream->token = NULL;
 259 }
 260
 261 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
 262 {
 263         char *buf = *p;
 264
 265         *buf++ = next;
 266         for (;;) {
 267                 unsigned int n;
 268                 next = nextchar(stream);
 269                 n = hexval(next);
 270                 if (n >= base)
 271                         break;
 272                 *buf++ = next;
 273         }
 274         *p = buf;
 275         return next;
 276 }
 277
 278 static int do_integer(char *buffer, int len, int next, stream_t *stream)
 279 {
 280         struct token *token = stream->token;
 281         void *buf;
 282
 283         while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
 284                 buffer[len++] = next;
 285                 next = nextchar(stream);
 286         }
 287         buffer[len++] = '\0';
 288         buf = __alloc_bytes(len);
 289         memcpy(buf, buffer, len);
 290         token->type = TOKEN_INTEGER;
 291         token->integer = buf;
 292         add_token(stream);
 293         return next;
 294 }
 295
 296 static int get_one_number(int c, stream_t *stream)
 297 {
 298         static char buffer[256];
 299         int next = nextchar(stream);
 300         char *p = buffer;
 301
 302         *p++ = c;
 303         switch (next) {
 304         case '0'...'7':
 305                 if (c == '0') {
 306                         buffer[0] = 'o';
 307                         next = get_base_number(8, &p, next, stream);
 308                         break;
 309                 }
 310                 /* fallthrough */
 311         case '8'...'9':
 312                 next = get_base_number(10, &p, next, stream);
 313                 break;
 314         case 'x': case 'X':
 315                 if (c == '0') {
 316                         buffer[0] = 'x';
 317                         next = get_base_number(16, &p, next, stream);
 318                 }
 319         }
 320         return do_integer(buffer, p - buffer, next, stream);
 321 }
 322
 323 static int escapechar(int first, int type, stream_t *stream, int *valp)
 324 {
 325         int next, value;
 326
 327         next = nextchar(stream);
 328         value = first;
 329
 330         if (first == '\n')
 331                 warn(stream->token, "Newline in string or character constant");
 332
 333         if (first == '\\' && next != EOF) {
 334                 value = next;
 335                 next = nextchar(stream);
 336                 if (value != type) {
 337                         switch (value) {
 338                         case 'n':
 339                                 value = '\n';
 340                                 break;
 341                         case 't':
 342                                 value = '\t';
 343                                 break;
 344                         case '\\':
 345                                 break;
 346                         case '0'...'7': {
 347                                 int nr = 2;
 348                                 value -= '0';
 349                                 while (next >= '0' && next <= '9') {
 350                                         value = (value << 3) + (next-'0');
 351                                         next = nextchar(stream);
 352                                         if (!--nr)
 353                                                 break;
 354                                 }
 355                                 value &= 0xff;
 356                                 break;
 357                         }
 358                         case 'x': {
 359                                 int hex = hexval(next);
 360                                 if (hex < 16) {
 361                                         value = hex;
 362                                         next = nextchar(stream);
 363                                         while ((hex = hexval(next)) < 16) {
 364                                                 value = (value << 4) + hex;
 365                                                 next = nextchar(stream);
 366                                         }
 367                                         value &= 0xff;
 368                                         break;
 369                                 }
 370                         }
 371                         /* Fallthrough */
 372                         default:
 373                                 warn(stream->token, "Unknown escape '%c'", value);
 374                         }
 375                 }
 376                 /* Mark it as escaped */
 377                 value |= 0x100;
 378         }
 379         *valp = value;
 380         return next;
 381 }
 382
 383 static int get_char_token(int next, stream_t *stream)
 384 {
 385         int value;
 386         struct token *token;
 387
 388         next = escapechar(next, '\'', stream, &value);
 389         if (value == '\'' || next != '\'') {
 390                 warn(stream->token, "Bad character constant");
 391                 drop_token(stream);
 392                 return next;
 393         }
 394
 395         token = stream->token;
 396         token->type = TOKEN_CHAR;
 397         token->character = value & 0xff;
 398
 399         add_token(stream);
 400         return nextchar(stream);
 401 }
 402
 403 static int get_string_token(int next, stream_t *stream)
 404 {
 405         static char buffer[512];
 406         struct string *string;
 407         struct token *token;
 408         int len = 0;
 409
 410         for (;;) {
 411                 int val;
 412                 next = escapechar(next, '"', stream, &val);
 413                 if (val == '"')
 414                         break;
 415                 if (next == EOF) {
 416                         warn(stream->token, "Enf of file in middle of string");
 417                         return next;
 418                 }
 419                 if (len < sizeof(buffer)) {
 420                         buffer[len] = val;
 421                         len++;
 422                 }
 423
 424         }
 425
 426         if (len > 256)
 427                 warn(stream->token, "String too long");
 428
 429         string = __alloc_string(len+1);
 430         memcpy(string->data, buffer, len);
 431         string->data[len] = '\0';
 432         string->length = len+1;
 433
 434         /* Pass it on.. */
 435         token = stream->token;
 436         token->type = TOKEN_STRING;
 437         token->string = string;
 438         add_token(stream);
 439
 440         return next;
 441 }
 442
 443 static int drop_stream_eoln(stream_t *stream)
 444 {
 445         int next = nextchar(stream);
 446         drop_token(stream);
 447         for (;;) {
 448                 int curr = next;
 449                 if (curr == EOF)
 450                         return next;
 451                 next = nextchar(stream);
 452                 if (curr == '\n')
 453                         return next;
 454         }
 455 }
 456
 457 static int drop_stream_comment(stream_t *stream)
 458 {
 459         int next = nextchar(stream);
 460         drop_token(stream);
 461         for (;;) {
 462                 int curr = next;
 463                 if (curr == EOF) {
 464                         warn(stream->token, "End of file in the middle of a comment");
 465                         return curr;
 466                 }
 467                 next = nextchar(stream);
 468                 if (curr == '*' && next == '/')
 469                         break;
 470         }
 471         return nextchar(stream);
 472 }
 473
 474 unsigned char combinations[][3] = COMBINATION_STRINGS;
 475
 476 #define NR_COMBINATIONS (sizeof(combinations)/3)
 477
 478 static int get_one_special(int c, stream_t *stream)
 479 {
 480         struct token *token;
 481         unsigned char c1, c2, c3;
 482         int next, value, i;
 483         char *comb;
 484
 485         next = nextchar(stream);
 486
 487         /*
 488          * Check for strings, character constants, and comments
 489          */
 490         switch (c) {
 491         case '"':
 492                 return get_string_token(next, stream);
 493         case '\'':
 494                 return get_char_token(next, stream);
 495         case '/':
 496                 if (next == '/')
 497                         return drop_stream_eoln(stream);
 498                 if (next == '*')
 499                         return drop_stream_comment(stream);
 500         }
 501
 502         /*
 503          * Check for combinations
 504          */
 505         value = c;
 506         comb = combinations[0];
 507         c1 = c; c2 = next; c3 = 0;
 508         for (i = 0; i < NR_COMBINATIONS; i++) {
 509                 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 510                         value = i + SPECIAL_BASE;
 511                         next = nextchar(stream);
 512                         if (c3)
 513                                 break;
 514                         c3 = next;
 515                 }
 516                 comb += 3;
 517         }
 518
 519         /* Pass it on.. */
 520         token = stream->token;
 521         token->type = TOKEN_SPECIAL;
 522         token->special = value;
 523         add_token(stream);
 524         return next;
 525 }
 526
 527 #define IDENT_HASH_BITS (10)
 528 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 529 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 530
 531 #define ident_hash_init(c)              (c)
 532 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 533 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 534
 535 static struct ident *hash_table[IDENT_HASH_SIZE];
 536 int ident_hit, ident_miss;
 537
 538 void show_identifier_stats(void)
 539 {
 540         int i;
 541         int distribution[100];
 542
 543         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 544                 ident_hit, ident_miss);
 545
 546         for (i = 0; i < 100; i++)
 547                 distribution[i] = 0;
 548
 549         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 550                 struct ident * ident = hash_table[i];
 551                 int count = 0;
 552
 553                 while (ident) {
 554                         count++;
 555                         ident = ident->next;
 556                 }
 557                 if (count > 99)
 558                         count = 99;
 559                 distribution[count]++;
 560         }
 561
 562         for (i = 0; i < 100; i++) {
 563                 if (distribution[i])
 564                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 565         }
 566 }
 567
 568 static struct ident *alloc_ident(const char *name, int len)
 569 {
 570         struct ident *ident = __alloc_ident(len);
 571         ident->symbols = NULL;
 572         ident->len = len;
 573         memcpy(ident->name, name, len);
 574         return ident;
 575 }
 576
 577 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 578 {
 579         ident->next = hash_table[hash];
 580         hash_table[hash] = ident;
 581         ident_miss++;
 582         return ident;
 583 }
 584
 585 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 586 {
 587         struct ident *ident;
 588
 589         ident = hash_table[hash];
 590         while (ident) {
 591                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 592                         ident_hit++;
 593                         return ident;
 594                 }
 595                 ident = ident->next;
 596         }
 597
 598         return insert_hash(alloc_ident(name, len), hash);
 599 }
 600
 601 static unsigned long hash_name(const char *name, int len)
 602 {
 603         unsigned long hash;
 604         const unsigned char *p = (const unsigned char *)name;
 605
 606         hash = ident_hash_init(*p++);
 607         while (--len) {
 608                 unsigned int i = *p++;
 609                 hash = ident_hash_add(hash, i);
 610         }
 611         return ident_hash_end(hash);
 612 }
 613
 614 struct ident *hash_ident(struct ident *ident)
 615 {
 616         return insert_hash(ident, hash_name(ident->name, ident->len));
 617 }
 618
 619 struct ident *built_in_ident(const char *name)
 620 {
 621         int len = strlen(name);
 622         return create_hashed_ident(name, len, hash_name(name, len));
 623 }
 624
 625 struct token *built_in_token(int stream, const char *name)
 626 {
 627         struct token *token;
 628
 629         token = __alloc_token(0);
 630         token->stream = stream;
 631         token->type = TOKEN_IDENT;
 632         token->ident = built_in_ident(name);
 633         return token;
 634 }
 635
 636 static int get_one_identifier(int c, stream_t *stream)
 637 {
 638         struct token *token;
 639         struct ident *ident;
 640         unsigned long hash;
 641         char buf[256];
 642         int len = 1;
 643         int next;
 644
 645         hash = ident_hash_init(c);
 646         buf[0] = c;
 647         for (;;) {
 648                 next = nextchar(stream);
 649                 switch (next) {
 650                 case '0'...'9':
 651                 case 'a'...'z':
 652                 case 'A'...'Z':
 653                 case '_':
 654                         if (len < sizeof(buf)) {
 655                                 hash = ident_hash_add(hash, next);
 656                                 buf[len] = next;
 657                                 len++;
 658                         }
 659                         continue;
 660                 }
 661                 break;
 662         };
 663         hash = ident_hash_end(hash);
 664
 665         ident = create_hashed_ident(buf, len, hash);
 666
 667         /* Pass it on.. */
 668         token = stream->token;
 669         token->type = TOKEN_IDENT;
 670         token->ident = ident;
 671         add_token(stream);
 672         return next;
 673 }
 674
 675 static int get_one_token(int c, stream_t *stream)
 676 {
 677         switch (c) {
 678         case '0'...'9':
 679                 return get_one_number(c, stream);
 680         case 'a'...'z':
 681         case 'A'...'Z':
 682         case '_':
 683                 return get_one_identifier(c, stream);
 684         default:
 685                 return get_one_special(c, stream);
 686         }
 687 }
 688
 689 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 690 {
 691         struct token *begin;
 692         stream_t stream;
 693         int c, idx;
 694
 695         idx = init_stream(name, fd);
 696         if (idx < 0)
 697                 return endtoken;
 698
 699         stream.stream = idx;
 700         stream.token = NULL;
 701         stream.line = 1;
 702         stream.newline = 1;
 703         stream.whitespace = 0;
 704         stream.pos = 0;
 705         stream.fd = fd;
 706         stream.offset = 0;
 707         stream.size = 0;
 708
 709         begin = alloc_token(&stream);
 710         begin->type = TOKEN_STREAMBEGIN;
 711         stream.tokenlist = &begin->next;
 712
 713         c = nextchar(&stream);
 714         while (c != EOF) {
 715                 if (c == '\\') {
 716                         c = nextchar(&stream);
 717                         stream.newline = 0;
 718                         stream.whitespace = 1;
 719                         continue;
 720                 }
 721                 if (!isspace(c)) {
 722                         struct token *token = alloc_token(&stream);
 723                         token->newline = stream.newline;
 724                         token->whitespace = stream.whitespace;
 725                         stream.newline = 0;
 726                         stream.whitespace = 0;
 727                         stream.token = token;
 728                         c = get_one_token(c, &stream);
 729                         continue;
 730                 }
 731                 stream.whitespace = 1;
 732                 c = nextchar(&stream);
 733         }
 734         mark_eof(&stream, endtoken);
 735         return begin;
 736 }